Repository: erikgrinaker/toydb Branch: main Commit: 473afbdb4aea Files: 284 Total size: 1.4 MB Directory structure: gitextract_nc06cv1f/ ├── .github/ │ └── workflows/ │ └── ci.yml ├── .gitignore ├── Cargo.toml ├── LICENSE ├── README.md ├── cluster/ │ ├── run.sh │ ├── toydb1/ │ │ └── toydb.yaml │ ├── toydb2/ │ │ └── toydb.yaml │ ├── toydb3/ │ │ └── toydb.yaml │ ├── toydb4/ │ │ └── toydb.yaml │ └── toydb5/ │ └── toydb.yaml ├── config/ │ └── toydb.yaml ├── docs/ │ ├── architecture/ │ │ ├── README.md │ │ ├── client.md │ │ ├── encoding.md │ │ ├── index.md │ │ ├── mvcc.md │ │ ├── overview.md │ │ ├── raft.md │ │ ├── server.md │ │ ├── sql-data.md │ │ ├── sql-execution.md │ │ ├── sql-optimizer.md │ │ ├── sql-parser.md │ │ ├── sql-planner.md │ │ ├── sql-raft.md │ │ ├── sql-storage.md │ │ ├── sql.md │ │ └── storage.md │ ├── architecture.md │ ├── crate/ │ │ ├── Cargo.toml │ │ ├── README.md │ │ └── src/ │ │ └── lib.rs │ ├── examples.md │ ├── references.md │ ├── sql.md │ └── tools/ │ └── update-links.py ├── rust-toolchain ├── rustfmt.toml ├── src/ │ ├── bin/ │ │ ├── toydb.rs │ │ ├── toydump.rs │ │ ├── toysql.rs │ │ └── workload.rs │ ├── client.rs │ ├── encoding/ │ │ ├── bincode.rs │ │ ├── format.rs │ │ ├── keycode.rs │ │ └── mod.rs │ ├── error.rs │ ├── lib.rs │ ├── raft/ │ │ ├── log.rs │ │ ├── message.rs │ │ ├── mod.rs │ │ ├── node.rs │ │ ├── state.rs │ │ └── testscripts/ │ │ ├── log/ │ │ │ ├── append │ │ │ ├── commit │ │ │ ├── get │ │ │ ├── has │ │ │ ├── init │ │ │ ├── scan │ │ │ ├── scan_apply │ │ │ ├── splice │ │ │ ├── status │ │ │ └── term │ │ └── node/ │ │ ├── append │ │ ├── append_base_missing │ │ ├── append_base_missing_all │ │ ├── append_commit_quorum │ │ ├── append_initial │ │ ├── append_max_entries │ │ ├── append_pipeline │ │ ├── append_probe_divergent_first │ │ ├── append_probe_divergent_long │ │ ├── append_probe_divergent_short │ │ ├── append_probe_divergent_single │ │ ├── append_response_beyond_last_index_panics │ │ ├── append_response_stale_reject │ │ ├── election 
│ │ ├── election_candidate_behind_leader │ │ ├── election_candidate_behind_quorum │ │ ├── election_contested │ │ ├── election_tie │ │ ├── election_tie_even │ │ ├── heartbeat_commits_follower │ │ ├── heartbeat_converts_candidate │ │ ├── heartbeat_converts_follower │ │ ├── heartbeat_converts_follower_leaderless │ │ ├── heartbeat_converts_leader │ │ ├── heartbeat_lost_append_duplicate │ │ ├── heartbeat_lost_append_multiple │ │ ├── heartbeat_lost_append_single │ │ ├── heartbeat_lost_read │ │ ├── heartbeat_match_commits │ │ ├── heartbeat_multiple_leaders_panic │ │ ├── heartbeat_old_commit_index │ │ ├── heartbeat_old_last_index │ │ ├── heartbeat_probe_divergent │ │ ├── old_campaign_rejected │ │ ├── old_campaign_response_ignored │ │ ├── old_heartbeat_ignored │ │ ├── request_candidate_abort │ │ ├── request_follower │ │ ├── request_follower_campaign_abort │ │ ├── request_follower_disconnect_stall │ │ ├── request_follower_leaderless_abort │ │ ├── request_leader │ │ ├── request_leader_campaign_abort │ │ ├── request_leader_change_linearizability │ │ ├── request_leader_disconnect │ │ ├── request_leader_read_quorum │ │ ├── request_leader_read_quorum_sequence │ │ ├── request_leader_single │ │ ├── request_status │ │ ├── request_status_single │ │ ├── restart │ │ ├── restart_apply │ │ ├── restart_commit_recover │ │ ├── restart_term_vote │ │ ├── tick_candidate │ │ ├── tick_follower │ │ ├── tick_follower_leaderless │ │ └── tick_leader │ ├── server.rs │ ├── sql/ │ │ ├── engine/ │ │ │ ├── engine.rs │ │ │ ├── local.rs │ │ │ ├── mod.rs │ │ │ └── raft.rs │ │ ├── execution/ │ │ │ ├── aggregator.rs │ │ │ ├── executor.rs │ │ │ ├── join.rs │ │ │ ├── mod.rs │ │ │ └── session.rs │ │ ├── mod.rs │ │ ├── parser/ │ │ │ ├── ast.rs │ │ │ ├── lexer.rs │ │ │ ├── mod.rs │ │ │ └── parser.rs │ │ ├── planner/ │ │ │ ├── mod.rs │ │ │ ├── optimizer.rs │ │ │ ├── plan.rs │ │ │ └── planner.rs │ │ ├── testscripts/ │ │ │ ├── expressions/ │ │ │ │ ├── cnf │ │ │ │ ├── func │ │ │ │ ├── func_sqrt │ │ │ │ ├── literals │ 
│ │ │ ├── op_compare_equal │ │ │ │ ├── op_compare_greater │ │ │ │ ├── op_compare_greater_equal │ │ │ │ ├── op_compare_is_nan │ │ │ │ ├── op_compare_is_null │ │ │ │ ├── op_compare_lesser │ │ │ │ ├── op_compare_lesser_equal │ │ │ │ ├── op_compare_not_equal │ │ │ │ ├── op_logic_and │ │ │ │ ├── op_logic_not │ │ │ │ ├── op_logic_or │ │ │ │ ├── op_math_add │ │ │ │ ├── op_math_divide │ │ │ │ ├── op_math_exponentiate │ │ │ │ ├── op_math_factorial │ │ │ │ ├── op_math_identity │ │ │ │ ├── op_math_multiply │ │ │ │ ├── op_math_negate │ │ │ │ ├── op_math_remainder │ │ │ │ ├── op_math_subtract │ │ │ │ ├── op_precedence │ │ │ │ └── op_string_like │ │ │ ├── optimizers/ │ │ │ │ ├── constant_folder │ │ │ │ ├── filter_pushdown │ │ │ │ ├── hash_join │ │ │ │ ├── index_lookup │ │ │ │ └── short_circuit │ │ │ ├── queries/ │ │ │ │ ├── aggregate │ │ │ │ ├── clauses │ │ │ │ ├── group_by │ │ │ │ ├── having │ │ │ │ ├── join_cross │ │ │ │ ├── join_inner │ │ │ │ ├── join_outer │ │ │ │ ├── limit │ │ │ │ ├── offset │ │ │ │ ├── order │ │ │ │ ├── select │ │ │ │ ├── where_ │ │ │ │ ├── where_index │ │ │ │ └── where_primary_key │ │ │ ├── schema/ │ │ │ │ ├── create_table │ │ │ │ ├── create_table_datatypes │ │ │ │ ├── create_table_default │ │ │ │ ├── create_table_index │ │ │ │ ├── create_table_names │ │ │ │ ├── create_table_null │ │ │ │ ├── create_table_primary_key │ │ │ │ ├── create_table_reference │ │ │ │ ├── create_table_transaction │ │ │ │ ├── create_table_unique │ │ │ │ ├── drop_table │ │ │ │ ├── drop_table_index │ │ │ │ ├── drop_table_ref │ │ │ │ └── drop_table_transaction │ │ │ ├── transactions/ │ │ │ │ ├── anomaly_dirty_read │ │ │ │ ├── anomaly_dirty_write │ │ │ │ ├── anomaly_fuzzy_read │ │ │ │ ├── anomaly_lost_update │ │ │ │ ├── anomaly_phantom_read │ │ │ │ ├── anomaly_read_skew │ │ │ │ ├── anomaly_write_skew │ │ │ │ ├── begin │ │ │ │ ├── commit │ │ │ │ ├── isolation │ │ │ │ ├── rollback │ │ │ │ └── schema │ │ │ └── writes/ │ │ │ ├── delete │ │ │ ├── delete_index │ │ │ ├── delete_reference │ │ │ 
├── delete_where │ │ │ ├── insert │ │ │ ├── insert_datatypes │ │ │ ├── insert_default │ │ │ ├── insert_index │ │ │ ├── insert_null │ │ │ ├── insert_primary_key │ │ │ ├── insert_reference │ │ │ ├── insert_unique │ │ │ ├── update │ │ │ ├── update_datatypes │ │ │ ├── update_default │ │ │ ├── update_expression │ │ │ ├── update_index │ │ │ ├── update_null │ │ │ ├── update_primary_key │ │ │ ├── update_reference │ │ │ ├── update_unique │ │ │ └── update_where │ │ └── types/ │ │ ├── expression.rs │ │ ├── mod.rs │ │ ├── schema.rs │ │ └── value.rs │ └── storage/ │ ├── bitcask.rs │ ├── engine.rs │ ├── memory.rs │ ├── mod.rs │ ├── mvcc.rs │ └── testscripts/ │ ├── bitcask/ │ │ ├── compact │ │ ├── compact_open │ │ ├── log │ │ └── status │ ├── engine/ │ │ ├── keys │ │ ├── point │ │ ├── scan │ │ └── scan_prefix │ ├── memory/ │ │ └── status │ └── mvcc/ │ ├── anomaly_dirty_read │ ├── anomaly_dirty_write │ ├── anomaly_fuzzy_read │ ├── anomaly_lost_update │ ├── anomaly_phantom_read │ ├── anomaly_read_skew │ ├── anomaly_write_skew │ ├── bank │ ├── begin │ ├── begin_as_of │ ├── begin_readonly │ ├── delete │ ├── delete_conflict │ ├── get │ ├── get_isolation │ ├── resume │ ├── rollback │ ├── scan │ ├── scan_isolation │ ├── scan_key_version_encoding │ ├── scan_prefix │ ├── set │ ├── set_conflict │ └── unversioned └── tests/ ├── scripts/ │ ├── anomalies │ ├── client │ ├── errors │ ├── isolation │ └── queries ├── testcluster.rs └── tests.rs ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/workflows/ci.yml ================================================ name: CI on: [push, pull_request, workflow_dispatch] permissions: contents: read jobs: test: name: Test runs-on: ubuntu-latest timeout-minutes: 10 steps: - uses: actions/checkout@v3 - uses: dtolnay/rust-toolchain@1.93.1 id: toolchain with: components: clippy, rustfmt - uses: actions/cache@v3 with: path: target key: 
${{runner.os}}-target-${{steps.toolchain.outputs.cachekey}}-${{hashFiles('Cargo.lock')}} - run: cargo build --bins --tests - run: cargo test - run: cargo clippy --tests --no-deps -- -D warnings - run: cargo fmt --check - run: cargo doc --no-deps env: RUSTDOCFLAGS: -D warnings ================================================ FILE: .gitignore ================================================ /cluster/toydb*/data /data /docs/crate/target /target .DS_Store .vscode/ **/*.rs.bk ================================================ FILE: Cargo.toml ================================================ [package] name = "toydb" version = "1.0.0" description = "A simple distributed SQL database, built for education" authors = ["Erik Grinaker "] license = "Apache-2.0" homepage = "https://github.com/erikgrinaker/toydb" repository = "https://github.com/erikgrinaker/toydb" edition = "2024" default-run = "toydb" publish = false [lib] doctest = false [dependencies] bincode = { version = "2.0", features = ["serde"] } clap = { version = "4.5", features = ["cargo", "derive"] } config = "0.15" crossbeam = { version = "0.8", features = ["crossbeam-channel"] } dyn-clone = "1.0" fs4 = "0.13" hdrhistogram = "7.5" itertools = "0.14" log = "0.4" petname = "2.0.2" rand = "0.10" regex = "1.12" rustyline = "17.0" rustyline-derive = "0.11" serde = { version = "1.0", features = ["derive"] } serde_bytes = "0.11" simplelog = "0.12" uuid = { version = "1.21", features = ["serde", "v4"] } [dev-dependencies] escargot = "0.5" goldenscript = "0.7" hex = "0.4" paste = "1.0" serde_json = "1.0" tempfile = "3.25" test-case = "3.3" test_each_file = "0.3" ================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. 
"License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. 
"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the 
Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. 
Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. 
To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: README.md ================================================ # toyDB Distributed SQL database in Rust, built from scratch as an educational project. Main features: * [Raft distributed consensus][raft] for linearizable state machine replication. * [ACID transactions][txn] with MVCC-based snapshot isolation. * [Pluggable storage engine][storage] with [BitCask][bitcask] and [in-memory][memory] backends. * [Iterator-based query engine][query] with [heuristic optimization][optimizer] and time-travel support. * [SQL interface][sql] including joins, aggregates, and transactions. toyDB is intended to be simple and understandable, and also functional and correct. Other aspects like performance, scalability, and availability are non-goals -- these are major sources of complexity in production-grade databases, and obscure the basic underlying concepts. Shortcuts have been taken where possible. 
I originally wrote toyDB in 2020 to learn more about database internals. Since then, I've spent several years building real distributed SQL databases at [CockroachDB](https://github.com/cockroachdb/cockroach) and [Neon](https://github.com/neondatabase/neon). Based on this experience, I've rewritten toyDB as a simple illustration of the architecture and concepts behind distributed SQL databases. [raft]: https://github.com/erikgrinaker/toydb/blob/main/src/raft/mod.rs [txn]: https://github.com/erikgrinaker/toydb/blob/main/src/storage/mvcc.rs [storage]: https://github.com/erikgrinaker/toydb/blob/main/src/storage/engine.rs [bitcask]: https://github.com/erikgrinaker/toydb/blob/main/src/storage/bitcask.rs [memory]: https://github.com/erikgrinaker/toydb/blob/main/src/storage/memory.rs [query]: https://github.com/erikgrinaker/toydb/blob/main/src/sql/execution/executor.rs [optimizer]: https://github.com/erikgrinaker/toydb/blob/main/src/sql/planner/optimizer.rs [sql]: https://github.com/erikgrinaker/toydb/blob/main/src/sql/parser/parser.rs ## Documentation * [Architecture guide](docs/architecture/index.md): a guided tour of toyDB's code and architecture. * [SQL examples](docs/examples.md): walkthrough of toyDB's SQL features. * [SQL reference](docs/sql.md): reference documentation for toyDB's SQL dialect. * [References](docs/references.md): research materials used while building toyDB. ## Usage With a [Rust compiler](https://www.rust-lang.org/tools/install) installed, a local five-node cluster can be built and started as: ``` $ ./cluster/run.sh Starting 5 nodes on ports 9601-9605 with data under cluster/*/data/. 
To connect to node 1, run: cargo run --release --bin toysql toydb4 21:03:55 [INFO] Listening on [::1]:9604 (SQL) and [::1]:9704 (Raft) toydb1 21:03:55 [INFO] Listening on [::1]:9601 (SQL) and [::1]:9701 (Raft) toydb2 21:03:55 [INFO] Listening on [::1]:9602 (SQL) and [::1]:9702 (Raft) toydb3 21:03:55 [INFO] Listening on [::1]:9603 (SQL) and [::1]:9703 (Raft) toydb5 21:03:55 [INFO] Listening on [::1]:9605 (SQL) and [::1]:9705 (Raft) toydb2 21:03:56 [INFO] Starting new election for term 1 [...] toydb2 21:03:56 [INFO] Won election for term 1, becoming leader ``` A command-line client can be built and used with node 1 on `localhost:9601`: ``` $ cargo run --release --bin toysql Connected to toyDB node n1. Enter !help for instructions. toydb> CREATE TABLE movies (id INTEGER PRIMARY KEY, title VARCHAR NOT NULL); toydb> INSERT INTO movies VALUES (1, 'Sicario'), (2, 'Stalker'), (3, 'Her'); toydb> SELECT * FROM movies; 1, 'Sicario' 2, 'Stalker' 3, 'Her' ``` toyDB supports most common SQL features, including joins, aggregates, and transactions. 
Below is an `EXPLAIN` query plan of a more complex query (fetches all movies from studios that have released any movie with an IMDb rating of 8 or more): ``` toydb> EXPLAIN SELECT m.title, g.name AS genre, s.name AS studio, m.rating FROM movies m JOIN genres g ON m.genre_id = g.id, studios s JOIN movies good ON good.studio_id = s.id AND good.rating >= 8 WHERE m.studio_id = s.id GROUP BY m.title, g.name, s.name, m.rating, m.released ORDER BY m.rating DESC, m.released ASC, m.title ASC; Remap: m.title, genre, studio, m.rating (dropped: m.released) └─ Order: m.rating desc, m.released asc, m.title asc └─ Projection: m.title, g.name as genre, s.name as studio, m.rating, m.released └─ Aggregate: m.title, g.name, s.name, m.rating, m.released └─ HashJoin: inner on m.studio_id = s.id ├─ HashJoin: inner on m.genre_id = g.id │ ├─ Scan: movies as m │ └─ Scan: genres as g └─ HashJoin: inner on s.id = good.studio_id ├─ Scan: studios as s └─ Scan: movies as good (good.rating > 8 OR good.rating = 8) ``` ## Architecture toyDB's architecture is fairly typical for a distributed SQL database: a transactional key/value store managed by a Raft cluster with a SQL query engine on top. See the [architecture guide](./docs/architecture/index.md) for more details. [![toyDB architecture](./docs/architecture/images/architecture.svg)](./docs/architecture/index.md) ## Tests toyDB mainly uses [Goldenscripts](https://github.com/erikgrinaker/goldenscript) for tests. These script various scenarios, capture events and output, and later assert that the behavior remains the same. 
See e.g.: * [Raft cluster tests](https://github.com/erikgrinaker/toydb/tree/main/src/raft/testscripts/node) * [MVCC transaction tests](https://github.com/erikgrinaker/toydb/tree/main/src/storage/testscripts/mvcc) * [SQL execution tests](https://github.com/erikgrinaker/toydb/tree/main/src/sql/testscripts) * [End-to-end tests](https://github.com/erikgrinaker/toydb/tree/main/tests/scripts) Run tests with `cargo test`, or have a look at the latest [CI run](https://github.com/erikgrinaker/toydb/actions/workflows/ci.yml). ## Benchmarks toyDB is not optimized for performance, but comes with a `workload` benchmark tool that can run various workloads against a toyDB cluster. For example: ```sh # Start a 5-node toyDB cluster. $ ./cluster/run.sh [...] # Run a read-only benchmark via all 5 nodes. $ cargo run --release --bin workload read Preparing initial dataset... done (0.179s) Spawning 16 workers... done (0.006s) Running workload read (rows=1000 size=64 batch=1)... Time Progress Txns Rate p50 p90 p99 pMax 1.0s 13.1% 13085 13020/s 1.3ms 1.5ms 1.9ms 8.4ms 2.0s 27.2% 27183 13524/s 1.3ms 1.5ms 1.8ms 8.4ms 3.0s 41.3% 41301 13702/s 1.2ms 1.5ms 1.8ms 8.4ms 4.0s 55.3% 55340 13769/s 1.2ms 1.5ms 1.8ms 8.4ms 5.0s 70.0% 70015 13936/s 1.2ms 1.5ms 1.8ms 8.4ms 6.0s 84.7% 84663 14047/s 1.2ms 1.4ms 1.8ms 8.4ms 7.0s 99.6% 99571 14166/s 1.2ms 1.4ms 1.7ms 8.4ms 7.1s 100.0% 100000 14163/s 1.2ms 1.4ms 1.7ms 8.4ms Verifying dataset... done (0.002s) ``` The available workloads are: * `read`: single-row primary key lookups. * `write`: single-row inserts to sequential primary keys. * `bank`: bank transfers between various customers and accounts. To make things interesting, this includes joins, secondary indexes, sorting, and conflicts. For more information about workloads and parameters, run `cargo run --bin workload -- --help`. Example workload results are listed below. 
Write performance is atrocious, due to [fsync](https://en.wikipedia.org/wiki/Sync_(Unix)) and a lack of write batching in the Raft layer. Disabling fsync, or using the in-memory engine, significantly improves write performance (at the expense of durability). | Workload | BitCask | BitCask w/o fsync | Memory | |----------|-------------|-------------------|-------------| | `read` | 14163 txn/s | 13941 txn/s | 13949 txn/s | | `write` | 35 txn/s | 4719 txn/s | 7781 txn/s | | `bank` | 21 txn/s | 1120 txn/s | 1346 txn/s | ## Debugging [VSCode](https://code.visualstudio.com) and the [CodeLLDB](https://marketplace.visualstudio.com/items?itemName=vadimcn.vscode-lldb) extension can be used to debug toyDB, with the debug configuration under `.vscode/launch.json`. Under the "Run and Debug" tab, select e.g. "Debug executable 'toydb'" or "Debug unit tests in library 'toydb'". ## Credits The toyDB logo is courtesy of [@jonasmerlin](https://github.com/jonasmerlin). ================================================ FILE: cluster/run.sh ================================================ #!/usr/bin/env bash # # This script builds and runs a 5-node toyDB cluster listening on ports # 9601-9605. Config and data is stored under the toydb* directories. # To connect a toysql client to node 1 on port 9601, run: # # cargo run --release --bin toysql set -euo pipefail # Change into the script directory. cd "$(dirname $0)" # Build toyDB using release optimizations. cargo build --release --bin toydb # Start nodes 1-5 in the background, prefixing their output with the node ID. echo "Starting 5 nodes on ports 9601-9605 with data under cluster/*/data/." echo "To connect to node 1, run: cargo run --release --bin toysql" echo "" for ID in 1 2 3 4 5; do (cargo run -q --release -- -c toydb$ID/toydb.yaml 2>&1 | sed -e "s/\\(.*\\)/toydb$ID \\1/g") & done # Wait for the background processes to exit. Kill all toyDB processes when the # script exits (e.g. via Ctrl-C). 
trap 'kill -TERM -- -$$ 2>/dev/null' INT TERM EXIT wait ================================================ FILE: cluster/toydb1/toydb.yaml ================================================ id: 1 data_dir: toydb1/data listen_sql: localhost:9601 listen_raft: localhost:9701 peers: '2': localhost:9702 '3': localhost:9703 '4': localhost:9704 '5': localhost:9705 ================================================ FILE: cluster/toydb2/toydb.yaml ================================================ id: 2 data_dir: toydb2/data listen_sql: localhost:9602 listen_raft: localhost:9702 peers: '1': localhost:9701 '3': localhost:9703 '4': localhost:9704 '5': localhost:9705 ================================================ FILE: cluster/toydb3/toydb.yaml ================================================ id: 3 data_dir: toydb3/data listen_sql: localhost:9603 listen_raft: localhost:9703 peers: '1': localhost:9701 '2': localhost:9702 '4': localhost:9704 '5': localhost:9705 ================================================ FILE: cluster/toydb4/toydb.yaml ================================================ id: 4 data_dir: toydb4/data listen_sql: localhost:9604 listen_raft: localhost:9704 peers: '1': localhost:9701 '2': localhost:9702 '3': localhost:9703 '5': localhost:9705 ================================================ FILE: cluster/toydb5/toydb.yaml ================================================ id: 5 data_dir: toydb5/data listen_sql: localhost:9605 listen_raft: localhost:9705 peers: '1': localhost:9701 '2': localhost:9702 '3': localhost:9703 '4': localhost:9704 ================================================ FILE: config/toydb.yaml ================================================ # The node ID (must be unique in the cluster), and map of peer IDs and Raft # addresses (empty for single node). id: 1 peers: {} # Addresses to listen for SQL and Raft connections on. listen_sql: localhost:9601 listen_raft: localhost:9701 # The log level. Valid values are DEBUG, INFO, WARN, and ERROR. 
log_level: INFO # Node data directory. The Raft log is stored in the file "raft", and the SQL # database in "sql". data_dir: data # Storage engine to use for the Raft log and SQL database. # # * bitcask (default): an append-only log-structured store. # * memory: an in-memory store using the Rust standard library's BTreeMap. storage_raft: bitcask storage_sql: bitcask # Whether to fsync writes to disk. Disabling this yields much better write # performance, but may lose data on host crashes and violate Raft guarantees. It # only affects Raft log writes (the SQL state machine is never fsynced since it # can be reconstructed from the Raft log). fsync: true # The minimum garbage fraction and bytes to trigger Bitcask log compaction on # node startup. compact_threshold: 0.2 compact_min_bytes: 1000000 ================================================ FILE: docs/architecture/README.md ================================================ See [`index.md`](index.md). ================================================ FILE: docs/architecture/client.md ================================================ # Client The toyDB client is in the [`client`](https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/client.rs) module. It uses the same Bincode-based protocol that we saw in the server section, sending `toydb::Request` and receiving `toydb::Response`. ## Client Library The main client library `toydb::Client` is used to communicate with a toyDB server: https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/client.rs#L15-L24 When initialized, it connects to a toyDB server over TCP, which establishes a SQL session for it: https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/client.rs#L27-L33 It can then send Bincode-encoded `toydb::Request` to the server, and receive `toydb::Response` back. 
https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/client.rs#L35-L40 In particular, `Client::execute` can be used to execute arbitrary SQL statements in the client's current session: https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/client.rs#L42-L56 ## `toysql` Binary However, `toydb::Client` is a programmatic API, and we want a more convenient user interface. The `toysql` client in [`src/bin/toysql.rs`](https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/bin/toysql.rs) provides a typical [REPL](https://en.wikipedia.org/wiki/Read–eval–print_loop) (read-evaluate-print loop) where users can enter SQL statements and view the results. Like `toydb`, `toysql` is a tiny [`clap`](https://docs.rs/clap/latest/clap/) command that takes a toyDB server address to connect to and starts an interactive shell: https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/bin/toysql.rs#L29-L53 It first attempts to connect to the toyDB server using the `toydb::Client` client, and then starts an interactive shell using the [Rustyline](https://docs.rs/rustyline/latest/rustyline/) library. https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/bin/toysql.rs#L55-L81 The shell is simply a loop that prompts the user to input a SQL statement: https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/bin/toysql.rs#L216-L250 Each statement is then executed against the server via `toydb::Client::execute`, and the response is formatted and printed as output: https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/bin/toysql.rs#L83-L92 https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/bin/toysql.rs#L175-L204 And with that, we have a fully functional SQL database system and can run queries to our heart's content. Have fun! ---

Server

================================================ FILE: docs/architecture/encoding.md ================================================ # Key/Value Encoding The key/value store uses binary `Vec` keys and values, so we need an encoding scheme to translate between in-memory Rust data structures and the on-disk binary data. This is provided by the [`encoding`](https://github.com/erikgrinaker/toydb/tree/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/encoding) module, with separate schemes for key and value encoding. ## `Bincode` Value Encoding Values are encoded using [Bincode](https://github.com/bincode-org/bincode), a third-party binary encoding scheme for Rust. Bincode is convenient because it can easily encode any arbitrary Rust data type. But we could also have chosen e.g. [JSON](https://en.wikipedia.org/wiki/JSON), [Protobuf](https://protobuf.dev), [MessagePack](https://msgpack.org/), or any other encoding. We won't dwell on the actual binary format here, see the [Bincode specification](https://git.sr.ht/~stygianentity/bincode/tree/trunk/item/docs/spec.md) for details. To use a consistent configuration for all encoding and decoding, we provide helper functions in the [`encoding::bincode`](https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/encoding/bincode.rs) module which use `bincode::config::standard()`. https://github.com/erikgrinaker/toydb/blob/0ce1fb34349fda043cb9905135f103bceb4395b4/src/encoding/bincode.rs#L15-L27 Bincode uses the very common [Serde](https://serde.rs) framework for its API. 
toyDB also provides an `encoding::Value` helper trait for value types which adds automatic `encode()` and `decode()` methods: https://github.com/erikgrinaker/toydb/blob/b57ae6502e93ea06df00d94946a7304b7d60b977/src/encoding/mod.rs#L39-L68 Here's an example of how this can be used to encode and decode an arbitrary `Dog` data type: ```rust #[derive(serde::Serialize, serde::Deserialize)] struct Dog { name: String, age: u8, good_boy: bool, } impl encoding::Value for Dog {} let pluto = Dog { name: "Pluto".into(), age: 4, good_boy: true }; let bytes = pluto.encode(); println!("{bytes:02x?}"); // Outputs [05, 50, 6c, 75, 74, 6f, 04, 01]: // // * Length of string "Pluto": 05. // * String "Pluto": 50 6c 75 74 6f. // * Age 4: 04. // * Good boy: 01 (true). let pluto = Dog::decode(&bytes)?; // gives us back Pluto ``` ## `Keycode` Key Encoding Unlike values, keys can't just use any binary encoding like Bincode. As mentioned in the storage section, the storage engine sorts data by key to enable range scans. The key encoding must therefore preserve the [lexicographical order](https://en.wikipedia.org/wiki/Lexicographic_order) of the encoded values: the binary byte slices must sort in the same order as the original values. As an example of why we can't just use Bincode, consider the strings "house" and "key". These should be sorted in alphabetical order: "house" before "key". However, Bincode encodes strings prefixed by their length, so "key" would be sorted before "house" in binary form: ``` 03 6b 65 79 ← 3 bytes: key 05 68 6f 75 73 65 ← 5 bytes: house ``` For similar reasons, we can't just encode numbers in their native binary form: the [little-endian](https://en.wikipedia.org/wiki/Endianness) representation will order very large numbers before small numbers, and the [sign bit](https://en.wikipedia.org/wiki/Sign_bit) will order positive numbers before negative numbers. This would violate the ordering of natural numbers. 
We also have to be careful with value sequences, which should be ordered element-wise. For example, the pair ("a", "xyz") should be ordered before ("ab", "cd"), so we can't just encode the strings one after the other like "axyz" and "abcd" since that would sort ("ab", "cd") first. toyDB provides an order-preserving encoding called "Keycode" in the [`encoding::keycode`](https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/encoding/keycode.rs) module. Like Bincode, the Keycode encoding is not self-describing: the binary data does not say what the data type is, the caller must provide a type to decode into. It only supports a handful of primitive data types, and only needs to order values of the same type. Keycode is implemented as a [Serde](https://serde.rs) (de)serializer, which requires a lot of boilerplate code to satisfy the trait, but we'll just focus on the actual encoding. The encoding scheme is as follows: * `bool`: `00` for `false` and `01` for `true`. https://github.com/erikgrinaker/toydb/blob/2027641004989355c2162bbd9eeefcc991d6b29b/src/encoding/keycode.rs#L113-L117 * `u64`: the [big-endian](https://en.wikipedia.org/wiki/Endianness) binary encoding. https://github.com/erikgrinaker/toydb/blob/2027641004989355c2162bbd9eeefcc991d6b29b/src/encoding/keycode.rs#L157-L161 * `i64`: the [big-endian](https://en.wikipedia.org/wiki/Endianness) binary encoding, but with the sign bit flipped to order negative numbers before positive ones. https://github.com/erikgrinaker/toydb/blob/2027641004989355c2162bbd9eeefcc991d6b29b/src/encoding/keycode.rs#L131-L143 * `f64`: the [big-endian IEEE 754](https://en.wikipedia.org/wiki/Double-precision_floating-point_format) binary encoding, but with the sign bit flipped, and all bits flipped for negative numbers, to order negative numbers correctly. 
https://github.com/erikgrinaker/toydb/blob/2027641004989355c2162bbd9eeefcc991d6b29b/src/encoding/keycode.rs#L167-L179 * `Vec`: terminated by `00 00`, with `00` escaped as `00 ff` to disambiguate it. https://github.com/erikgrinaker/toydb/blob/2027641004989355c2162bbd9eeefcc991d6b29b/src/encoding/keycode.rs#L190-L205 * `String`: like `Vec`. https://github.com/erikgrinaker/toydb/blob/2027641004989355c2162bbd9eeefcc991d6b29b/src/encoding/keycode.rs#L185-L188 * `Vec`, `[T]`, `(T,)`: the concatenation of the inner values. https://github.com/erikgrinaker/toydb/blob/2027641004989355c2162bbd9eeefcc991d6b29b/src/encoding/keycode.rs#L295-L307 * `enum`: the variant's numerical index as a `u8`, then the inner values (if any). https://github.com/erikgrinaker/toydb/blob/2027641004989355c2162bbd9eeefcc991d6b29b/src/encoding/keycode.rs#L223-L227 Like `encoding::Value`, there is also an `encoding::Key` helper trait: https://github.com/erikgrinaker/toydb/blob/b57ae6502e93ea06df00d94946a7304b7d60b977/src/encoding/mod.rs#L20-L37 Different kinds of keys are usually represented as enums. For example, if we wanted to store cars and video games, we could use: ```rust #[derive(serde::Serialize, serde::Deserialize)] enum Key { Car(String, String, u64), // make, model, year Game(String, u64, Platform), // name, year, platform } #[derive(serde::Serialize, serde::Deserialize)] enum Platform { PC, PS5, Switch, Xbox, } impl encoding::Key for Key {} let returnal = Key::Game("Returnal".into(), 2021, Platform::PS5); let bytes = returnal.encode(); println!("{bytes:02x?}"); // Outputs [01, 52, 65, 74, 75, 72, 6e, 61, 6c, 00, 00, 00, 00, 00, 00, 00, 00, 07, e5, 01]. // // * Key::Game: 01 // * Returnal: 52 65 74 75 72 6e 61 6c 00 00 // * 2021: 00 00 00 00 00 00 07 e5 // * Platform::PS5: 01 let returnal = Key::decode(&bytes)?; ``` Because the keys are sorted in element-wise order, this would allow us to e.g. 
perform a prefix scan to fetch all platforms which Returnal (2021) was released on, or perform a range scan to fetch all models of Nissan Altima released between 2010 and 2015. ---

Storage Engine   |   MVCC Transactions

================================================ FILE: docs/architecture/index.md ================================================ # toyDB Architecture toyDB is a simple distributed SQL database, intended to illustrate how such systems are built. The overall structure is similar to real-world distributed databases, but the design and implementation has been kept as simple as possible for understandability. Performance and scalability are explicit non-goals, as these are major sources of complexity in real-world systems. This guide will walk through toyDB's architecture and code from the bottom up, with plenty of links to the actual source code. > ℹ️ View on GitHub with a desktop browser for inline code listings. * [Overview](overview.md) * [Properties](overview.md#properties) * [Components](overview.md#components) * [Storage Engine](storage.md) * [`Memory` Storage Engine](storage.md#memory-storage-engine) * [`BitCask` Storage Engine](storage.md#bitcask-storage-engine) * [Key/Value Encoding](encoding.md) * [`Bincode` Value Encoding](encoding.md#bincode-value-encoding) * [`Keycode` Key Encoding](encoding.md#keycode-key-encoding) * [MVCC Transactions](mvcc.md) * [Raft Consensus](raft.md) * [Log Storage](raft.md#log-storage) * [State Machine Interface](raft.md#state-machine-interface) * [Node Roles](raft.md#node-roles) * [Node Interface and Communication](raft.md#node-interface-and-communication) * [Leader Election and Terms](raft.md#leader-election-and-terms) * [Client Requests and Forwarding](raft.md#client-requests-and-forwarding) * [Write Replication and Application](raft.md#write-replication-and-application) * [Read Processing](raft.md#read-processing) * [SQL Engine](sql.md) * [Data Model](sql-data.md) * [Data Types](sql-data.md#data-types) * [Schemas](sql-data.md#schemas) * [Expressions](sql-data.md#expressions) * [Storage](sql-storage.md) * [Key/Value Representation](sql-storage.md#keyvalue-representation) * [Schema Catalog](sql-storage.md#schema-catalog) * [Row 
Storage and Transactions](sql-storage.md#row-storage-and-transactions) * [Raft Replication](sql-raft.md) * [Parsing](sql-parser.md) * [Lexer](sql-parser.md#lexer) * [Abstract Syntax Tree](sql-parser.md#abstract-syntax-tree) * [Parser](sql-parser.md#parser) * [Planning](sql-planner.md) * [Execution Plan](sql-planner.md#execution-plan) * [Scope and Name Resolution](sql-planner.md#scope-and-name-resolution) * [Planner](sql-planner.md#planner) * [Optimization](sql-optimizer.md) * [Constant Folding](sql-optimizer.md#constant-folding) * [Filter Pushdown](sql-optimizer.md#filter-pushdown) * [Index Lookups](sql-optimizer.md#index-lookups) * [Hash Join](sql-optimizer.md#hash-join) * [Short Circuiting](sql-optimizer.md#short-circuiting) * [Execution](sql-execution.md) * [Plan Executor](sql-execution.md#plan-executor) * [Session Management](sql-execution.md#session-management) * [Server](server.md) * [Raft Routing](server.md#raft-routing) * [SQL Service](server.md#sql-service) * [`toydb` Binary](server.md#toydb-binary) * [Client](client.md) * [Client Library](client.md#client-library) * [`toysql` Binary](client.md#toysql-binary) ---

Overview

================================================ FILE: docs/architecture/mvcc.md ================================================ # MVCC Transactions Transactions are groups of reads and writes (e.g. to different keys) that are submitted together as a single unit. For example, a bank transaction that transfers $100 from account A to account B might consist of this group of reads and writes: ``` a = get(A) b = get(B) if a < 100: error("insufficient balance") set(A, a - 100) set(B, b + 100) ``` toyDB provides [ACID](https://en.wikipedia.org/wiki/ACID) transactions, a set of very strong guarantees: * **Atomicity:** all of the writes take effect as a single, atomic unit, at the same instant, when they are _committed_. Other users will never see some of the writes without the others. * **Consistency:** database constraints are never violated (e.g. referential integrity or uniqueness constraints). We'll see how this is implemented later in the SQL execution layer. * **Isolation:** users should appear to have the entire database to themselves, unaffected by other simultaneous users. Two transactions may conflict, in which case one has to retry, but if a transaction succeeds then the user knows with certainty that the operations were executed without interference by anyone else. This eliminates the risk of [race conditions](https://en.wikipedia.org/wiki/Race_condition). * **Durability:** committed writes are never lost (even if the system crashes). 
To illustrate how transactions work, here's an example MVCC test script where two concurrent users modify a set of bank accounts (there's many [other test scripts](https://github.com/erikgrinaker/toydb/tree/aa14deb71f650249ce1cab8828ed7bcae2c9206e/src/storage/testscripts/mvcc) there too): https://github.com/erikgrinaker/toydb/blob/a73e24b7e77671b9f466e0146323cd69c3e27bdf/src/storage/testscripts/mvcc/bank#L1-L69 To provide these guarantees, toyDB uses a common technique called [Multi-Version Concurrency Control](https://en.wikipedia.org/wiki/Multiversion_concurrency_control) (MVCC). It is implemented at the key/value storage level, in the [`storage::mvcc`](https://github.com/erikgrinaker/toydb/blob/8f8eae0dcf70b1a0df2e853b1f6600e0c7075340/src/storage/mvcc.rs) module. It uses a `storage::Engine` for actual data storage. https://github.com/erikgrinaker/toydb/blob/8f8eae0dcf70b1a0df2e853b1f6600e0c7075340/src/storage/mvcc.rs#L220-L231 MVCC provides an [isolation level](https://en.wikipedia.org/wiki/Isolation_(database_systems)#Isolation_levels) called [snapshot isolation](https://en.wikipedia.org/wiki/Snapshot_isolation): a transaction sees a snapshot of the database as it was when the transaction began. Any later changes are invisible to it. It does this by storing historical versions of key/value pairs. The version number is simply a number that's incremented for every new transaction: https://github.com/erikgrinaker/toydb/blob/8f8eae0dcf70b1a0df2e853b1f6600e0c7075340/src/storage/mvcc.rs#L155-L158 Each transaction has its own unique version number. When it writes a key/value pair it appends its version number to the key as `Key::Version(&[u8], Version)` (using the Keycode encoding we've seen previously). If an old version of the key already exists, it will have a different version number suffix and therefore be stored as a separate key in the storage engine. Deleted keys are versions with a special tombstone value. 
https://github.com/erikgrinaker/toydb/blob/8f8eae0dcf70b1a0df2e853b1f6600e0c7075340/src/storage/mvcc.rs#L183-L189 Here's a simple diagram of what a history of versions 1 to 5 of keys `a` to `d` might look like: https://github.com/erikgrinaker/toydb/blob/8f8eae0dcf70b1a0df2e853b1f6600e0c7075340/src/storage/mvcc.rs#L11-L26 Additionally, we need to keep track of the currently ongoing (uncommitted) transaction versions, known as the "active set". With versioning and the active set, we can summarize the MVCC protocol with a few simple rules: 1. When a new transaction begins, it: * Obtains the next available version number. * Takes a snapshot of the active set (other uncommitted transactions). * Adds its version number to the active set. 2. When the transaction reads a key, it: * Returns the latest version of the key at or below its own version. * Ignores versions above its own version. * Ignores versions in its active set snapshot. 3. When the transaction writes a key, it: * Looks for a key version above its own version; errors if found. * Looks for a key version in its active set snapshot; errors if found. * Writes a key/value pair with its own version. 4. When the transaction commits, it: * Flushes all writes to disk. * Removes itself from the active set. The magic happens when the transaction removes itself from the active set. This is a single, atomic operation, and when it completes all of its writes immediately become visible to _new_ transactions. However, ongoing transactions still won't see these writes, because the version is still in their active set snapshot or at a later version (hence they are isolated from this transaction). Furthermore, the transaction could see its own uncommitted writes even though no one else could, and if any writes conflicted with another transaction it would error out and have to retry. 
Not only that, this also allows us to do time-travel queries, where we can query the database as it was at any time in the past: we simply pick a version number to read at. There are a few more details that we've left out here: transaction rollbacks need to keep track of the writes and undo them, and read-only queries can avoid allocating new version numbers. We also don't garbage collect old versions, for simplicity. See the module documentation for more details: https://github.com/erikgrinaker/toydb/blob/8f8eae0dcf70b1a0df2e853b1f6600e0c7075340/src/storage/mvcc.rs#L1-L140 Let's walk through a simple example with code pointers to get a feel for how this is implemented. Notice how we don't have to deal with any version numbers when we're using the MVCC API -- this is an internal MVCC implementation detail. ```rust // Open a BitCask database in the file "toy.db" with MVCC support. let path = PathBuf::from("toy.db"); let db = MVCC::new(BitCask::new(path)?); // Begin a new transaction. let txn = db.begin()?; // Read the key "foo", and decode the binary value as a u64 with bincode. let bytes = txn.get(b"foo")?.expect("foo not found"); let mut value: u64 = bincode::deserialize(&bytes)?; // Delete "foo". txn.delete(b"foo")?; // Add 1 to the value, and write it back to the key "bar". value += 1; let bytes = bincode::serialize(&value); txn.set(b"bar", bytes)?; // Commit the transaction. txn.commit()?; ``` First, we begin a new transaction with `MVCC::begin()`, which calls through to `Transaction::begin()`. This obtains a version number stored in `Key::NextVersion` and increments it, then takes a snapshot of the active set in `Key::ActiveSet` and adds itself to it: https://github.com/erikgrinaker/toydb/blob/8f8eae0dcf70b1a0df2e853b1f6600e0c7075340/src/storage/mvcc.rs#L368-L391 This returns a `Transaction` object which provides the main key/value API, with get/set/delete methods. It keeps track of the main state of the transaction: its version number and active set. 
https://github.com/erikgrinaker/toydb/blob/8f8eae0dcf70b1a0df2e853b1f6600e0c7075340/src/storage/mvcc.rs#L294-L327 Next, we call `Transaction::get(b"foo")` to read the value of the key `foo`. This finds the latest version that's visible to us (ignoring future versions and the active set). Recall that we store multiple versions of each key as `Key::Version(key, version)`. The Keycode encoding ensures that all versions are stored in sorted order, so we can do a reverse range scan from `Key::Version(b"foo", self.version)` to `Key::Version(b"foo", 0)` and return the latest version that's visible to us: https://github.com/erikgrinaker/toydb/blob/8f8eae0dcf70b1a0df2e853b1f6600e0c7075340/src/storage/mvcc.rs#L564-L581 We then call `Transaction::delete(b"foo")` and `Transaction::set(b"bar", value)`. Both of these just call through to the same `Transaction::write_version()` method, but use `Some(value)` for a regular key/value pair and `None` as a deletion tombstone: https://github.com/erikgrinaker/toydb/blob/8f8eae0dcf70b1a0df2e853b1f6600e0c7075340/src/storage/mvcc.rs#L514-L522 To write a new version of a key, we first have to check for conflicts by seeing if there's a version of the key that's invisible to us -- if there is, we conflicted with a concurrent transaction. We use a range scan for this, like we did in `Transaction::get()`. If there are no conflicts, we go on to write `Key::Version(b"foo", self.version)` and encode the value as an `Option` to accommodate the `None` tombstone marker. We also write a `Key::TxnWrite(version, key)` to keep track of the keys we've written in case we have to roll back. https://github.com/erikgrinaker/toydb/blob/8f8eae0dcf70b1a0df2e853b1f6600e0c7075340/src/storage/mvcc.rs#L524-L562 Finally, `Transaction::commit()` will make our transaction take effect and become visible. It does this simply by removing itself from the active set in `Key::ActiveSet`, and also cleaning up its `Key::TxnWrite` write tracking. 
As the comment says, we don't actually have to flush to durable storage here, because the Raft log will provide durability for us -- we'll get back to this later. https://github.com/erikgrinaker/toydb/blob/8f8eae0dcf70b1a0df2e853b1f6600e0c7075340/src/storage/mvcc.rs#L466-L485 ---

Key/Value Encoding   |   Raft Consensus

================================================ FILE: docs/architecture/overview.md ================================================ # Overview toyDB consists of a cluster of nodes that execute [SQL](https://en.wikipedia.org/wiki/SQL) transactions against a replicated state machine. Clients can connect to any node in the cluster and submit SQL statements. The cluster remains available if a minority of nodes crash or disconnect, but halts if a majority of nodes fail. ## Properties * **Distributed:** runs across a cluster of nodes. * **Highly available:** tolerates failure of a minority of nodes. * **SQL compliant:** correctly supports most common [SQL](https://en.wikipedia.org/wiki/SQL) features. * **Strongly consistent:** committed writes are immediately visible to all readers ([linearizability](https://en.wikipedia.org/wiki/Linearizability)). * **Transactional:** provides [ACID](https://en.wikipedia.org/wiki/ACID) transactions * **Atomic:** groups of writes are applied as a single, atomic unit. * **Consistent:** database constraints and referential integrity are always enforced. * **Isolated:** concurrent transactions don't affect each other ([snapshot isolation](https://en.wikipedia.org/wiki/Snapshot_isolation)). * **Durable:** committed writes are never lost. For simplicity, toyDB is: * **Not scalable:** every node stores the full dataset, and reads/writes execute on one node. * **Not reliable:** only handles crash failures, not e.g. partial network partitions or node stalls. * **Not performant:** data processing is slow, and not optimized at all. * **Not efficient:** loads entire tables into memory, no compression or garbage collection, etc. * **Not full-featured:** only basic SQL functionality is implemented. * **Not backwards compatible:** changes to data formats and protocols will break databases. * **Not flexible:** nodes can't be added or removed while running, and take a long time to join. 
* **Not secure:** there is no authentication, authorization, nor encryption. ## Components Internally, toyDB is made up of a few main components: * **Storage engine:** stores data on disk and manages transactions. * **Raft consensus engine:** replicates data and coordinates cluster nodes. * **SQL engine:** organizes SQL data, manages SQL sessions, and executes SQL statements. * **Server:** manages network communication, both with SQL clients and Raft nodes. * **Client:** provides a SQL user interface and communicates with the server. This diagram illustrates the internal structure of a single toyDB node: ![toyDB architecture](./images/architecture.svg) We will go through each of these components from the bottom up. ---

toyDB Architecture   |   Storage Engine

================================================ FILE: docs/architecture/raft.md ================================================ # Raft Consensus [Raft](https://raft.github.io) is a distributed consensus protocol which replicates data across a cluster of nodes in a consistent and durable manner. It is described in the very readable [Raft paper](https://raft.github.io/raft.pdf), and in the more comprehensive [Raft thesis](https://web.stanford.edu/~ouster/cgi-bin/papers/OngaroPhD.pdf). The toyDB Raft implementation is in the [`raft`](https://github.com/erikgrinaker/toydb/tree/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/raft) module, and is described in the module documentation: https://github.com/erikgrinaker/toydb/blob/d96c6dd5ae7c0af55ee609760dcd958c289a44f2/src/raft/mod.rs#L1-L240 Raft is fundamentally the same protocol as [Paxos](https://lamport.azurewebsites.net/pubs/paxos-simple.pdf) and [Viewstamped Replication](https://pmg.csail.mit.edu/papers/vr-revisited.pdf), but an opinionated variant designed to be simple, understandable, and practical. It is widely used in the industry: [CockroachDB](https://www.cockroachlabs.com), [TiDB](https://www.pingcap.com), [etcd](https://etcd.io), [Consul](https://developer.hashicorp.com/consul), and many others. Briefly, Raft elects a leader node which coordinates writes and replicates them to followers. Once a majority (>50%) of nodes have acknowledged a write, it is considered durably committed. It is common for the leader to also serve reads, since it always has the most recent data and is thus strongly consistent. A cluster must have a majority of nodes (known as a [quorum](https://en.wikipedia.org/wiki/Quorum_(distributed_computing))) live and connected to remain available, otherwise it will not commit writes in order to guarantee data consistency and durability. 
Since there can only be one majority in the cluster, this prevents a [split brain](https://en.wikipedia.org/wiki/Split-brain_(computing)) scenario where two active leaders can exist concurrently (e.g. during a [network partition](https://en.wikipedia.org/wiki/Network_partition)) and store conflicting values. The Raft leader appends writes to an ordered command log, which is then replicated to followers. Once a majority has replicated the log up to a given entry, that log prefix is committed and then applied to a state machine. This ensures that all nodes will apply the same commands in the same order and eventually reach the same state (assuming the commands are deterministic). Raft itself doesn't care what the state machine and commands are, but in toyDB's case it's SQL tables and rows stored in an MVCC key/value store. This diagram from the Raft paper illustrates how a Raft node receives a command from a client (1), adds it to its log and reaches consensus with other nodes (2), then applies it to its state machine (3) before returning a result to the client (4): Raft node You may notice that Raft is not very scalable, since all reads and writes go via the leader node, and every node must store the entire dataset. Raft solves replication and availability, but not scalability. Real-world systems typically provide horizontal scalability by splitting a large dataset across many separate Raft clusters (i.e. sharding), but this is out of scope for toyDB. For simplicity, toyDB implements the bare minimum of Raft, and omits optimizations described in the paper such as state snapshots, log truncation, leader leases, and more. The implementation is in the [`raft`](https://github.com/erikgrinaker/toydb/blob/d96c6dd5ae7c0af55ee609760dcd958c289a44f2/src/raft/mod.rs) module, and we'll walk through the main components next. 
There is a comprehensive set of Raft test scripts in [`src/raft/testscripts/node`](https://github.com/erikgrinaker/toydb/blob/386153f5c00cb1a88b1ac8489ae132674d96f68a/src/raft/testscripts/node), which illustrate the protocol in a wide variety of scenarios. ## Log Storage Raft replicates an ordered command log consisting of `raft::Entry`: https://github.com/erikgrinaker/toydb/blob/90a6cae47ac20481ac4eb2f20eea50f02e6c2b33/src/raft/log.rs#L10-L28 `index` specifies the position in the log, and `command` contains the binary command to apply to the state machine. The `term` identifies the leadership term in which the command was proposed: a new term begins when a new leader election is held (we'll get back to this later). Entries are appended to the log by the leader and replicated to followers. Once acknowledged by a quorum, the log up to that index is committed and will never change. Entries that are not yet committed may be replaced or removed if the leader changes. The Raft log enforces the following invariants: https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/log.rs#L80-L91 `raft::Log` implements a Raft log, and stores log entries in a `storage::Engine` key/value store: https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/log.rs#L43-L116 It also stores some additional metadata that we'll need later: the current term, vote, and commit index. These are stored as separate keys: https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/log.rs#L30-L39 Individual entries are appended to the log via `Log::append`, typically when the leader wants to replicate a new write: https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/log.rs#L190-L203 Entries can also be appended in bulk via `Log::splice`, typically when entries are replicated to followers. This also allows replacing existing uncommitted entries, e.g. 
after a leader change: https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/log.rs#L269-L343 Committed entries are marked by `Log::commit`, making them immutable and eligible for state machine application: https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/log.rs#L205-L222 The log also has methods to read entries from the log, either individually as `Log::get` or by iterating over a range with `Log::scan`: https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/log.rs#L224-L267 ## State Machine Interface Raft doesn't know or care what the log commands are, nor what the state machine does with them. It simply takes `raft::Entry` from the log and gives them to the state machine. The Raft state machine is represented by the `raft::State` trait. Raft will ask about the last applied entry via `State::get_applied_index`, and feed it newly committed entries via `State::apply`. It also allows reads via `State::read`, but we'll get back to that later. https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/state.rs#L4-L51 The state machine does not have to flush its state to durable storage after each transition; on node crashes, the state machine is allowed to regress, and will be caught up by replaying the unapplied log entries. It is also possible to implement a purely in-memory state machine (and in fact, toyDB allows running the state machine with a `Memory` storage engine). The state machine must take care to be deterministic: the same commands applied in the same order must result in the same state across all nodes. This means that a command can't e.g. read the current time or generate a random number -- these values must be included in the command. It also means that non-deterministic errors, such as an IO error, must halt command application (in toyDB's case, we just panic and crash the node). 
In toyDB, the state machine is an MVCC key/value store that stores SQL tables and rows, as we'll see in the SQL Raft replication section. ## Node Roles In Raft, a node can have one of three roles: * **Leader:** replicates writes to followers and serves client requests. * **Follower:** replicates writes from a leader. * **Candidate:** campaigns for leadership. The Raft paper summarizes these roles and transitions in the following diagram (we'll discuss leader election in detail below): Raft states In toyDB, a node is represented by the `raft::Node` enum, with variants for each state: https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L47-L66 This wraps the `raft::RawNode` type which contains the inner node state. It is generic over the role, and uses the [typestate pattern](http://cliffle.com/blog/rust-typestate/) to provide methods and transitions depending on the node's current role. This enforces state transitions and invariants at compile time via Rust's type system -- for example, only `RawNode<Candidate>` has an `into_leader()` method, since only candidates can transition to leaders (when they win an election). https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L156-L177 The `RawNode::role` field contains role-specific state as structs implementing the `Role` marker trait: https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L661-L680 https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L242-L255 https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L523-L531 We'll see what the various fields are used for in the following sections. ## Node Interface and Communication The `raft::Node` enum has two main methods that drive the node: `tick()` and `step()`.
These consume the current node and return a new node, possibly with a different role. `tick()` advances time by a logical tick. This is used to measure the passage of time, e.g. to trigger election timeouts or periodic leader heartbeats. toyDB uses a tick interval of 100 milliseconds (see `raft::TICK_INTERVAL`), and will call `tick()` on the node at this rate. https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L125-L132 `step()` processes an inbound message from a different node or client: https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L107-L123 Outbound messages to other nodes are sent via the `RawNode::tx` channel: https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L171-L172 Nodes are identified by a unique node ID, which is given at node startup: https://github.com/erikgrinaker/toydb/blob/90a6cae47ac20481ac4eb2f20eea50f02e6c2b33/src/raft/node.rs#L17-L18 Messages are wrapped in a `raft::Envelope` specifying the sender and recipient: https://github.com/erikgrinaker/toydb/blob/d96c6dd5ae7c0af55ee609760dcd958c289a44f2/src/raft/message.rs#L10-L21 The envelope contains a `raft::Message`, an enum which encodes the Raft message protocol. We won't dwell on the specific message types here, but discuss them individually in the following sections. Raft does not require reliable message delivery, so messages may be dropped or reordered at any time, although toyDB's use of TCP provides stronger delivery guarantees. https://github.com/erikgrinaker/toydb/blob/d96c6dd5ae7c0af55ee609760dcd958c289a44f2/src/raft/message.rs#L25-L152 This is an entirely synchronous and deterministic model -- the same sequence of calls on a given node in a given initial state will always produce the same result. This is very convenient for testing and understandability.
We will see in the server section how toyDB drives the node on a separate thread, provides a network transport for messages, and ticks it at regular intervals. ## Leader Election and Terms In the steady state, Raft simply has a leader which replicates writes to followers. But to reach this steady state, we must elect a leader, which is where much of the subtle complexity lies. See the Raft paper for comprehensive details and safety arguments; we'll summarize it briefly below. Raft divides time into _terms_. The term is a monotonically increasing number starting at 1. There can only be one leader in a term (or none if an election fails), and the term can never regress. Replicated commands belong to the specific term under which they were proposed. https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L20-L21 Let's walk through an election, where we bootstrap a brand new, empty toyDB cluster with 3 nodes. Nodes are initialized by calling `Node::new()`. Since this is a new cluster, they are given an empty `raft::Log` and `raft::State`, at term 0. Nodes start with role `Follower`, but without a leader. https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L68-L87 https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L266-L290 Now, nothing really happens for a while, as the nodes are waiting to maybe hear from an existing leader (there is none). Every 100 ms we call `tick()`, until we reach `election_timeout`: https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L489-L497 Notice how `new()` set `election_timeout` to a random value (in the range `ELECTION_TIMEOUT_RANGE` of 10-20 ticks, i.e. 1-2 seconds). If all nodes had the same timeout, they would likely campaign for leadership simultaneously, resulting in an election tie -- Raft uses randomized election timeouts to avoid such ties.
Once a node reaches `election_timeout` it transitions to role `Candidate`: https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L292-L312 When it becomes a candidate it campaigns for leadership by increasing its term to 1, voting for itself, and sending `Message::Campaign` to all peers asking for their vote: https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L647-L658 In Raft, the term can't regress, and a node can only cast a single vote in each term (even across restarts), so both of these are persisted to disk via `Log::set_term_vote()`. When the two other nodes (still in state `Follower`) receive the `Message::Campaign` asking for a vote, they will first increase their term to 1 (since this is a newer term than their local term 0): https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L347-L351 They then grant the vote since they haven't yet voted for anyone else in term 1. They persist the vote to disk via `Log::set_term_vote()` and return a `Message::CampaignResponse { vote: true }` to the candidate: https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L424-L449 They also check that the candidate's log is at least as long as theirs, which is trivially true in this case since the log is empty. This is necessary to ensure that a leader has all committed entries (see section 5.4.1 in the Raft paper). When the candidate receives the `Message::CampaignResponse` it records the vote from each node. Once it has a quorum (in this case 2 out of 3 votes including its own vote) it becomes leader in term 1: https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L599-L606 When it becomes leader, it sends a `Message::Heartbeat` to all peers to tell them it is now the leader in term 1. 
It also appends an empty entry to its log and replicates it, but we will ignore this for now (see section 5.4.2 in the Raft paper for why). https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L563-L583 When the other nodes receive the heartbeat, they become followers of the new leader in its term: https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L359-L384 From now on, the leader will send periodic `Message::Heartbeat` every 4 ticks (see `HEARTBEAT_INTERVAL`) to assert its leadership: https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L945-L953 The followers record when they last received any message from the leader (including heartbeats), and will hold a new election if they haven't heard from the leader in an election timeout (e.g. due to a leader crash or network partition): https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L353-L356 https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L489-L497 This entire process is illustrated in the test script [`election`](https://github.com/erikgrinaker/toydb/blob/cb234a0b776484608118fd9382869ee5bc30d4f0/src/raft/testscripts/node/election), along with several other test scripts that show e.g. [election ties](https://github.com/erikgrinaker/toydb/blob/cb234a0b776484608118fd9382869ee5bc30d4f0/src/raft/testscripts/node/election_tie), [contested elections](https://github.com/erikgrinaker/toydb/blob/cb234a0b776484608118fd9382869ee5bc30d4f0/src/raft/testscripts/node/election_contested), and other scenarios: https://github.com/erikgrinaker/toydb/blob/cb234a0b776484608118fd9382869ee5bc30d4f0/src/raft/testscripts/node/election#L1-L72 ## Client Requests and Forwarding Once a leader has been elected, we can submit read and write requests to it. 
This is done by stepping a `Message::ClientRequest` into the node using the local node ID, with a unique request ID (toyDB uses UUIDv4), and waiting for an outbound response message with the same ID: https://github.com/erikgrinaker/toydb/blob/d96c6dd5ae7c0af55ee609760dcd958c289a44f2/src/raft/message.rs#L134-L151 https://github.com/erikgrinaker/toydb/blob/d96c6dd5ae7c0af55ee609760dcd958c289a44f2/src/raft/message.rs#L164-L188 The requests and responses themselves are arbitrary binary data which is interpreted by the state machine. For our purposes here, let's pretend the requests are: * `Request::Write("key=value")` → `Response::Write("ok")` * `Request::Read("key")` → `Response::Read("value")` The fundamental difference between read and write requests is that write requests are replicated through Raft and executed on all nodes, while read requests are only executed on the leader without being appended to the log. It would be possible to execute reads on followers too, for load balancing, but these reads would be eventually consistent and thus violate linearizability, so toyDB only executes reads on the leader. If a request is submitted to a follower, it will be forwarded to the leader and the response forwarded back to the client (distinguished by the sender/recipient node ID -- a local client always uses the local node ID): https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L451-L474 For simplicity, we cancel the request with `Error::Abort` if a request is submitted to a candidate, and similarly if a follower changes its role to candidate or discovers a new leader. We could have held on to these and redirected them to a new leader, but we keep it simple and ask the client to retry. We'll look at the actual read and write request processing next. ## Write Replication and Application When the leader receives a write request, it proposes the command for replication to followers.
It keeps track of the in-flight write and its log entry index in `writes`, such that it can respond to the client with the command result once the entry has been committed and applied. https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L895-L904 To propose the command, the leader appends it to its log and sends a `Message::Append` to each follower to replicate it to their logs: https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L966-L980 In steady state, `Message::Append` just contains the single log entry we appended above: https://github.com/erikgrinaker/toydb/blob/d96c6dd5ae7c0af55ee609760dcd958c289a44f2/src/raft/message.rs#L87-L108 However, sometimes followers may be lagging behind the leader (e.g. after a crash), or their log may have diverged from the leader (e.g. unsuccessful proposals from a stale leader after a network partition). To handle these cases, the leader tracks the replication progress of each follower as `raft::Progress`: https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L682-L698 We'll gloss over these cases here (see the Raft paper and the code in `raft::Progress` and `maybe_send_append()` for details). In the steady state, where each entry is successfully appended and replicated one at a time, `maybe_send_append()` will fall through to the bottom and send a single entry: https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L1068-L1128 The `Message::Append` contains the index/term of the entry immediately before the new entry as `base_index` and `base_term`. If the follower's log also contains an entry with this index and term then its log is guaranteed to match (be equal to) the leader's log up to this entry (see section 5.3 in the Raft paper). 
The follower can then append the new log entry and return a `Message::AppendResponse` confirming that the entry was appended and that its log matches the leader's log up to `match_index`: https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L386-L410 When the leader receives the `Message::AppendResponse`, it will update its view of the follower's `match_index`. https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L844-L858 https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L701-L710 Once a quorum of nodes (in our case 2 out of 3 including the leader) have the entry in their log, the leader can commit the entry and apply it to the state machine. It also looks up the in-flight write request from `writes` and sends the command result back to the client as `Message::ClientResponse`: https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L982-L1032 The leader will also propagate the new commit index to followers via the next heartbeat, so that they can also apply any pending log entries to their state machine. This isn't strictly necessary, since reads are executed on the leader and nodes have to apply pending entries before becoming leaders, but we do it anyway so that they don't fall too far behind on application. 
https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L359-L384 This process is illustrated in the test scripts [`append`](https://github.com/erikgrinaker/toydb/blob/cb234a0b776484608118fd9382869ee5bc30d4f0/src/raft/testscripts/node/append) and [`heartbeat_commits_follower`](https://github.com/erikgrinaker/toydb/blob/cb234a0b776484608118fd9382869ee5bc30d4f0/src/raft/testscripts/node/heartbeat_commits_follower) (along with many other scenarios): https://github.com/erikgrinaker/toydb/blob/cb234a0b776484608118fd9382869ee5bc30d4f0/src/raft/testscripts/node/append#L1-L43 https://github.com/erikgrinaker/toydb/blob/cb234a0b776484608118fd9382869ee5bc30d4f0/src/raft/testscripts/node/heartbeat_commits_follower#L1-L50 ## Read Processing For linearizable (aka strongly consistent) reads, we must execute read requests on the leader, as mentioned above. However, this is not sufficient: under e.g. a network partition, a node may think it's still the leader while in fact a different leader has been elected elsewhere (in a later term) and executed writes there. To handle this case, the leader must confirm that it is still the leader for each read, by sending a `Message::Read` to its followers containing a read sequence number. Only if a quorum confirms that it is still the leader can the read be executed. This incurs an additional network roundtrip, which is clearly inefficient, so real-world systems often use leader leases instead (see section 6.4.1 of the Raft _thesis_, not the paper) -- but it's fine for toyDB. 
https://github.com/erikgrinaker/toydb/blob/d96c6dd5ae7c0af55ee609760dcd958c289a44f2/src/raft/message.rs#L125-L132 When the leader receives the read request, it increments the read sequence number, stores the pending read request in `reads`, and sends a `Message::Read` to all followers: https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L906-L917 When the followers receive the `Message::Read`, they simply respond with a `Message::ReadResponse` if it's from their current leader (messages from stale terms are ignored): https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L342-L346 https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L412-L422 When the leader receives the `Message::ReadResponse` it records it in the peer's `Progress`, and executes the read once a quorum have confirmed the sequence number: https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L860-L866 https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L1034-L1066 We now have a Raft-managed state machine with replicated writes and linearizable reads. ---

MVCC Transactions   |   SQL Engine

================================================ FILE: docs/architecture/server.md ================================================ # Server Now that we've gone over the individual components, we'll tie them all together in the toyDB server `toydb::Server`, located in the [`server`](https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/server.rs) module. The server wraps an inner Raft node `raft::Node`, which manages the SQL state machine, and is responsible for routing network traffic between the Raft node, its Raft peers, and SQL clients. https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/server.rs#L27-L44 For network protocol, the server uses the Bincode encoding that we've discussed in the encoding section, sent over a TCP connection. There's no need for any further framing, since Bincode knows how many bytes to expect for each message depending on the type it's decoding into. The server does not use [async Rust](https://rust-lang.github.io/async-book/) and e.g. [Tokio](https://tokio.rs), instead opting for regular OS threads. Async Rust can significantly complicate the code, which would obscure the main concepts, and any efficiency gains would be entirely irrelevant for toyDB. Internally in the server, messages are passed around between threads using [Crossbeam channels](https://docs.rs/crossbeam/latest/crossbeam/channel/index.html). The main server loop `Server::serve()` listens for inbound TCP connections on port 9705 for Raft peers and 9605 for SQL clients, and spawns threads to process them. We'll look at Raft and SQL services separately. https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/server.rs#L66-L110 ## Raft Routing The heart of the server is the Raft processing thread `Server::raft_route()`. 
This is responsible for periodically ticking the Raft node via `raft::Node::tick()`, stepping inbound messages from Raft peers into the node via `raft::Node::step()`, and sending outbound messages to peers. It also takes inbound Raft client requests from the `sql::engine::Raft` SQL engine, steps them into the Raft node via `raft::Node::step()`, and passes responses back to the appropriate client as the node emits them. https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/server.rs#L169-L249 When the node starts up, it spawns a `Server::raft_send_peer()` thread for each Raft peer to send outbound messages to them. https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/server.rs#L84-L91 These threads continually attempt to connect to the peer via TCP, and then read any outbound `raft::Envelope(raft::Message)` messages from `Server::raft_route()` via a channel and write the messages into the TCP connection using Bincode: https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/server.rs#L146-L167 The server also continually listens for inbound Raft TCP connections from peers in `Server::raft_accept()`: https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/server.rs#L112-L134 When an inbound connection is accepted, a `Server::raft_receive_peer()` thread is spawned that reads Bincode-encoded `raft::Envelope(raft::Message)` messages from the TCP connection and sends them to `Server::raft_route()` via a channel. https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/server.rs#L136-L144 The Raft cluster is now fully connected, and the nodes can all talk to each other. ## SQL Service Next, let's serve some SQL clients. The SQL service uses the enums `toydb::Request` and `toydb::Response` as a client protocol, again Bincode-encoded over TCP.
The primary request type is `Request::Execute` which executes a SQL statement against a `sql::execution::Session` and returns a `sql::execution::StatementResult`, as we've seen previously. https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/server.rs#L312-L337 The server sets up a `sql::engine::Raft` SQL engine, with a Crossbeam channel that's used to send `raft::Request` Raft client requests to `Server::raft_route()` and onwards to the local `raft::Node`. It then spawns a `Server::sql_accept()` thread to listen for inbound SQL client connections: https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/server.rs#L104-L106 When a SQL client connection is accepted, a new client session `sql::execution::Session` is set up for the client, and we spawn a `Server::sql_session()` thread to serve the connection: https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/server.rs#L251-L272 These session threads continually read `Request` messages from the client, execute them against the SQL session (and ultimately the Raft node), before sending a `Response` back to the client. https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/server.rs#L274-L309 ## `toydb` Binary The `toydb` binary in `src/bin/toydb.rs` launches the server, and is a thin wrapper around `toydb::Server`. 
It is a tiny [`clap`](https://docs.rs/clap/latest/clap/) command: https://github.com/erikgrinaker/toydb/blob/8f8eae0dcf70b1a0df2e853b1f6600e0c7075340/src/bin/toydb.rs#L82-L89 It first parses a server configuration from the `toydb.yaml` file: https://github.com/erikgrinaker/toydb/blob/8f8eae0dcf70b1a0df2e853b1f6600e0c7075340/src/bin/toydb.rs#L30-L59 Then it initializes the Raft log storage and SQL state machine: https://github.com/erikgrinaker/toydb/blob/8f8eae0dcf70b1a0df2e853b1f6600e0c7075340/src/bin/toydb.rs#L105-L133 And finally it launches the `toydb::Server`: https://github.com/erikgrinaker/toydb/blob/8f8eae0dcf70b1a0df2e853b1f6600e0c7075340/src/bin/toydb.rs#L135-L137 toyDB is now up and running! ---

SQL Execution   |   Client

================================================ FILE: docs/architecture/sql-data.md ================================================ # SQL Data Model The SQL data model represents user data in tables and rows. It is made up of data types and schemas, in the [`sql::types`](https://github.com/erikgrinaker/toydb/tree/686d3971a253bfc9facc2ba1b0e716cff5c109fb/src/sql/types) module. ## Data Types toyDB supports four basic scalar data types as `sql::types::DataType`: booleans, integers, floats, and strings. https://github.com/erikgrinaker/toydb/blob/b2fe7b76ee634ca6ad31616becabfddb1c03d34b/src/sql/types/value.rs#L15-L27 Specific values are represented as `sql::types::Value`, using the corresponding Rust types. toyDB also supports SQL `NULL` values, i.e. unknown values, following the rules of [three-valued logic](https://en.wikipedia.org/wiki/Three-valued_logic). https://github.com/erikgrinaker/toydb/blob/b2fe7b76ee634ca6ad31616becabfddb1c03d34b/src/sql/types/value.rs#L40-L64 The `Value` type provides basic formatting, conversion, and mathematical operations. https://github.com/erikgrinaker/toydb/blob/686d3971a253bfc9facc2ba1b0e716cff5c109fb/src/sql/types/value.rs#L68-L79 https://github.com/erikgrinaker/toydb/blob/686d3971a253bfc9facc2ba1b0e716cff5c109fb/src/sql/types/value.rs#L164-L370 It also specifies comparison and ordering semantics, but these are subtly different from the SQL semantics. For example, in Rust code `Value::Null == Value::Null` yields `true`, while in SQL `NULL = NULL` yields `NULL`. This mismatch is necessary for the Rust code to properly detect and process `Null` values, and the desired SQL semantics are implemented during expression evaluation which we'll cover below. 
https://github.com/erikgrinaker/toydb/blob/b2fe7b76ee634ca6ad31616becabfddb1c03d34b/src/sql/types/value.rs#L91-L162 During execution, a row of values is represented as `sql::types::Row`, with multiple rows emitted via `sql::types::Rows` row iterators: https://github.com/erikgrinaker/toydb/blob/b2fe7b76ee634ca6ad31616becabfddb1c03d34b/src/sql/types/value.rs#L378-L388 ## Schemas toyDB schemas only support tables. There are no named indexes or constraints, and there's only a single unnamed database. Tables are represented by `sql::types::Table`: https://github.com/erikgrinaker/toydb/blob/c2b0f7f1d6cbf6e2cdc09fc0aec7b050e840ec21/src/sql/types/schema.rs#L12-L25 A table is made up of a set of columns, represented by `sql::types::Column`. These support the data types described above, along with unique constraints, foreign keys, and secondary indexes. https://github.com/erikgrinaker/toydb/blob/c2b0f7f1d6cbf6e2cdc09fc0aec7b050e840ec21/src/sql/types/schema.rs#L29-L53 The table name serves as a unique identifier, and can't be changed later. In fact, table schemas are entirely static: they can only be created or dropped (there are no schema changes). Table schemas are stored in the catalog, represented by the `sql::engine::Catalog` trait. We'll revisit the implementation of this trait in the SQL storage section. https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/sql/engine/engine.rs#L60-L79 Table schemas are validated when created via `Table::validate()`, which enforces invariants and internal consistency. It uses the catalog to look up information about other tables, e.g. that foreign key references point to a valid target column in a different table. https://github.com/erikgrinaker/toydb/blob/c2b0f7f1d6cbf6e2cdc09fc0aec7b050e840ec21/src/sql/types/schema.rs#L98-L170 Table rows are validated via `Table::validate_row()`, which ensures that a `sql::types::Row` conforms to the schema (e.g. that value types match the column data types).
It uses a `sql::engine::Transaction` to look up other rows in the database, e.g. to check for primary key conflicts (we'll get back to this later). https://github.com/erikgrinaker/toydb/blob/c2b0f7f1d6cbf6e2cdc09fc0aec7b050e840ec21/src/sql/types/schema.rs#L172-L236 ## Expressions During SQL execution, we also have to model _expressions_, such as `1 + 2 * 3`. These are represented as values and operations on them, and can be nested as a tree to represent compound operations. https://github.com/erikgrinaker/toydb/blob/9419bcf6aededf0e20b4e7485e2a5fa3e975d79f/src/sql/types/expression.rs#L11-L64 For example, the expression `1 + 2 * 3` (taking [precedence](https://en.wikipedia.org/wiki/Order_of_operations) into account) is represented as: ```rust // + // / \ // 1 * // / \ // 2 3 Expression::Add( Expression::Constant(Value::Integer(1)), Expression::Multiply( Expression::Constant(Value::Integer(2)), Expression::Constant(Value::Integer(3)), ), ) ``` An `Expression` can contain two kinds of values: constant values as `Expression::Constant(sql::types::Value)`, and dynamic values as `Expression::Column(usize)` column references. The latter will fetch a `sql::types::Value` from a `sql::types::Row` at the specified index during evaluation. We'll see later how the SQL parser and planner transform text expressions like `1 + 2 * 3` into an `Expression`, and how they resolve column names to row indexes, e.g. `price * 0.25` to `row[3] * 0.25`. Expressions are evaluated recursively via `Expression::evaluate()`, given a `sql::types::Row` with input values for column references, and return a final `sql::types::Value` result: https://github.com/erikgrinaker/toydb/blob/9419bcf6aededf0e20b4e7485e2a5fa3e975d79f/src/sql/types/expression.rs#L73-L208 Many of the comparison operations like `==` are implemented explicitly here instead of using `sql::types::Value` comparisons.
This is where we implement the SQL semantics of special values like `NULL`, such that `NULL = NULL` yields `NULL` instead of `TRUE`. For mathematical operations however, we generally dispatch to these methods on `sql::types::Value`: https://github.com/erikgrinaker/toydb/blob/b2fe7b76ee634ca6ad31616becabfddb1c03d34b/src/sql/types/value.rs#L185-L295 Expression parsing and evaluation is tested via test scripts in [`sql/testscripts/expressions`](https://github.com/erikgrinaker/toydb/tree/9419bcf6aededf0e20b4e7485e2a5fa3e975d79f/src/sql/testscripts/expressions). ---

SQL Engine   |   SQL Storage

================================================ FILE: docs/architecture/sql-execution.md ================================================ # SQL Execution Now that the planner and optimizer have done all the hard work of figuring out how to execute a query, it's time to actually execute it. ## Plan Executor Plan execution is done by `sql::execution::Executor` in the [`sql::execution`](https://github.com/erikgrinaker/toydb/tree/9419bcf6aededf0e20b4e7485e2a5fa3e975d79f/src/sql/execution) module, using a `sql::engine::Transaction` to access the SQL storage engine. https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/execution/executor.rs#L14-L49 The executor takes a `sql::planner::Plan` as input, and will return an `ExecutionResult` depending on the statement type. https://github.com/erikgrinaker/toydb/blob/686d3971a253bfc9facc2ba1b0e716cff5c109fb/src/sql/execution/executor.rs#L331-L339 When executing the plan, the executor will branch off depending on the statement type: https://github.com/erikgrinaker/toydb/blob/686d3971a253bfc9facc2ba1b0e716cff5c109fb/src/sql/execution/executor.rs#L57-L101 We'll focus on `SELECT` queries here, which are the most interesting. toyDB uses the iterator model (also known as the volcano model) for query execution. In the case of a `SELECT` query, the result is a row iterator, and pulling from this iterator by calling `next()` will drive the entire execution pipeline by recursively calling `next()` on the child nodes' row iterators. This maps very naturally onto Rust's iterators, and we leverage these to construct the execution pipeline as nested iterators. Execution itself is fairly straightforward, since we're just doing exactly what the planner tells us to do in the plan. We call `Executor::execute_node` recursively on each `sql::planner::Node`, starting with the root node.
Each node returns a result row iterator that the parent node can pull its input rows from, process them, and output the resulting rows via its own row iterator (with the root node's iterator being returned to the caller): https://github.com/erikgrinaker/toydb/blob/686d3971a253bfc9facc2ba1b0e716cff5c109fb/src/sql/execution/executor.rs#L103-L104 `Executor::execute_node()` will simply look at the type of `Node`, recursively call `Executor::execute_node()` on any child nodes, and then process the rows accordingly. https://github.com/erikgrinaker/toydb/blob/686d3971a253bfc9facc2ba1b0e716cff5c109fb/src/sql/execution/executor.rs#L103-L212 We won't discuss every plan node in detail, but let's consider the movie plan we've looked at previously: ``` Select └─ Order: movies.released desc └─ Projection: movies.title, movies.released, genres.name as genre └─ HashJoin: inner on movies.genre_id = genres.id ├─ Scan: movies (released >= 2000) └─ Scan: genres ``` We'll recursively call `execute_node()` until we end up in the two `Scan` nodes. These simply call through to the SQL engine (either using Raft or local disk) via `Transaction::scan()`, passing in the scan predicate if any, and return the resulting row iterator: https://github.com/erikgrinaker/toydb/blob/686d3971a253bfc9facc2ba1b0e716cff5c109fb/src/sql/execution/executor.rs#L203-L204 `HashJoin` will then join the output rows from the `movies` and `genres` iterators by using a hash join. 
This builds an in-memory table for `genres` and then iterates over `movies`, joining the rows: https://github.com/erikgrinaker/toydb/blob/686d3971a253bfc9facc2ba1b0e716cff5c109fb/src/sql/execution/executor.rs#L128-L141 https://github.com/erikgrinaker/toydb/blob/889aef9f24c0fa4d58e314877fa17559a9f3d5d2/src/sql/execution/join.rs#L103-L183 The `Projection` node will simply evaluate the (trivial) column expressions using each joined row as input: https://github.com/erikgrinaker/toydb/blob/686d3971a253bfc9facc2ba1b0e716cff5c109fb/src/sql/execution/executor.rs#L179-L186 And finally the `Order` node will sort the results (which requires buffering them all in memory): https://github.com/erikgrinaker/toydb/blob/686d3971a253bfc9facc2ba1b0e716cff5c109fb/src/sql/execution/executor.rs#L173-L177 https://github.com/erikgrinaker/toydb/blob/686d3971a253bfc9facc2ba1b0e716cff5c109fb/src/sql/execution/executor.rs#L298-L328 The output row iterator of `Order` is returned via `ExecutionResult::Select`, and the caller can now go ahead and pull the resulting rows from it. ## Session Management The entry point to the SQL engine is the `sql::execution::Session`, which represents a single user session. It is obtained via `sql::engine::Engine::session()`. https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/sql/execution/session.rs#L14-L21 The session takes a series of raw SQL statement strings as input and parses them: https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/sql/execution/session.rs#L29-L33 For each statement, it returns a result depending on the kind of statement: https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/sql/execution/session.rs#L132-L148 The session itself performs transaction control. It handles `BEGIN`, `COMMIT`, and `ROLLBACK` statements, and modifies the transaction accordingly. 
https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/sql/execution/session.rs#L34-L70 Any other statements are processed by the SQL planner, optimizer, and executor as we've seen in previous sections. https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/sql/execution/session.rs#L77-L83 These statements are always executed using the session's current transaction. If there is no active transaction, the session will create a new, implicit transaction for each statement. https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/sql/execution/session.rs#L87-L112 And with that, we have a fully functional SQL engine! ---

SQL Optimization   |   Server

================================================ FILE: docs/architecture/sql-optimizer.md ================================================ # SQL Optimization [Query optimization](https://en.wikipedia.org/wiki/Query_optimization) attempts to improve query performance and efficiency by altering the execution plan. This is a deep and complex field, and we can only scratch the surface here. toyDB's query optimizer is very basic -- it only has a handful of rudimentary heuristic optimizations to illustrate how the process works. Real-world optimizers use much more sophisticated methods, including statistical analysis, cost estimation, adaptive execution, etc. The optimizers are located in the [`sql::planner::optimizer`](https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/planner/optimizer.rs) module. An optimizer `sql::planner::Optimizer` just takes in a plan node `sql::planner::Node` (the root node in the plan), and returns an optimized node: https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/planner/optimizer.rs#L20-L25 Optimizations are always implemented as recursive node transformations. To help with this, `Node` has the helper methods `Node::transform` and `Node::transform_expressions` which recurse into a node or expression tree and call a given transformation closure on each node, as either [pre-order](https://en.wikipedia.org/wiki/Tree_traversal#Pre-order,_NLR) or [post-order](https://en.wikipedia.org/wiki/Tree_traversal#Post-order,_LRN) transforms: https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/planner/plan.rs#L269-L371 A technique that's often useful during optimization is to convert expressions into [conjunctive normal form](https://en.wikipedia.org/wiki/Conjunctive_normal_form), i.e. "an AND of ORs". 
For example, the two following expressions are equivalent, but the latter is in conjunctive normal form (it's a chain of ANDs): ``` (a AND b) OR (c AND d) → (a OR c) AND (a OR d) AND (b OR c) AND (b OR d) ``` This is useful because we can often move each AND operand independently around in the plan tree and still get the same result -- we'll see this in action later. Expressions are converted into conjunctive normal form via `Expression::into_cnf`, which is implemented using [De Morgan's laws](https://en.wikipedia.org/wiki/De_Morgan%27s_laws): https://github.com/erikgrinaker/toydb/blob/9419bcf6aededf0e20b4e7485e2a5fa3e975d79f/src/sql/types/expression.rs#L289-L351 We'll have a brief look at all of toyDB's optimizers, which are listed here in the order they're applied: https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/planner/optimizer.rs#L9-L18 Test scripts for the optimizers are in [`src/sql/testscripts/optimizers`](https://github.com/erikgrinaker/toydb/tree/9419bcf6aededf0e20b4e7485e2a5fa3e975d79f/src/sql/testscripts/optimizers), and show how query plans evolve as each optimizer is applied. ## Constant Folding The `ConstantFolding` optimizer performs [constant folding](https://en.wikipedia.org/wiki/Constant_folding). This pre-evaluates constant expressions in the plan during planning, instead of evaluating them for every row during execution. https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/planner/optimizer.rs#L27-L30 For example, consider the query `SELECT 1 + 2 * 3 - foo FROM bar`. There is no point in re-evaluating `1 + 2 * 3` for every row in `bar`, because the result is always the same, so we can just evaluate this once during planning, transforming the expression into `7 - foo`. 
Concretely, this plan: ``` Select └─ Projection: 1 + 2 * 3 - bar.foo └─ Scan: bar ``` Should be transformed into this plan: ``` Select └─ Projection: 7 - bar.foo └─ Scan: bar ``` To do this, `ConstantFolding` simply checks whether an `Expression` tree contains an `Expression::Column` node -- if it doesn't, then it must be a constant expression (since that's the only dynamic value in an expression), and we can evaluate it with a `None` input row and replace the original expression node with an `Expression::Constant` node. This is done recursively for each plan node, and recursively for each expression node (so it does this both for `SELECT`, `WHERE`, `ORDER BY`, and all other parts of the query). Notably, it does a post-order expression transform, so it starts at the expression leaf nodes and attempts to transform each expression node as it moves back up the tree -- this allows it to iteratively evaluate constant parts as far as possible for each branch. https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/planner/optimizer.rs#L32-L56 Additionally, `ConstantFolding` also short-circuits logical expressions. For example, the expression `foo AND FALSE` will always be `FALSE`, regardless of what `foo` is, so we can replace it with `FALSE`: https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/planner/optimizer.rs#L58-L84 As the code comment mentions though, this doesn't fold optimally: it doesn't attempt to rearrange expressions, which would require knowledge of precedence rules. For example, `(1 + foo) - 2` could be folded into `foo - 1` by first rearranging it as `foo + (1 - 2)`, but we don't do this currently. ## Filter Pushdown The `FilterPushdown` optimizer attempts to push filter predicates as far down into the plan as possible, to reduce the number of rows each node has to process.
https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/planner/optimizer.rs#L90-L95 Recall the `movies` query plan from the planning section: ``` Select └─ Order: movies.released desc └─ Projection: movies.title, movies.released, genres.name as genre └─ Filter: movies.released >= 2000 └─ NestedLoopJoin: inner on movies.genre_id = genres.id ├─ Scan: movies └─ Scan: genres ``` Even though we're filtering on `released >= 2000`, the `Scan` node still has to read all of them from disk and send them via Raft, and the `NestedLoopJoin` node still has to join all of them. It would be nice if we could push this filtering into the `NestedLoopJoin` and `Scan` nodes and avoid this extra work, and this is exactly what `FilterPushdown` does. The only plan nodes that have predicates that can be pushed down are `Filter` nodes and `NestedLoopJoin` nodes, so we recurse through the plan tree and look for these nodes, attempting to push down. https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/planner/optimizer.rs#L97-L110 When it encounters the `Filter` node, it will extract the predicate and attempt to push it down into its `source` node: https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/planner/optimizer.rs#L139-L153 If the source node is a `Filter`, `NestedLoopJoin`, or `Scan` node, then we can push the predicate down into it by `AND`ing it with the existing predicate (if any).
https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/planner/optimizer.rs#L112-L137 In our case, we were able to push the `Filter` into the `NestedLoopJoin`, and our plan now looks like this: ``` Select └─ Order: movies.released desc └─ Projection: movies.title, movies.released, genres.name as genre └─ NestedLoopJoin: inner on movies.genre_id = genres.id AND movies.released >= 2000 ├─ Scan: movies └─ Scan: genres ``` But we're still not done, as we'd like to push `movies.released >= 2000` down into the `Scan` node. Pushdown for join nodes is a little more tricky, because we can only push down parts of the expression that reference one of the source nodes. We first have to convert the expression into conjunctive normal form, i.e. an AND of ORs, as we've discussed previously. This allows us to examine and push down each AND part in isolation, because it has the same effect regardless of whether it is evaluated in the `NestedLoopJoin` node or one of the source nodes. Our expression is already in conjunctive normal form, though. We then look at each AND part, and check which side of the join it has column references for. If it only references one of the sides, then the expression can be pushed down into it. We also make some effort here to move primary/foreign key constants across to both sides, but we'll gloss over that.
https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/planner/optimizer.rs#L155-L247 This allows us to push down the `movies.released >= 2000` predicate into the corresponding `Scan` node, significantly reducing the amount of data transferred across Raft: ``` Select └─ Order: movies.released desc └─ Projection: movies.title, movies.released, genres.name as genre └─ NestedLoopJoin: inner on movies.genre_id = genres.id ├─ Scan: movies (released >= 2000) └─ Scan: genres ``` ## Index Lookups The `IndexLookup` optimizer uses primary key or secondary index lookups instead of full table scans where possible. https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/planner/optimizer.rs#L250-L252 The optimizer itself is fairly straightforward. It assumes that `FilterPushdown` has already pushed predicates down into `Scan` nodes, so it only needs to examine these. It converts the predicate into conjunctive normal form, and looks for any parts that are direct column lookups -- i.e. `column = value` (possibly a long OR chain of these). If it finds any, and the column is either a primary key or secondary index column, then we convert the `Scan` node into either a `KeyLookup` or `IndexLookup` node respectively. If there are any further AND predicates remaining, we add a parent `Filter` node to keep these predicates. 
For example, the following plan: ``` Select └─ Scan: movies ((id = 1 OR id = 7 OR id = 3) AND released >= 2000) ``` Will be transformed into one that does individual key lookups rather than a full table scan: ``` Select └─ Filter: movies.released >= 2000 └─ KeyLookup: movies (1, 3, 7) ``` The code is as outlined above: https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/planner/optimizer.rs#L254-L303 Helped by `Expression::is_column_lookup()` and `Expression::into_column_values()`: https://github.com/erikgrinaker/toydb/blob/9419bcf6aededf0e20b4e7485e2a5fa3e975d79f/src/sql/types/expression.rs#L363-L421 ## Hash Join The `HashJoin` optimizer will replace a `NestedLoopJoin` with a `HashJoin` where possible. https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/planner/optimizer.rs#L305-L307 A [nested loop join](https://en.wikipedia.org/wiki/Nested_loop_join) is a very inefficient O(n²) algorithm, which iterates over all rows in the right source for each row in the left source to see if they match. However, it is completely general, and can join on arbitrarily complex predicates. In the common case where the join predicate is an equality comparison such as `movies.genre_id = genres.id` (i.e. an [equijoin](https://en.wikipedia.org/wiki/Relational_algebra#θ-join_and_equijoin)), then we can instead use a [hash join](https://en.wikipedia.org/wiki/Hash_join). This scans the right table once, builds an in-memory hash table from it, and for each left row it looks up any right rows in the hash table. This is a much more efficient O(n) algorithm.
In our previous movie example, we are in fact doing an equijoin: ``` Select └─ Order: movies.released desc └─ Projection: movies.title, movies.released, genres.name as genre └─ NestedLoopJoin: inner on movies.genre_id = genres.id ├─ Scan: movies (released >= 2000) └─ Scan: genres ``` And so our `NestedLoopJoin` can be replaced by a `HashJoin`: ``` Select └─ Order: movies.released desc └─ Projection: movies.title, movies.released, genres.name as genre └─ HashJoin: inner on movies.genre_id = genres.id ├─ Scan: movies (released >= 2000) └─ Scan: genres ``` The `HashJoin` optimizer is extremely simple: if the join predicate is an equijoin, use a hash join. This isn't always a good idea (the right source can be huge and we can run out of memory for the hash table), but we keep it simple. https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/planner/optimizer.rs#L309-L348 Of course there are many other join algorithms out there, and one of the harder problems in SQL optimization is how to efficiently perform large N-way multijoins. We don't attempt to tackle these problems here -- the `HashJoin` optimizer is just a very simple example of such join optimization. ## Short Circuiting The `ShortCircuit` optimizer tries to find nodes that can't possibly do any useful work, and either removes them from the plan, or replaces them with trivial nodes that don't do anything. It is kind of similar to the `ConstantFolding` optimizer in spirit, but works on plan nodes rather than expression nodes. 
https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/planner/optimizer.rs#L350-L354 For example, `Filter` nodes with a `TRUE` predicate won't actually filter anything: ``` Select └─ Filter: true └─ Scan: movies ``` So we can just remove them: ``` Select └─ Scan: movies ``` Similarly, `Filter` nodes with a `FALSE` predicate will never emit anything: ``` Select └─ Filter: false └─ Scan: movies ``` There's no point doing a scan in this case, so we can just replace it with a `Nothing` node that does no work and doesn't emit anything: ``` Select └─ Nothing ``` The optimizer tries to find a bunch of such patterns. This can also tidy up query plans a fair bit by removing unnecessary cruft. https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/planner/optimizer.rs#L356-L438 ---

SQL Planning   |   SQL Execution

================================================ FILE: docs/architecture/sql-parser.md ================================================ # SQL Parsing We finally arrive at SQL. The SQL parser is the first stage in processing SQL queries and statements, located in the [`sql::parser`](https://github.com/erikgrinaker/toydb/tree/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/parser) module. The SQL parser's job is to take a raw SQL string and turn it into a structured form that's more convenient to work with. In doing so, it will validate that the string is in fact valid SQL _syntax_. However, it doesn't know if the SQL statement actually makes sense -- it has no idea which tables or columns exist, what their data types are, and so on. That's the job of the planner, which we'll look at later. For example, let's say the parser is given the following SQL query: ```sql SELECT name, price, price * 25 / 100 AS vat FROM products JOIN categories ON products.category_id = categories.id WHERE categories.code = 'BLURAY' AND stock > 0 ORDER BY price DESC LIMIT 10 ``` It will generate a structure that looks something like this (in simplified syntax): ```rust // A SELECT statement. Statement::Select { // SELECT name, price, price * 25 / 100 AS vat select: [ (Column("name"), None), (Column("price"), None), ( Divide( Multiply(Column("price"), Integer(25)), Integer(100) ), Some("vat"), ), ] // FROM products JOIN categories ON products.category_id = categories.id from: [ Join { left: Table("products"), right: Table("categories"), type: Inner, predicate: Some( Equal( Column("products.category_id"), Column("categories.id"), ) ) } ] // WHERE categories.code = 'BLURAY' AND stock > 0 where: Some( And( Equal( Column("categories.code"), String("BLURAY"), ), GreaterThan( Column("stock"), Integer(0), ) ) ) // ORDER BY price DESC order: [ (Column("price"), Descending), ] // LIMIT 10 limit: Some(Integer(10)) } ``` Let's have a look at how this happens.
## Lexer We begin with the `sql::parser::Lexer`, which takes the raw SQL string and performs [lexical analysis](https://en.wikipedia.org/wiki/Lexical_analysis) to convert it into a sequence of tokens. These tokens are things like number, string, identifier, SQL keyword, and so on. This preprocessing is useful to deal with some of the "noise" of SQL text, such as whitespace, string quotes, identifier normalization, and so on. It also specifies which symbols and keywords are valid in our SQL queries. This makes the parser's life a lot easier. The lexer doesn't care about SQL structure at all, only that the individual pieces (tokens) of a string are well-formed. For example, the following input string: ``` 'foo' ) 3.14 SELECT + x ``` Will result in these tokens: ``` String("foo") CloseParen Number("3.14") Keyword(Select) Plus Ident("x") ``` Tokens and keywords are represented by the `sql::parser::Token` and `sql::parser::Keyword` enums respectively: https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/parser/lexer.rs#L8-L47 https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/parser/lexer.rs#L86-L155 The lexer takes an input string and emits tokens as an iterator: https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/parser/lexer.rs#L311-L337 It does this by repeatedly attempting to scan the next token until it reaches the end of the string (or errors). It can determine the kind of token by looking at the first character: https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/parser/lexer.rs#L358-L373 And then scan across the following characters as appropriate to generate a valid token. For example, this is how a quoted string (e.g. 
`'foo'`) is lexed into a `Token::String` (including handling of any escaped quotes inside the string): https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/parser/lexer.rs#L435-L451 These tokens become the input to the parser. ## Abstract Syntax Tree The end result of the parsing process will be an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree) (AST), which is a structured representation of a SQL statement, located in the [`sql::parser::ast`](https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/parser/ast.rs) module. The root of this tree is the `sql::parser::ast::Statement` enum, which represents all the different kinds of SQL statements that we support, along with their contents: https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/parser/ast.rs#L6-L145 The nested tree structure is particularly apparent with expressions, which represent values and operations on them. For example, the expression `2 * 3 - 4 / 2`, which evaluates to the value `4`. We've seen in the data model section how such expressions are represented as `sql::types::Expression`, but before we get there we have to parse them. The parser has its own representation `sql::parser::ast::Expression` -- this is necessary e.g. because in the AST, we represent columns as names rather than numeric indexes (we don't know yet which columns exist or what their names are, we'll get to that during planning). 
https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/parser/ast.rs#L147-L170 https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/parser/ast.rs#L204-L234 For example, `2 * 3 - 4 / 2` is represented as: ```rust Expression::Operator(Operator::Subtract( // The left-hand operand of - Expression::Operator(Operator::Multiply( // The left-hand operand of * Expression::Literal(Literal::Integer(2)), // The right-hand operand of * Expression::Literal(Literal::Integer(3)), )), // The right-hand operand of - Expression::Operator(Operator::Divide( // The left-hand operand of / Expression::Literal(Literal::Integer(4)), // The right-hand operand of / Expression::Literal(Literal::Integer(2)), )), )) ``` ## Parser The parser, `sql::parser::Parser`, takes lexer tokens as input and builds an `ast::Statement` from them: https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/parser/parser.rs#L9-L32 We can determine the kind of statement we're parsing simply by looking at the first keyword: https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/parser/parser.rs#L109-L130 Let's see how a `SELECT` statement is parsed. The different clauses in a `SELECT` (e.g. `FROM`, `WHERE`, etc.) must always be given in a specific order, and they always begin with the appropriate keyword, so we can simply try to parse each clause in the expected order: https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/parser/parser.rs#L330-L342 Parsing each clause is also just a matter of parsing the expected parts in order. 
For example, the initial `SELECT` clause is just a comma-separated list of expressions with an optional alias: https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/parser/parser.rs#L344-L365 The `FROM` clause is a comma-separated list of table name, optionally joined with other tables: https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/parser/parser.rs#L367-L427 And the `WHERE` clause is just a predicate expression to filter by: https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/parser/parser.rs#L429-L435 Expression parsing is where this gets tricky, because we have to respect the rules of operator precedence and associativity. For example, according to mathematical order of operations (aka "PEMDAS") the expression `2 * 3 - 4 / 2` must be parsed as `(2 * 3) - (4 / 2)` which yields 4, not `2 * (3 - 4) / 2` which yields -1. toyDB does this using the [precedence climbing algorithm](https://en.wikipedia.org/wiki/Operator-precedence_parser#Precedence_climbing_method), which is a fairly simple and compact algorithm as far as these things go. In a nutshell, it will greedily and recursively group operators together as long as their precedence is the same or higher than that of the operators preceding them (hence "precedence climbing"). For example: ``` ----- ----- Precedence 2: * and / ------------- Precedence 1: - 2 * 3 - 4 / 2 ``` The algorithm is documented in more detail on `Parser::parse_expression()`: https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/parser/parser.rs#L501-L696 ---

SQL Raft Replication   |   SQL Planning

================================================ FILE: docs/architecture/sql-planner.md ================================================ # SQL Planning The SQL planner in the [`sql::planner`](https://github.com/erikgrinaker/toydb/tree/c64012e29c5712d6fe028d3d5375a98b8faea266/src/sql/planner) module takes a SQL statement AST from the parser and generates an execution plan for it. We won't actually execute it just yet though, only figure out how to execute it. ## Execution Plan A plan is represented by the `sql::planner::Plan` enum. The variant specifies the operation to execute (e.g. `SELECT`, `INSERT`, `UPDATE`, `DELETE`): https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/planner/plan.rs#L15-L73 Below the root, the plan is typically made up of a tree of nested `sql::planner::Node`. Each node emits a stream of SQL rows as output, and may take streams of input rows from child nodes. https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/planner/plan.rs#L106-L175 Here is an example, taken from the `Plan` code comment above: ```sql SELECT title, released, genres.name AS genre FROM movies INNER JOIN genres ON movies.genre_id = genres.id WHERE released >= 2000 ORDER BY released ``` Which results in this query plan: ``` Select └─ Order: movies.released desc └─ Projection: movies.title, movies.released, genres.name as genre └─ Filter: movies.released >= 2000 └─ NestedLoopJoin: inner on movies.genre_id = genres.id ├─ Scan: movies └─ Scan: genres ``` Rows flow from the tree leaves to the root: 1. `Scan` nodes read rows from the tables `movies` and `genres`. 2. `NestedLoopJoin` joins the rows from `movies` and `genres`. 3. `Filter` discards rows with release dates older than 2000. 4. `Projection` picks out the requested column values from the rows. 5. `Order` sorts the rows by release date. 6. `Select` returns the final rows to the client.
## Scope and Name Resolution One of the main jobs of the planner is to resolve column names to column indexes in the input rows of each node. In the query example above, the `WHERE released >= 2000` filter may refer to a column `released` from either the joined `movies` table or the `genres` table. The planner needs to figure out which table has a `released` column, and also figure out which column number in the `NestedLoopJoin` output rows corresponds to the `released` column (for example column number 2). This job is further complicated by the fact that many nodes can alias, reorder, or drop columns, and some nodes may also refer to columns that shouldn't be part of the result at all (for example, it's possible to `ORDER BY` a column that won't be output by a `SELECT` projection at all, but the `Order` node still needs access to the column data to sort by it). The planner uses a `sql::planner::Scope` to keep track of which column names are currently visible, and which column indexes they refer to. For each node the planner builds, starting from the leaves, it creates a new `Scope` that contains the currently visible columns, tracking how they are modified and rearranged by each node. https://github.com/erikgrinaker/toydb/blob/6f6cec4db10bc015a37ee47ff6c7dae383147dd5/src/sql/planner/planner.rs#L577-L610 When an AST expression refers to a column name, the planner can use `Scope::lookup_column()` to find out which column number the expression should take its input value from. https://github.com/erikgrinaker/toydb/blob/6f6cec4db10bc015a37ee47ff6c7dae383147dd5/src/sql/planner/planner.rs#L660-L686 ## Planner The planner itself is `sql::planner::Planner`. It uses a `sql::engine::Catalog` to look up information about tables and columns from storage.
https://github.com/erikgrinaker/toydb/blob/6f6cec4db10bc015a37ee47ff6c7dae383147dd5/src/sql/planner/planner.rs#L12-L20 To build an execution plan, the planner first looks at the `ast::Statement` kind to determine what kind of plan to build: https://github.com/erikgrinaker/toydb/blob/6f6cec4db10bc015a37ee47ff6c7dae383147dd5/src/sql/planner/planner.rs#L28-L47 Let's build this `SELECT` plan from above: ```sql SELECT title, released, genres.name AS genre FROM movies INNER JOIN genres ON movies.genre_id = genres.id WHERE released >= 2000 ORDER BY released ``` Which should result in this plan: ``` Select └─ Order: movies.released desc └─ Projection: movies.title, movies.released, genres.name as genre └─ Filter: movies.released >= 2000 └─ NestedLoopJoin: inner on movies.genre_id = genres.id ├─ Scan: movies └─ Scan: genres ``` The planner is given the following (simplified) AST from the parser as input: ```rust // A SELECT statement. Statement::Select { // SELECT title, released, genres.name AS genre select: [ (Column("title"), None), (Column("released"), None), (Column("genres.name"), "genre"), ] // FROM movies INNER JOIN genres ON movies.genre_id = genres.id from: [ Join { left: Table("movies"), right: Table("genres"), type: Inner, predicate: Some( Equal( Column("movies.genre_id"), Column("genres.id"), ) ) } ] // WHERE released >= 2000 where: Some( GreaterThanOrEqual( Column("released"), Integer(2000), ) ) // ORDER BY released order: [ (Column("released"), Ascending), ] } ``` The first thing `Planner::build_select` does is to create an empty scope (which will track column names and indexes) and build the `FROM` clause which will generate the initial input rows: https://github.com/erikgrinaker/toydb/blob/6f6cec4db10bc015a37ee47ff6c7dae383147dd5/src/sql/planner/planner.rs#L170-L179 https://github.com/erikgrinaker/toydb/blob/6f6cec4db10bc015a37ee47ff6c7dae383147dd5/src/sql/planner/planner.rs#L283-L289 `Planner::build_from()` first encounters the `ast::From::Join` item, 
which joins `movies` and `genres`. This will build a `Node::NestedLoopJoin` plan node for the join, which is the simplest and most straightforward join algorithm -- it simply iterates over all rows in the `genres` table for every row in the `movies` table and emits the joined rows (we'll see how to optimize it with a better join algorithm later). https://github.com/erikgrinaker/toydb/blob/6f6cec4db10bc015a37ee47ff6c7dae383147dd5/src/sql/planner/planner.rs#L319-L344 It first recurses into `Planner::build_from()` to build each of the `ast::From::Table` nodes for each table. This will look up the table schemas in the catalog, add them to the current scope, and build a `Node::Scan` node which will emit all rows from each table. The `Node::Scan` nodes are placed into the `Node::NestedLoopJoin` above. https://github.com/erikgrinaker/toydb/blob/6f6cec4db10bc015a37ee47ff6c7dae383147dd5/src/sql/planner/planner.rs#L312-L317 While building the `Node::NestedLoopJoin`, it also needs to convert the join expression `movies.genre_id = genres.id` into a proper `sql::types::Expression`. This is done by `Planner::build_expression()`: https://github.com/erikgrinaker/toydb/blob/6f6cec4db10bc015a37ee47ff6c7dae383147dd5/src/sql/planner/planner.rs#L493-L568 Expression building is mostly a direct translation from an `ast::Expression` variant to a corresponding `sql::types::Expression` variant (for example from `ast::Expression::Operator(ast::Operator::Equal)` to `sql::types::Expression::Equal`). However, as mentioned earlier, `ast::Expression` contains column references by name, while `sql::types::Expression` contains column references as row indexes. 
This name resolution is done here, by looking up the column names in the scope: https://github.com/erikgrinaker/toydb/blob/6f6cec4db10bc015a37ee47ff6c7dae383147dd5/src/sql/planner/planner.rs#L521-L523 The expression we're building is the join predicate of `Node::NestedLoopJoin`, so it operates on joined rows containing all columns of `movies` then all columns of `genres`. It also operates on all combinations of joined rows (the [Cartesian product](https://en.wikipedia.org/wiki/Cartesian_product)), and the purpose of the join predicate is to determine which joined rows to actually keep. For example, the full set of joined rows that are evaluated might be: | movies.id | movies.title | movies.released | movies.genre_id | genres.id | genres.name | |-----------|--------------|-----------------|-----------------|-----------|-------------| | 1 | Sicario | 2015 | 2 | 1 | Drama | | 2 | Sicario | 2015 | 2 | 2 | Action | | 3 | 21 Grams | 2003 | 1 | 1 | Drama | | 4 | 21 Grams | 2003 | 1 | 2 | Action | | 5 | Heat | 1995 | 2 | 1 | Drama | | 6 | Heat | 1995 | 2 | 2 | Action | The join predicate should pick out the rows where `movies.genre_id = genres.id`. The scope will reflect the column layout in the example above, and can resolve the column names to zero-based row indexes as `#3 = #4`, which will be the final built `Expression`. Now that we've built the `FROM` clause into a `Node::NestedLoopJoin` of two `Node::Scan` nodes, we move on to the `WHERE` clause. This simply builds the `WHERE` expression `released >= 2000`, like we've already seen with the join predicate, and creates a `Node::Filter` node which takes its input rows from the `Node::NestedLoopJoin` and filters them by the given expression. Again, the scope keeps track of which input columns we're getting from the join node and resolves the `released` column reference in the expression. 
https://github.com/erikgrinaker/toydb/blob/6f6cec4db10bc015a37ee47ff6c7dae383147dd5/src/sql/planner/planner.rs#L202-L206 We then build the `SELECT` clause, which emits the `title, released, genres.name AS genre` columns. This is just a list of expressions that are built in the current scope and placed into a `Node::Projection` (the expressions could be arbitrarily complex). However, we also have to make sure to update the scope with the final three columns that are output to subsequent nodes, taking into account the `genre` alias for the original `genres.name` column (we won't dwell on the "hidden columns" mentioned there -- they're not relevant for our query). https://github.com/erikgrinaker/toydb/blob/6f6cec4db10bc015a37ee47ff6c7dae383147dd5/src/sql/planner/planner.rs#L214-L234 Finally, we build the `ORDER BY` clause. Again, this just builds a trivial expression for `released` and places it into a `Node::Order` node which takes input rows from the `Node::Projection` and sorts them by the order expression. https://github.com/erikgrinaker/toydb/blob/6f6cec4db10bc015a37ee47ff6c7dae383147dd5/src/sql/planner/planner.rs#L245-L252 And that's it. The `Node::Order` is placed into the root `Plan::Select`, and we have our final plan. ``` Select └─ Order: movies.released asc └─ Projection: movies.title, movies.released, genres.name as genre └─ Filter: movies.released >= 2000 └─ NestedLoopJoin: inner on movies.genre_id = genres.id ├─ Scan: movies └─ Scan: genres ``` We'll see how to execute it soon, but first we should optimize it to see if we can make it run faster -- in particular, to see if we can avoid reading all movies from storage, and if we can do better than the very slow nested loop join. ---

SQL Parsing   |   SQL Optimization

================================================ FILE: docs/architecture/sql-raft.md ================================================ # SQL Raft Replication toyDB uses Raft to replicate SQL storage across a cluster of nodes (see the Raft section for details). All nodes will store a full copy of the SQL database, and the Raft leader will replicate writes across nodes and execute reads. Recall the Raft state machine interface `raft::State`: https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/state.rs#L4-L51 In toyDB, the state machine is just a `sql::engine::Local` storage engine with a thin wrapper: https://github.com/erikgrinaker/toydb/blob/c2b0f7f1d6cbf6e2cdc09fc0aec7b050e840ec21/src/sql/engine/raft.rs#L278-L291 Raft will submit read and write commands to this state machine as binary `Vec<u8>` data, so we have to represent the methods of `sql::engine::Engine` as binary Raft commands. We do this as two enums, `sql::engine::raft::Read` and `sql::engine::raft::Write`, which we'll Bincode-encode: https://github.com/erikgrinaker/toydb/blob/c2b0f7f1d6cbf6e2cdc09fc0aec7b050e840ec21/src/sql/engine/raft.rs#L16-L71 Notice that almost all requests include a `mvcc::TransactionState`. Most of the useful methods of `sql::engine::Engine` are on the `sql::engine::Transaction`, but unlike the `Local` engine, below Raft we can't hold on to a `Transaction` object in memory between each command -- nodes may restart and leadership may move, and we want client transactions to keep working despite this. Instead, we will use the client-supplied `mvcc::TransactionState` to reconstruct a `Transaction` for every command via `mvcc::Transaction::resume()` and call methods on it. When the state machine receives a write command, it decodes it as a `Write` and calls the appropriate `Local` method. The result is Bincode-encoded and returned to the caller, who knows what return type to expect for a given command.
The state machine also keeps track of the Raft applied index of each command as a separate key in the key/value store. https://github.com/erikgrinaker/toydb/blob/c2b0f7f1d6cbf6e2cdc09fc0aec7b050e840ec21/src/sql/engine/raft.rs#L346-L367 https://github.com/erikgrinaker/toydb/blob/c2b0f7f1d6cbf6e2cdc09fc0aec7b050e840ec21/src/sql/engine/raft.rs#L306-L338 Similarly, read commands are decoded as a `Read` and the appropriate `Local` method is called: https://github.com/erikgrinaker/toydb/blob/c2b0f7f1d6cbf6e2cdc09fc0aec7b050e840ec21/src/sql/engine/raft.rs#L369-L404 That's the state machine running below Raft. But how do we actually send these commands to Raft and receive results? That's handled by the `sql::engine::Raft` implementation, which uses a channel to send requests to the local Raft node (we'll see how this plumbing works in the server section): https://github.com/erikgrinaker/toydb/blob/c2b0f7f1d6cbf6e2cdc09fc0aec7b050e840ec21/src/sql/engine/raft.rs#L80-L95 The channel takes a `raft::Request` containing binary Raft client requests and a return channel where the Raft node can send back a `raft::Response`. The Raft engine has a few convenience methods to send requests and receive responses, for both read and write requests: https://github.com/erikgrinaker/toydb/blob/c2b0f7f1d6cbf6e2cdc09fc0aec7b050e840ec21/src/sql/engine/raft.rs#L114-L135 And the implementation of the `sql::engine::Engine` and `sql::engine::Transaction` traits simply send these requests via Raft: https://github.com/erikgrinaker/toydb/blob/c2b0f7f1d6cbf6e2cdc09fc0aec7b050e840ec21/src/sql/engine/raft.rs#L194-L276 One thing to note here is that we don't support streaming data via Raft, so e.g. the `Transaction::scan` method will buffer the entire result in a `Vec`. With a full table scan, this will load the entire table into memory -- that's unfortunate, but we keep it simple. To summarize, this is what happens when `Transaction::insert()` is called to insert a row via Raft: 1. 
`sql::engine::raft::Transaction::insert()`: called to insert a row. 2. `sql::engine::raft::Write::Insert`: enum representation of the insert command. 3. `raft::Request::Write`: raft request containing the Bincode-encoded `Write::Insert` command. 4. `sql::engine::raft::Engine::tx`: sends the `Request::Write` and response channel to Raft. 5. `raft::Node::step()`: the `Request::Write` is given to Raft in a `Message::ClientRequest`. 6. Raft does its replication thing, and commits the command's log entry. 7. `raft::State::apply()`: the Bincode-encoded `Write::Insert` is passed to the state machine. 8. `sql::engine::raft::State::apply()`: decodes the command to a `Write::Insert`. 9. `sql::engine::raft::State::local`: contains the `Local` engine on each node. 10. `sql::engine::local::Engine::resume()`: called to obtain the SQL/MVCC transaction. 11. `sql::engine::local::Transaction::insert()`: the row is inserted to the local engine. 12. `raft::RawNode::tx`: the `Ok(())` result is sent as a Bincode-encoded `Message::ClientResponse`. 13. `sql::engine::raft::Transaction::insert()`: receives the result and returns it to the caller. The plumbing here will be covered in more detail in the server section. ---

SQL Storage   |   SQL Parsing

================================================ FILE: docs/architecture/sql-storage.md ================================================ # SQL Storage The SQL storage engine, in the [`sql::engine`](https://github.com/erikgrinaker/toydb/tree/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/engine) module, stores tables and rows. toyDB has two SQL storage implementations: * `sql::engine::Local`: local storage using a `storage::Engine` key/value store. * `sql::engine::Raft`: Raft-replicated storage, using `Local` on each node below Raft. These implement the `sql::engine::Engine` trait, which specifies the SQL storage API. SQL execution can use either simple local storage or Raft-replicated storage -- toyDB itself always uses the Raft-replicated engine, but many tests use a local in-memory engine. The `sql::engine::Engine` trait is fully transactional, based on the `storage::MVCC` transaction engine discussed previously. As such, the trait just has a few methods that begin transactions -- the storage logic itself is implemented in the transaction, which we'll cover next. The trait also has a `session()` method to start SQL sessions for query execution, which we'll revisit in the execution section. https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/sql/engine/engine.rs#L9-L29 Here, we'll only look at the `Local` engine, and we'll discuss Raft replication afterwards. `Local` itself is just a thin wrapper around a `storage::MVCC` to create transactions: https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/engine/local.rs#L50-L97 ## Key/Value Representation `Local` uses a `storage::Engine` key/value store to store SQL table schemas, table rows, and secondary index entries. But how do we represent these as keys and values?
The keys are represented by the `sql::engine::Key` enum, and encoded using the Keycode encoding that we've discussed in the encoding section: https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/engine/local.rs#L15-L31 The values are encoded using the Bincode encoding, where the value type is given by the key: * `Key::Table` → `sql::types::Table` (table schemas) * `Key::Index` → `BTreeSet` (indexed primary keys) * `Key::Row` → `sql::types::Row` (table rows) Recall that the Keycode encoding will store keys in sorted order. This means that all `Key::Table` entries come first, then all `Key::Index`, then all `Key::Row`. These are further grouped and sorted by their fields. For example, consider these SQL tables containing movies and genres, with a secondary index on `movies.genre_id` for fast lookups of movies with a given genre: ```sql CREATE TABLE genres ( id INTEGER PRIMARY KEY, name STRING NOT NULL ); CREATE TABLE movies ( id INTEGER PRIMARY KEY, title STRING NOT NULL, released INTEGER NOT NULL, genre_id INTEGER NOT NULL INDEX REFERENCES genres ); INSERT INTO genres VALUES (1, 'Drama'), (2, 'Action'); INSERT INTO movies VALUES (1, 'Sicario', 2015, 2), (2, '21 Grams', 2003, 1), (3, 'Heat', 1995, 2); ``` This would result in the following illustrated keys and values, in the given order: ``` /Table/genres → Table { name: "genres", primary_key: 0, columns: ... } /Table/movies → Table { name: "movies", primary_key: 0, columns: ... 
} /Index/movies/genre_id/Integer(1) → BTreeSet { Integer(2) } /Index/movies/genre_id/Integer(2) → BTreeSet { Integer(1), Integer(3) } /Row/genres/Integer(1) → Row { Integer(1), String("Drama") } /Row/genres/Integer(2) → Row { Integer(2), String("Action") } /Row/movies/Integer(1) → Row { Integer(1), String("Sicario"), Integer(2015), Integer(2) } /Row/movies/Integer(2) → Row { Integer(2), String("21 Grams"), Integer(2003), Integer(1) } /Row/movies/Integer(3) → Row { Integer(3), String("Heat"), Integer(1995), Integer(2) } ``` Thus, if we want to do a full table scan of the `movies` table, we just do a prefix scan of `/Row/movies/`. If we want to do a secondary index lookup of all movies with `genre_id = 2`, we fetch `/Index/movies/genre_id/Integer(2)` and find that movies with `id = {1,3}` have this genre. To help with prefix scans, the valid key prefixes are represented as `sql::engine::KeyPrefix`: https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/engine/local.rs#L35-L48 For a look at the actual on-disk binary storage format, see the test scripts under [`src/sql/testscripts/writes`](https://github.com/erikgrinaker/toydb/tree/c2b0f7f1d6cbf6e2cdc09fc0aec7b050e840ec21/src/sql/testscripts/writes), which output the logical and raw binary representation of write operations. ## Schema Catalog The `sql::engine::Catalog` trait is used to store table schemas, i.e. `sql::types::Table`. It has a handful of methods for creating, dropping and fetching tables (recall that toyDB does not support schema changes). The `Table::name` field is used as a unique table identifier throughout. https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/sql/engine/engine.rs#L60-L79 The `Catalog` trait is also fully transactional, as it must be implemented on a transaction via the `type Transaction: Transaction + Catalog` trait bound on `sql::engine::Engine`.
Creating a table is straightforward: insert a key/value pair with a Keycode-encoded `Key::Table` for the key and a Bincode-encoded `sql::types::Table` for the value. We first check that the table doesn't already exist, and validate the table schema using `Table::validate()`. https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/engine/local.rs#L340-L347 Similarly, fetching and listing tables is straightforward: just key/value gets or scans using the appropriate keys. https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/engine/local.rs#L390-L399 Dropping tables is a bit more involved, since we have to perform some validation and also delete the actual table rows and any secondary index entries, but it's not terribly complicated: https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/engine/local.rs#L349-L388 ## Row Storage and Transactions The workhorse of the SQL storage engine is the `Transaction` trait, which provides [CRUD](https://en.wikipedia.org/wiki/Create,_read,_update_and_delete) operations (create, read, update, delete) on table rows and secondary index entries. For performance (especially with Raft), it operates on row batches rather than individual rows. https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/sql/engine/engine.rs#L31-L58 The `Local::Transaction` implementation is just a wrapper around an MVCC transaction, and the commit/rollback methods just call straight through to it: https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/engine/local.rs#L99-L102 https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/engine/local.rs#L182-L192 To insert new rows into a table, we first have to perform some validation: check that the table exists and validate the rows against the table schema (including checking for e.g. 
primary key conflicts and foreign key references). We then store the rows as key/value pairs, using a `Key::Row` with the table name and primary key value. And finally, we update secondary index entries (if any). https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/engine/local.rs#L252-L268 Row updates are similar to inserts, but in the case of a primary key change we instead delete the old row and insert a new one, for simplicity. Secondary index updates also have to update both the old and new entries. https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/engine/local.rs#L296-L337 Row deletions are also similar: validate that the deletion is safe (e.g. check that there are no foreign key references to it), then delete the `Key::Row` keys and any secondary index entries: https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/engine/local.rs#L194-L246 To fetch rows by primary key, we simply call through to key/value gets using the appropriate `Key::Row`: https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/engine/local.rs#L248-L250 https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/engine/local.rs#L127-L133 Similarly, index lookups fetch a `Key::Index` for the indexed value, returning matching primary keys: https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/engine/local.rs#L270-L273 https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/engine/local.rs#L115-L125 Scanning table rows just performs a prefix scan with the appropriate `KeyPrefix::Row`, returning a row iterator. This can optionally also do row filtering via filter pushdowns, which we'll revisit when we look at the SQL optimizer.
https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/engine/local.rs#L275-L294 And with that, we can now store and retrieve SQL tables and rows on disk. Let's see how to replicate it across nodes via Raft. ---

SQL Data Model   |   SQL Raft Replication

================================================ FILE: docs/architecture/sql.md ================================================ # SQL Engine The SQL engine provides support for the SQL query language, and is the main database interface. It uses a key/value store for data storage, MVCC for transactions, and Raft for replication. The SQL engine itself consists of several distinct components that form a pipeline: > Client → Session → Lexer → Parser → Planner → Optimizer → Executor → Storage The SQL engine is located in the [`sql`](https://github.com/erikgrinaker/toydb/tree/b2fe7b76ee634ca6ad31616becabfddb1c03d34b/src/sql) module. We'll discuss each of the components in a bottom-up manner. The SQL engine is tested as a whole by test scripts under [`src/sql/testscripts`](https://github.com/erikgrinaker/toydb/tree/9419bcf6aededf0e20b4e7485e2a5fa3e975d79f/src/sql/testscripts). These typically take a raw SQL string as input, execute them against an in-memory storage engine, and output the result along with intermediate state such as the query plan, storage operations, and binary key/value data. ---

Raft Consensus   |   SQL Data Model

================================================ FILE: docs/architecture/storage.md ================================================ # Storage Engine toyDB uses an embedded [key/value store](https://en.wikipedia.org/wiki/Key–value_database) for data storage, located in the [`storage`](https://github.com/erikgrinaker/toydb/tree/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/storage) module. This stores arbitrary keys and values as binary byte strings. The storage engine doesn't know or care what the keys and values contain -- we'll see later how the SQL data model, with tables and rows, is mapped onto this key/value structure. The storage engine supports simple set/get/delete operations on individual keys. It does not itself support transactions -- this is built on top, and we'll get back to it shortly. Keys are stored in sorted order. This allows range scans, where we can iterate over all key/value pairs between two specific keys, or with a specific key prefix. This will be needed by other components in the system, e.g. to scan all rows in a specific SQL table, to scan all versions of an MVCC key, to scan the tail of the Raft log, etc. The storage engine is pluggable: there are multiple implementations, and the user can choose which one to use in the config file. These implement the `storage::Engine` trait: https://github.com/erikgrinaker/toydb/blob/4804df254034c51f367d1380d389d80695cd7054/src/storage/engine.rs#L8-L58 Let's look at the existing storage engine implementations. ## `Memory` Storage Engine The simplest storage engine is the `storage::Memory` engine. This is a trivial implementation which stores data in memory using the Rust standard library's [`BTreeMap`](https://doc.rust-lang.org/std/collections/struct.BTreeMap.html), without persisting it to disk. It is primarily used for testing. 
Since this is just a wrapper around the `BTreeMap` we can include it in its entirety here: https://github.com/erikgrinaker/toydb/blob/8f8eae0dcf70b1a0df2e853b1f6600e0c7075340/src/storage/memory.rs#L8-L77 ## `BitCask` Storage Engine The main storage engine is `storage::BitCask`. This is a very simple variant of [BitCask](https://riak.com/assets/bitcask-intro.pdf), used in the [Riak](https://riak.com/) database. It is kind of like the [LSM-tree](https://en.wikipedia.org/wiki/Log-structured_merge-tree)'s baby cousin. https://github.com/erikgrinaker/toydb/blob/3e467512dca55843f0b071b3e239f14724f59a41/src/storage/bitcask.rs#L15-L55 toyDB's BitCask implementation uses a single append-only log file for storage. To write a key/value pair, we simply append it to the file. To delete a key, we append a special tombstone value. When reading a key, the last entry for that key in the file is used. The file format for a key/value pair is simply: 1. The key length, as a big-endian `u32` (4 bytes). 2. The value length, as a big-endian `i32` (4 bytes). -1 if tombstone. 3. The binary key (n bytes). 4. The binary value (n bytes). For example, the key/value pair `foo=bar` would be written as follows (in hexadecimal): ``` keylen valuelen key value 00000003 00000003 666f6f 626172 ``` Because the data file is a simple log, we don't need a separate [write-ahead log](https://en.wikipedia.org/wiki/Write-ahead_logging) for crash recovery -- the data file _is_ the write-ahead log. To quickly look up key/value pairs when reading, we maintain an in-memory `KeyDir` index which maps a key to the latest value's position in the file. All keys must therefore fit in memory. 
https://github.com/erikgrinaker/toydb/blob/3e467512dca55843f0b071b3e239f14724f59a41/src/storage/bitcask.rs#L57-L65 We initially generate this index by scanning through the entire file when it is opened: https://github.com/erikgrinaker/toydb/blob/3e467512dca55843f0b071b3e239f14724f59a41/src/storage/bitcask.rs#L267-L332 To write a key, we append it to the file and update the `KeyDir`: https://github.com/erikgrinaker/toydb/blob/3e467512dca55843f0b071b3e239f14724f59a41/src/storage/bitcask.rs#L155-L159 https://github.com/erikgrinaker/toydb/blob/3e467512dca55843f0b071b3e239f14724f59a41/src/storage/bitcask.rs#L342-L366 To delete a key, we append a tombstone value instead: https://github.com/erikgrinaker/toydb/blob/3e467512dca55843f0b071b3e239f14724f59a41/src/storage/bitcask.rs#L122-L126 To read a value for a key, we look up the key's file location in the `KeyDir` index (if the key exists), and then read it from the file: https://github.com/erikgrinaker/toydb/blob/3e467512dca55843f0b071b3e239f14724f59a41/src/storage/bitcask.rs#L334-L340 The `KeyDir` uses an inner stdlib `BTreeMap` to keep track of keys. This allows range scans, where we iterate over a sorted set of keys between the range bounds, loading each key from the file: https://github.com/erikgrinaker/toydb/blob/3e467512dca55843f0b071b3e239f14724f59a41/src/storage/bitcask.rs#L144-L146 https://github.com/erikgrinaker/toydb/blob/3e467512dca55843f0b071b3e239f14724f59a41/src/storage/bitcask.rs#L207-L225 As keys are updated and deleted, we'll keep accumulating old versions in the log file. To remove these, the log file is compacted on startup. This writes out the latest value of every live key/value pair to a new file, and replaces the old file. The keys are written in sorted order, to make later scans faster. https://github.com/erikgrinaker/toydb/blob/3e467512dca55843f0b071b3e239f14724f59a41/src/storage/bitcask.rs#L172-L195 ---

Overview   |   Key/Value Encoding

================================================ FILE: docs/architecture.md ================================================ Moved to [`architecture/index.md`](architecture/index.md). ================================================ FILE: docs/crate/Cargo.toml ================================================ [package] name = "toydb" version = "1.0.1" description = "A simple distributed SQL database, built for education" authors = ["Erik Grinaker "] license = "Apache-2.0" homepage = "https://github.com/erikgrinaker/toydb" repository = "https://github.com/erikgrinaker/toydb" edition = "2024" ================================================ FILE: docs/crate/README.md ================================================ # toyDB toyDB is a distributed SQL database in Rust, built from scratch as an educational project. Main features: * Raft distributed consensus for linearizable state machine replication. * ACID transactions with MVCC-based snapshot isolation. * Pluggable storage engine with BitCask and in-memory backends. * Iterator-based query engine with heuristic optimization and time-travel support. * SQL interface including joins, aggregates, and transactions. toyDB is not distributed as a crate, see <https://github.com/erikgrinaker/toydb> for more. This crate used to contain the [joydb](https://crates.io/crates/joydb) database. Thanks to Serhii Potapov for donating the crate name. ================================================ FILE: docs/crate/src/lib.rs ================================================ //! This crate is just a simple README.md placeholder. toydb is not intended to be used as a //! library, and is not distributed as a crate. See <https://github.com/erikgrinaker/toydb>. ================================================ FILE: docs/examples.md ================================================ # SQL Examples The following examples demonstrate some of toyDB's SQL features. For more details, see the [SQL reference](sql.md).
- [Setup](#setup) - [Creating Tables and Data](#creating-tables-and-data) - [Constraints and Referential Integrity](#constraints-and-referential-integrity) - [Basic SQL Queries](#basic-sql-queries) - [Expressions](#expressions) - [Joins](#joins) - [Explain](#explain) - [Aggregates](#aggregates) - [Transactions](#transactions) - [Time-Travel Queries](#time-travel-queries) ## Setup To start a five-node cluster on the local machine (requires a working [Rust compiler](https://www.rust-lang.org/tools/install)), run: ``` $ ./cluster/run.sh toydb2 19:06:28 [ INFO] Listening on 0.0.0.0:9602 (SQL) and 0.0.0.0:9702 (Raft) toydb2 19:06:28 [ERROR] Failed connecting to Raft peer 127.0.0.1:9705: Connection refused toydb5 19:06:28 [ INFO] Listening on 0.0.0.0:9605 (SQL) and 0.0.0.0:9705 (Raft) [...] toydb5 19:06:29 [ INFO] Voting for toydb-d in term 1 election toydb3 19:06:29 [ INFO] Voting for toydb-d in term 1 election toydb4 19:06:29 [ INFO] Won election for term 1, becoming leader ``` In a separate terminal, start a `toysql` client and check the server status: ``` $ cargo run --release --bin toysql Connected to toyDB node "toydb-a". Enter !help for instructions. toydb> !status Server: 5 (leader 4 in term 1 with 5 nodes) Raft log: 1 committed, 0 applied, 0.000 MB (hybrid storage) Node logs: 1:1 2:1 3:1 4:1 5:1 SQL txns: 0 active, 0 total (bitcask storage) ``` The cluster is shut down by pressing Ctrl-C. Data is saved under `cluster/toydb?/data/`; delete the contents to start over. ## Creating Tables and Data As a basis for later examples, we'll create a small movie database.
The following SQL statements can be pasted into `toysql`: ```sql CREATE TABLE genres ( id INTEGER PRIMARY KEY, name STRING NOT NULL ); INSERT INTO genres VALUES (1, 'Science Fiction'), (2, 'Action'), (3, 'Drama'), (4, 'Comedy'); CREATE TABLE studios ( id INTEGER PRIMARY KEY, name STRING NOT NULL ); INSERT INTO studios VALUES (1, 'Mosfilm'), (2, 'Lionsgate'), (3, 'StudioCanal'), (4, 'Warner Bros'), (5, 'Focus Features'); CREATE TABLE movies ( id INTEGER PRIMARY KEY, title STRING NOT NULL, studio_id INTEGER NOT NULL INDEX REFERENCES studios, genre_id INTEGER NOT NULL INDEX REFERENCES genres, released INTEGER NOT NULL, rating FLOAT ); INSERT INTO movies VALUES (1, 'Stalker', 1, 1, 1979, 8.2), (2, 'Sicario', 2, 2, 2015, 7.6), (3, 'Primer', 3, 1, 2004, 6.9), (4, 'Heat', 4, 2, 1995, 8.2), (5, 'The Fountain', 4, 1, 2006, 7.2), (6, 'Solaris', 1, 1, 1972, 8.1), (7, 'Gravity', 4, 1, 2013, 7.7), (8, '21 Grams', 5, 3, 2003, 7.7), (9, 'Birdman', 4, 4, 2014, 7.7), (10, 'Inception', 4, 1, 2010, 8.8), (11, 'Lost in Translation', 5, 4, 2003, 7.7), (12, 'Eternal Sunshine of the Spotless Mind', 5, 3, 2004, 8.3); ``` toyDB supports some basic datatypes, as well as primary keys, foreign keys, and column indexes. For more information on these, see the [SQL reference](sql.md). Schema changes such as `ALTER TABLE` are not supported, only `CREATE TABLE` and `DROP TABLE`. 
The tables can be inspected via the `!tables` and `!table` commands: ```sql toydb> !tables genres movies studios toydb> !table genres CREATE TABLE genres ( id INTEGER PRIMARY KEY, name STRING NOT NULL ) ``` ## Constraints and Referential Integrity Schemas enforce referential integrity and other constraints: ```sql toydb> DROP TABLE studios; Error: Table studios is referenced by table movies column studio_id toydb> DELETE FROM studios WHERE id = 1; Error: Primary key 1 is referenced by table movies column studio_id toydb> UPDATE movies SET id = 1; Error: Primary key 1 already exists for table movies toydb> INSERT INTO movies VALUES (13, 'Nebraska', 6, 3, 2013, 7.7); Error: Referenced primary key 6 in table studios does not exist toydb> INSERT INTO movies VALUES (13, 'Nebraska', NULL, 3, 2013, 7.7); Error: NULL value not allowed for column studio_id toydb> INSERT INTO movies VALUES (13, 'Nebraska', 'Unknown', 3, 2013, 7.7); Error: Invalid datatype STRING for INTEGER column studio_id ``` ## Basic SQL Queries Most basic SQL query functionality is supported: ```sql toydb> SELECT * FROM studios; 1|Mosfilm 2|Lionsgate 3|StudioCanal 4|Warner Bros 5|Focus Features toydb> SELECT title, rating FROM movies WHERE released >= 2000 ORDER BY rating DESC LIMIT 3; Inception|8.8 Eternal Sunshine of the Spotless Mind|8.3 Gravity|7.7 ``` Column headers can be enabled with `!headers on`: ```sql toydb> !headers on Headers enabled toydb> SELECT id, name AS genre FROM genres; id|genre 1|Science Fiction 2|Action 3|Drama 4|Comedy ``` ## Expressions All common mathematical operators are implemented: ```sql toydb> SELECT 1 + 2 * 3; 7 toydb> SELECT (1 + 2) * 4 / -3; -4 SELECT 3! 
+ 7 % 4 - 2 ^ 3; 1 ``` 64-bit floating point arithmetic is also supported, including infinity and NaN: ```sql toydb> SELECT 3.14 * 2.718; 8.53452 toydb> SELECT 1.0 / 0.0; inf toydb> SELECT 1e10 ^ 8; 100000000000000000000000000000000000000000000000000000000000000000000000000000000 toydb> SELECT 1e10 ^ 8 / INFINITY, 1e10 ^ 1e10, INFINITY / INFINITY; 0|inf|NaN ``` And of course three-valued logic: ```sql toydb> SELECT TRUE AND TRUE, TRUE AND FALSE, TRUE AND NULL, FALSE AND NULL; TRUE|FALSE|NULL|FALSE toydb> SELECT TRUE OR FALSE, FALSE OR FALSE, TRUE OR NULL, FALSE OR NULL; TRUE|FALSE|TRUE|NULL toydb> SELECT NOT TRUE, NOT FALSE, NOT NULL; FALSE|TRUE|NULL ``` Which would be useless without comparison operators for all types: ```sql toydb> SELECT 3 > 1, 3 <= 1, 3 = 3.0; TRUE|FALSE|TRUE toydb> SELECT 'a' = 'A', 'foo' > 'bar', '👍' != '👎'; FALSE|TRUE|TRUE toydb> SELECT INFINITY > -INFINITY, NULL = NULL; TRUE|NULL ``` ## Joins No SQL database would be complete without joins, and toyDB supports most join types such as inner joins (both implicit and explicit): ```sql toydb> SELECT m.id, m.title, g.name FROM movies m JOIN genres g ON m.genre_id = g.id LIMIT 4; 1|Stalker|Science Fiction 2|Sicario|Action 3|Primer|Science Fiction 4|Heat|Action toydb> SELECT m.id, m.title, g.name FROM movies m, genres g WHERE m.genre_id = g.id LIMIT 4; 1|Stalker|Science Fiction 2|Sicario|Action 3|Primer|Science Fiction 4|Heat|Action ``` Left and right outer joins: ```sql toydb> SELECT s.id, s.name, g.name FROM studios s LEFT JOIN genres g ON s.id = g.id; 1|Mosfilm|Science Fiction 2|Lionsgate|Action 3|StudioCanal|Drama 4|Warner Bros|Comedy 5|Focus Features|NULL toydb> SELECT g.id, g.name, s.name FROM genres g RIGHT JOIN studios s ON g.id = s.id; 1|Science Fiction|Mosfilm 2|Action|Lionsgate 3|Drama|StudioCanal 4|Comedy|Warner Bros NULL|NULL|Focus Features ``` And cross joins (both implicit and explicit): ```sql toydb> SELECT g.name, s.name FROM genres g, studios s WHERE s.name < 'S'; Science 
Fiction|Mosfilm Science Fiction|Lionsgate Science Fiction|Focus Features Action|Mosfilm Action|Lionsgate Action|Focus Features Drama|Mosfilm Drama|Lionsgate Drama|Focus Features Comedy|Mosfilm Comedy|Lionsgate Comedy|Focus Features ``` We can join on arbitrary predicates, such as joining movies with any genres whose name is ordered after the movie's title: ```sql toydb> SELECT m.title, g.name FROM movies m JOIN genres g ON g.name > m.title ORDER BY m.title, g.name; 21 Grams|Action 21 Grams|Comedy 21 Grams|Drama 21 Grams|Science Fiction Birdman|Comedy Birdman|Drama Birdman|Science Fiction Eternal Sunshine of the Spotless Mind|Science Fiction Gravity|Science Fiction Heat|Science Fiction Inception|Science Fiction Lost in Translation|Science Fiction Primer|Science Fiction ``` And we can join multiple tables, even using the same table multiple times - like in this example where we find all science fiction movies released since 2000 by studios that have released any movie rated 8 or higher: ```sql toydb> SELECT m.id, m.title, g.name AS genre, m.released, s.name AS studio FROM movies m JOIN genres g ON m.genre_id = g.id, studios s JOIN movies good ON good.studio_id = s.id AND good.rating >= 8 WHERE m.studio_id = s.id AND m.released >= 2000 AND g.id = 1 ORDER BY m.title ASC; 7|Gravity|Science Fiction|2013|Warner Bros 10|Inception|Science Fiction|2010|Warner Bros 5|The Fountain|Science Fiction|2006|Warner Bros ``` ## Explain When optimizing complex queries with several joins, it can often be useful to inspect the query plan via an `EXPLAIN` query: ```sql toydb> EXPLAIN SELECT m.id, m.title, g.name AS genre, m.released, s.name AS studio FROM movies m JOIN genres g ON m.genre_id = g.id, studios s JOIN movies good ON good.studio_id = s.id AND good.rating >= 8 WHERE m.studio_id = s.id AND m.released >= 2000 AND g.id = 1 ORDER BY m.title ASC; Order: m.title asc └─ Projection: m.id, m.title, g.name, m.released, s.name └─ HashJoin: inner on m.studio_id = s.id ├─ HashJoin: inner on 
m.genre_id = g.id │ ├─ Filter: m.released > 2000 OR m.released = 2000 │ │ └─ IndexLookup: movies as m column genre_id (1) │ └─ KeyLookup: genres as g (1) └─ HashJoin: inner on s.id = good.studio_id ├─ Scan: studios as s └─ Scan: movies as good (good.rating > 8 OR good.rating = 8) ``` Here, we can see that the planner does a primary key lookup on `genres` and an index lookup on `movies.genre_id`, filtering the resulting movies by release year and joining them. It also does full table scans of `studios` and `movies` (to find the good movies) and joins them, pushing the `rating >= 8` filter down to the `movies` table scan. The results of these two joins are also joined to produce the final result, which is then formatted and sorted. ## Aggregates Most basic aggregate functions are supported: ```sql toydb> SELECT COUNT(*), MIN(rating), MAX(rating), AVG(rating), SUM(rating) FROM movies; 12|6.9|8.8|7.841666666666668|94.10000000000001 ``` We can group by values and filter the aggregate results: ```sql toydb> SELECT s.id, s.name, AVG(m.rating) AS average FROM movies m JOIN studios s ON m.studio_id = s.id GROUP BY s.id, s.name HAVING average > 7.8 ORDER BY average DESC, s.name ASC; 1|Mosfilm|8.149999999999999 4|Warner Bros|7.919999999999999 5|Focus Features|7.900000000000001 ``` And we can combine aggregate functions with arbitrary expressions, both inside and outside: ```sql toydb> SELECT s.id, s.name, ((MAX(rating^2) - MIN(rating^2)) / AVG(rating^2)) ^ (0.5) AS spread FROM movies m JOIN studios s ON m.studio_id = s.id GROUP BY s.id, s.name HAVING MAX(rating) - MIN(rating) > 0.5 ORDER BY spread DESC; 4|Warner Bros|0.6373540990222496 5|Focus Features|0.39194971607693424 ``` ## Transactions toyDB supports ACID transactions via MVCC-based snapshot isolation. This provides atomic transactions with good isolation, without taking out locks or blocking reads on writes.
As a basic example, the below transaction is rolled back without taking effect, as opposed to `COMMIT` which would make it permanent: ```sql toydb> BEGIN; Began transaction 131 toydb:131> INSERT INTO genres VALUES (5, 'Western'); toydb:131> SELECT * FROM genres; 1|Science Fiction 2|Action 3|Drama 4|Comedy 5|Western toydb:131> ROLLBACK; Rolled back transaction 131 toydb> SELECT * FROM genres; 1|Science Fiction 2|Action 3|Drama 4|Comedy ``` We'll demonstrate transactions by covering most common transaction anomalies given two concurrent sessions, and show how toyDB prevents these anomalies in all cases but one. In these examples, the left half is user A and the right is user B. Time flows downwards such that commands on the same line happen at the same time. **Dirty write:** an uncommitted write by A should not be affected by a concurrent B write. ```sql a> BEGIN; a> INSERT INTO genres VALUES (5, 'Western'); b> INSERT INTO genres VALUES (5, 'Romance'); Error: Serialization failure, retry transaction a> SELECT * FROM genres WHERE id = 5; 5|Western ``` The serialization failure here occurs because the first write always wins. This may not be an optimal strategy, but it is correct in terms of preventing serialization anomalies. **Dirty read:** an uncommitted write by A should not be visible to B until committed. ```sql a> BEGIN; a> INSERT INTO genres VALUES (5, 'Western'); b> SELECT * FROM genres WHERE id = 5; No rows returned a> COMMIT; b> SELECT * FROM genres WHERE id = 5; 5|Western ``` **Lost update:** when A and B both read a value, before updating it in turn, the first write should not be overwritten by the second. 
```sql a> BEGIN; b> BEGIN; a> SELECT title, rating FROM movies WHERE id = 2; b> SELECT title, rating FROM movies WHERE id = 2; Sicario|7.6 Sicario|7.6 a> UPDATE movies SET rating = 7.8 WHERE id = 2; b> UPDATE movies SET rating = 7.7 WHERE id = 2; Error: Serialization failure, retry transaction a> COMMIT; ``` **Fuzzy read:** B should not see a value suddenly change in its transaction, even if A commits a new value. ```sql a> BEGIN; b> BEGIN; b> SELECT * FROM genres WHERE id = 1; 1|Science Fiction a> UPDATE genres SET name = 'Scifi' WHERE id = 1; a> COMMIT; b> SELECT * FROM genres WHERE id = 1; 1|Science Fiction b> COMMIT; b> SELECT * FROM genres WHERE id = 1; 1|Scifi ``` **Read skew:** if A reads two values, and B modifies the second value in between the reads, A should see the old second value. ```sql a> BEGIN; a> SELECT * FROM genres WHERE id = 2; 2|Action b> BEGIN; b> UPDATE genres SET name = 'Drama' WHERE id = 2; b> UPDATE genres SET name = 'Action' WHERE id = 3; b> COMMIT; a> SELECT * FROM genres WHERE id = 3; 3|Drama ``` **Phantom read:** when A runs a query with a predicate, and B commits a matching write, A should not see the write when rerunning it. ```sql a> BEGIN; a> SELECT * FROM genres WHERE id > 2; 3|Drama 4|Comedy b> INSERT INTO genres VALUES (5, 'Western'); a> SELECT * FROM genres WHERE id > 2; 3|Drama 4|Comedy ``` **Write skew:** when A reads row X and writes it to row Y, B should not concurrently be able to read row Y and write it to row X. ```sql a> BEGIN; b> BEGIN; a> SELECT * FROM genres WHERE id = 2; 2|Action b> SELECT * FROM genres WHERE id = 3; 3|Drama b> UPDATE genres SET name = 'Drama' WHERE id = 2; a> UPDATE genres SET name = 'Action' WHERE id = 3; a> COMMIT; b> COMMIT; ``` Here, the writes actually go through. This anomaly is not protected against by snapshot isolation, and thus not by toyDB either - doing so would require implementing serializable snapshot isolation. 
However, this is the only common serialization anomaly not handled by toyDB, and is not among the most severe. ## Time-Travel Queries Since toyDB uses MVCC for transactions and keeps all historical versions, the state of the database can be queried at any arbitrary point in the past. toyDB uses incremental transaction IDs as logical timestamps: ```sql toydb> SELECT * FROM genres; 1|Science Fiction 2|Drama 3|Action 4|Comedy toydb> BEGIN; Began transaction 173 toydb:173> UPDATE genres SET name = 'Scifi' WHERE id = 1; toydb:173> INSERT INTO genres VALUES (5, 'Western'); toydb:173> COMMIT; Committed transaction 173 toydb> SELECT * FROM genres; 1|Scifi 2|Drama 3|Action 4|Comedy 5|Western toydb> BEGIN READ ONLY AS OF SYSTEM TIME 172; Began read-only transaction 175 in snapshot at version 172 toydb@172> SELECT * FROM genres; 1|Science Fiction 2|Drama 3|Action 4|Comedy ``` ================================================ FILE: docs/references.md ================================================ # References This is the main research material I used while building toyDB. It is a subset of my [reading list](https://github.com/erikgrinaker/readings). 
## Introduction Andy Pavlo's CMU lectures are an absolutely fantastic introduction to database internals: - 🎥 [CMU 15-445 Intro to Database Systems](https://www.youtube.com/playlist?list=PLSE8ODhjZXjbohkNBWQs_otTrBTrjyohi) (A Pavlo 2019) - 🎥 [CMU 15-721 Advanced Database Systems](https://www.youtube.com/playlist?list=PLSE8ODhjZXjasmrEd2_Yi1deeE360zv5O) (A Pavlo 2020) Martin Kleppmann has written an excellent overview of database technologies and concepts, while Alex Petrov goes in depth on implementation of storage engines and distributed systems algorithms: - 📖 [Designing Data-Intensive Applications](https://dataintensive.net/) (M Kleppmann 2017) - 📖 [Database Internals](https://www.databass.dev) (A Petrov 2019) ## Raft The Raft consensus algorithm is described in a very readable paper by Diego Ongaro, and in a talk given by his advisor John Ousterhout: - 📄 [In Search of an Understandable Consensus Algorithm](https://raft.github.io/raft.pdf) (D Ongaro, J Ousterhout 2014) - 🎥 [Designing for Understandability: The Raft Consensus Algorithm](https://www.youtube.com/watch?v=vYp4LYbnnW8) (J Ousterhout 2016) However, Raft has several subtle pitfalls, and Jon Gjengset's student guide was very helpful in drawing attention to these: - 🔗 [Students' Guide to Raft](https://thesquareplanet.com/blog/students-guide-to-raft/) (J Gjengset 2016) ## Parsing Thorsten Ball has written a very enjoyable hands-on introduction to parsers where he implements first an interpreter and then a compiler for the made-up Monkey programming language (in Go): - 📖 [Writing An Interpreter In Go](https://interpreterbook.com) (T Ball 2016) - 📖 [Writing A Compiler In Go](https://compilerbook.com) (T Ball 2018) The toyDB expression parser is inspired by a blog post by Eli Bendersky describing the precedence climbing algorithm, which is the algorithm I found the most elegant: - 💬 [Parsing Expressions by Precedence Climbing](https://eli.thegreenplace.net/2012/08/02/parsing-expressions-by-precedence-climbing)
(E Bendersky 2012) ## Transactions Jepsen (i.e. Kyle Kingsbury) has an excellent overview of consistency and isolation models, which is very helpful in making sense of the jungle of overlapping and ill-defined terms: - 🔗 [Consistency Models](https://jepsen.io/consistency) (Jepsen 2016) For more background on this, in particular on how snapshot isolation provided by the MVCC transaction engine used in toyDB does not fit into the traditional SQL isolation levels, the following classic papers were useful: - 📄 [A Critique of ANSI SQL Isolation Levels](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/tr-95-51.pdf) (H Berenson et al 1995) - 📄 [Generalized Isolation Level Definitions](http://pmg.csail.mit.edu/papers/icde00.pdf) (A Adya, B Liskov, P ONeil 2000) As for actually implementing MVCC, I found blog posts to be the most helpful: - 💬 [Implementing Your Own Transactions with MVCC](https://levelup.gitconnected.com/implementing-your-own-transactions-with-mvcc-bba11cab8e70) (E Chance 2015) - 💬 [How Postgres Makes Transactions Atomic](https://brandur.org/postgres-atomicity) (B Leach 2017) ================================================ FILE: docs/sql.md ================================================ # SQL Reference ## Data Types The following data types are supported: * `BOOLEAN` (`BOOL`): logical truth values, i.e. true and false. * `FLOAT` (`DOUBLE`): 64-bit signed floating point numbers, using [IEEE 754 `binary64`](https://en.wikipedia.org/wiki/binary64) encoding. Supports magnitudes of 10⁻³⁰⁷ to 10³⁰⁸ with 53-bit precision (~15 significant figures), as well as the special values infinity and NaN. * `INTEGER` (`INT`): 64-bit signed integer numbers with a range of ±2⁶³-1. * `STRING` (`TEXT`, `VARCHAR`): UTF-8 encoded strings. In addition, the special `NULL` value is used for an unknown value, following the rules of [three-valued logic](https://en.wikipedia.org/wiki/Three-valued_logic). 
Numeric types are not interchangeable; a float value (even without a fractional part) cannot be stored in an integer column and vice-versa. ## SQL Syntax ### Keywords Keywords are reserved words with special meaning in SQL statements. They are case-insensitive, and must be quoted with `"` to be used as identifiers. The complete list is: `AS`, `ASC`, `AND`, `BEGIN`, `BOOL`, `BOOLEAN`, `BY`, `COMMIT`, `CREATE`, `CROSS`, `DEFAULT`, `DELETE`, `DESC`, `DOUBLE`, `DROP`, `EXISTS`, `EXPLAIN`, `FALSE`, `FLOAT`, `FROM`, `GROUP`, `HAVING`, `IF`, `INDEX`, `INFINITY`, `INNER`, `INSERT`, `INT`, `INTEGER`, `INTO`, `IS`, `JOIN`, `KEY`, `LEFT`, `LIKE`, `LIMIT`, `NAN`, `NOT`, `NULL`, `OF`, `OFFSET`, `ON`, `ONLY`, `OR`, `ORDER`, `OUTER`, `PRIMARY`, `READ`, `REFERENCES`, `RIGHT`, `ROLLBACK`, `SELECT`, `SET`, `STRING`, `SYSTEM`, `TABLE`, `TEXT`, `TIME`, `TRANSACTION`, `TRUE`, `UNIQUE`, `UPDATE`, `VALUES`, `VARCHAR`, `WHERE`, `WRITE` ### Identifiers Identifiers are names for database objects such as tables and columns. Unless quoted with `"`, they must begin with a Unicode letter followed by any combination of letters, numbers, and `_`, and cannot be reserved keywords. `""` can be used to escape a double quote character. They are always converted to lowercase. ### Constants #### Named constants The following keywords evaluate to constants: * `FALSE`: the boolean false value. * `INFINITY`: the floating-point value for infinity. * `NAN`: the floating-point value for NaN (not a number). * `NULL`: an unknown value. * `TRUE`: the boolean true value. #### String literals String literals are surrounded by single quotes `'`, and can contain any valid UTF-8 character. Single quotes must be escaped by an additional single quote, i.e. `''`; no other escape sequences are supported. For example: ``` 'A string with ''quotes'' and emojis 😀' ``` #### Numeric literals Sequences of digits `0-9` are parsed as a 64-bit signed integer.
Numbers with decimal points or in scientific notation are parsed as 64-bit floating point numbers. The following pattern is supported: ``` 999[.[999]][e[+-]999] ``` The `-` prefix operator can be used to take negative numbers. ### Expressions Expressions can be used wherever a value is expected, e.g. as `SELECT` columns and `INSERT` values. They are made up of constants, column references, operator invocations, and function calls. Column references can either be unqualified, e.g. `name`, or prefixed with the relation identifier separated by `.`, e.g. `person.name`. Unqualified identifiers must be unambiguous. ## SQL Operators ### Logical operators Logical operators apply standard logic operations on boolean operands. * `AND`: the logical conjunction, e.g. `TRUE AND TRUE` yields `TRUE`. * `OR`: the logical disjunction, e.g. `TRUE OR FALSE` yields `TRUE`. * `NOT`: the logical negation, e.g. `NOT TRUE` yields `FALSE`. The complete truth tables are: | `AND` | `TRUE` | `FALSE` | `NULL` | |-------------|---------|---------|---------| | **`TRUE`** | `TRUE` | `FALSE` | `NULL` | | **`FALSE`** | `FALSE` | `FALSE` | `FALSE` | | **`NULL`** | `NULL` | `FALSE` | `NULL` | | `OR` | `TRUE` | `FALSE` | `NULL` | |-------------|--------|---------|--------| | **`TRUE`** | `TRUE` | `TRUE` | `TRUE` | | **`FALSE`** | `TRUE` | `FALSE` | `NULL` | | **`NULL`** | `TRUE` | `NULL` | `NULL` | | `NOT` | | |-------------|---------| | **`TRUE`** | `FALSE` | | **`FALSE`** | `TRUE` | | **`NULL`** | `NULL` | ### Comparison operators Comparison operators compare values of the same data type, and return `TRUE` if the comparison holds or `FALSE` otherwise. `INTEGER` and `FLOAT` values are interchangeable. `STRING` comparisons use the string's byte values, i.e. case-sensitive with `'B' < 'a'` due to their UTF-8 code points. `FALSE` is considered lesser than `TRUE`. Comparison with `NULL` always yields `NULL` (even `NULL = NULL`). Binary operators: * `=`: equality, e.g. `1 = 1` yields `TRUE`.
* `!=`: inequality, e.g. `1 != 2` yields `TRUE`. * `>`: greater than, e.g. `2 > 1` yields `TRUE`. * `>=`: greater than or equal, e.g. `1 >= 1` yields `TRUE`. * `<`: lesser than, e.g. `1 < 2` yields `TRUE`. * `<=`: lesser than or equal, e.g. `1 <= 1` yields `TRUE`. Unary operators: * `IS NULL`: checks if the value is `NULL`, e.g. `NULL IS NULL` yields `TRUE`. * `IS NOT NULL`: checks if the value is not `NULL`, e.g. `TRUE IS NOT NULL` yields `TRUE`. * `IS NAN`: checks if the value is a float `NAN`, e.g. `NAN IS NAN` yields `TRUE`. Errors on non-float datatypes, except `NULL` which yields `NULL`. * `IS NOT NAN`: checks if the value is not a float `NAN`, e.g. `3.14 IS NOT NAN` yields `TRUE`. ### Mathematical operators Mathematical operators apply standard math operations on numeric (`INTEGER` or `FLOAT`) operands. If either operand is a `FLOAT`, both operands are converted to `FLOAT` and the result is a `FLOAT`. If either operand is `NULL`, the result is `NULL`. The special values `INFINITY` and `NAN` are handled according to the IEEE 754 spec. For `INTEGER` operands, failure conditions such as overflow and division by zero yield an error. For `FLOAT` operands, these return `INFINITY` or `NAN` as appropriate. Binary operators: * `+`: addition, e.g. `1 + 2` yields `3`. * `-`: subtraction, e.g. `3 - 2` yields `1`. * `*`: multiplication, e.g. `3 * 2` yields `6`. * `/`: division, e.g. `6 / 2` yields `3`. * `^`: exponentiation, e.g. `2 ^ 4` yields `16`. * `%`: remainder, e.g. `8 % 3` yields `2`. Unlike modulo, the result has the sign of the dividend. Unary operators: * `+` (prefix): identity, e.g. `+1` yields `1`. * `-` (prefix): negation, e.g. `- -2` yields `2`. * `!` (postfix): factorial, e.g. `5!` yields `120`. ### String operators String operators operate on string operands. * `LIKE`: compares a string with the given pattern, using `%` as multi-character wildcard and `_` as single-character wildcard, returning `TRUE` if the string matches the pattern - e.g.
`'abc' LIKE 'a%'` yields `TRUE`. ### Operator precedence The operator precedence (order of operations) is as follows: | Precedence | Operator | Associativity | |------------|-------------------------|---------------| | 10 | `+`, `-` (prefix) | Right | | 9 | `!` (postfix) | Left | | 8 | `^` | Right | | 7 | `*`, `/`, `%` | Left | | 6 | `+`, `-` | Left | | 5 | `>`, `>=`, `<`, `<=` | Left | | 4 | `=`, `!=`, `LIKE`, `IS` | Left | | 3 | `NOT` | Right | | 2 | `AND` | Left | | 1 | `OR` | Left | Precedence can be overridden by wrapping an expression in parentheses, e.g. `(1 + 2) * 3`. ### Functions * `sqrt(expr)`: returns the square root of a numerical argument. ### Aggregate functions Aggregate functions aggregate an expression across all rows, optionally grouped into buckets given by `GROUP BY`, and results can be filtered via `HAVING`. * `AVG(expr)`: returns the average of numerical values. * `COUNT(expr)`: returns the number of rows for which ***`expr`*** evaluates to a non-`NULL` value. `COUNT(*)` can be used to count all rows. * `MAX(expr)`: returns the maximum value, according to the datatype's ordering. * `MIN(expr)`: returns the minimum value, according to the datatype's ordering. * `SUM(expr)`: returns the sum of numerical values. ## SQL Statements ### `BEGIN` Starts a new [transaction](#transactions).
BEGIN [ TRANSACTION ] [ READ ONLY | READ WRITE ] [ AS OF SYSTEM TIME txn_id ]
* ***`txn_id`***: A past transaction ID at which to run a read-only transaction, for time-travel queries. ### `COMMIT` Commits an active [transaction](#transactions). ### `CREATE TABLE` Creates a new table.
CREATE TABLE table_name (
    [ column_name data_type [ column_constraint [ ... ] ]  [ INDEX ] [, ... ] ]
)

where column_constraint is:

{ NOT NULL | NULL | PRIMARY KEY | DEFAULT expr | REFERENCES ref_table | UNIQUE }
* ***`table_name`***: The name of the table. Must be a [valid identifier](#identifiers). Errors if a table with this name already exists. * ***`column_name`***: The name of the column. Must be a [valid identifier](#identifiers), and unique within the table. * ***`data_type`***: The data type of the column, see [data types](#data-types) for valid types. * `NOT NULL`: The column may not contain `NULL` values. * `NULL`: The column may contain `NULL` values. This is the default. * `PRIMARY KEY`: The column should act as a primary key, i.e. the main row identifier. A table must have exactly one primary key column, and it must be unique and non-nullable. * `DEFAULT`***`expr`***: Specifies a default value for the column when `INSERT` statements do not give a value. ***`expr`*** can be any constant expression of an appropriate data type, e.g. `'abc'` or `1 + 2 * 3`. For nullable columns, the default value is `NULL` unless specified otherwise. * `REFERENCES`***`ref_table`***: The column is a foreign key to ***`ref_table`***'s primary key, enforcing referential integrity. * `UNIQUE`: The column may only contain unique (distinct) values. `NULL` values are not considered equal, thus a `UNIQUE` column which allows `NULL` may contain multiple `NULL` values. `PRIMARY KEY` columns are implicitly `UNIQUE`. * `INDEX`: Create an index for the column. #### Example ```sql CREATE TABLE movie ( id INTEGER PRIMARY KEY, title STRING NOT NULL, release_year INTEGER INDEX, imdb_id STRING INDEX UNIQUE, bluray BOOLEAN NOT NULL DEFAULT TRUE ) ``` ### `DELETE` Deletes rows in a table.
DELETE FROM table_name
    [ WHERE predicate ]
Deletes rows where ***`predicate`*** evaluates to `TRUE`, or all rows if no `WHERE` clause is given. * ***`table_name`***: the table to delete from. Errors if it does not exist. * ***`predicate`***: an expression which determines which rows to delete by evaluating to `TRUE`. Must evaluate to a `BOOLEAN` or `NULL`, otherwise an error is returned. #### Example ```sql DELETE FROM movie WHERE release_year < 2000 AND bluray = FALSE ``` ### `DROP TABLE` Deletes a table and all contained data. Errors if the table does not exist, unless `IF EXISTS` is given.
DROP TABLE [ IF EXISTS ] table_name
* ***`table_name`***: the table to delete. ### `EXPLAIN` Outputs the execution plan for the given statement.
EXPLAIN [ statement ]
### `INSERT` Inserts rows into a table.
INSERT INTO table_name
    [ ( column_name [, ... ] ) ]
    VALUES ( expression [, ... ] ) [, ... ]
If column names are given, an identical number of values must be given. If no column names are given, values must be given in the table's column order. Omitted columns will get a default value if specified, otherwise an error will be returned. * ***`table_name`***: the table to insert into. Errors if it does not exist. * ***`column_name`***: a column to insert into in the given table. Errors if it does not exist. * ***`expression`***: an expression to insert into the corresponding column. Must be a constant expression, i.e. it cannot refer to table columns. #### Example ```sql INSERT INTO movie (id, title, release_year) VALUES (1, 'Sicario', 2015), (2, 'Stalker', 1979), (3, 'Her', 2013) ``` ### `ROLLBACK` Rolls back an active [transaction](#transactions). ### `SELECT` Selects rows from a table.
SELECT [ * | expression [ [ AS ] output_name [, ...] ] ]
    [ FROM from_item [, ...] ]
    [ WHERE predicate ]
    [ GROUP BY group_expr [, ...] ]
    [ HAVING having_expr ]
    [ ORDER BY order_expr [ ASC | DESC ] [, ...] ]
    [ LIMIT count ]
    [ OFFSET start ]

where from_item is one of:

table_name [ [ AS ] alias ]
from_item join_type from_item [ ON join_predicate ]

where join_type is one of:

CROSS JOIN
[ INNER ] JOIN
LEFT [ OUTER ] JOIN
RIGHT [ OUTER ] JOIN

Fetches rows or expressions, either from table ***`table_name`*** (if given) or generated. * ***`expression`***: [expression](#expressions) to fetch (can be a simple column name). * ***`output_name`***: output column [identifier](#identifiers), defaults to column name (if single column) otherwise nothing (displayed as `?`). * ***`table_name`***: table to fetch rows from. * ***`alias`***: table alias. * ***`predicate`***: only return rows for which this [expression](#expressions) evaluates to `TRUE`. * ***`group_expr`***: an expression to group aggregates by. Non-aggregate `SELECT` expressions must either reference a column given in `group_expr`, be identical with a `group_expr`, or have an `output_name` that is referenced by a `group_expr` column. * ***`having_expr`***: only return aggregate results for which this [expression](#expressions) evaluates to `TRUE`. * ***`order_expr`***: order rows by this expression (can be a simple column name). * ***`count`***: maximum number of rows to return. Must be a constant integer expression. * ***`start`***: number of rows to skip. Must be a constant integer expression. * ***`join_predicate`***: only return rows for which this [expression](#expressions) evaluates to `TRUE`. Join types: * `CROSS JOIN`: returns the Cartesian product of the joined tables. Does not accept a join predicate (`ON` clause). * `INNER JOIN`: returns the rows of the tables' Cartesian product for which ***`join_predicate`*** evaluates to `TRUE`. * `LEFT OUTER JOIN`: returns the rows joined on the ***`join_predicate`***, or for any rows in the left table that do not have a match in the right table a single row is returned with the right table's columns set to `NULL`. * `RIGHT OUTER JOIN`: the same as a `LEFT OUTER JOIN` but with the left and right tables switched.
#### Example ```sql SELECT id, title, 2020 - released AS age FROM movies WHERE released >= 2000 AND ultrahd ORDER BY released DESC, title ASC LIMIT 10 OFFSET 10 ``` ### `UPDATE` Updates rows in a table.
UPDATE table_name
    SET column_name = expression | DEFAULT [, ... ]
    [ WHERE predicate ]
Updates columns given by ***`column_name`*** to the corresponding ***`expression`*** for all rows where ***`predicate`*** evaluates to `TRUE`. If no `WHERE` clause is given, all rows are updated. * ***`table_name`***: the table to update. Errors if it does not exist. * ***`column_name`***: a column to update. Errors if it does not exist. * ***`expression`***: an expression whose evaluated value will be set for the corresponding column and row. Expressions can refer to column values, and must evaluate to the same datatype as the updated column. Using `DEFAULT` will set the column's default value, if any. * ***`predicate`***: an expression which determines which rows to update by evaluating to `TRUE`. Must evaluate to a `BOOLEAN` or `NULL`, otherwise an error is returned. #### Example ```sql UPDATE movie SET bluray = TRUE WHERE release_year >= 2000 AND bluray = FALSE ``` ## Transactions toyDB supports ACID transactions using MVCC-based snapshot isolation, protecting from the following anomalies: dirty writes, dirty reads, lost updates, fuzzy reads, read skew, and phantom reads. However, write skew anomalies are possible since serializable snapshot isolation is not implemented. A new transaction is started with `BEGIN`, and ended with either `COMMIT` (atomically writing all changes) or `ROLLBACK` (discarding all changes). If any conflicts occur between concurrent transactions, the lowest transaction ID wins and the others will fail with a serialization error and must retry. All past data is versioned and retained, and can be queried as of a given transaction ID via `BEGIN TRANSACTION READ ONLY AS OF SYSTEM TIME txn_id`. A transaction is still valid for use if a contained statement returns an error. It is up to the client to take appropriate action. ================================================ FILE: docs/tools/update-links.py ================================================ #!/usr/bin/env python3 # # Updates GitHub code links to the latest commit SHA.
import os, re, sys, argparse import requests GITHUB_API = "https://api.github.com" def get_latest_sha(owner, repo, path, token): url = f"{GITHUB_API}/repos/{owner}/{repo}/commits" headers = {} if token: headers["Authorization"] = f"token {token}" params = {"path": path, "sha": "main", "per_page": 1} resp = requests.get(url, headers=headers, params=params) resp.raise_for_status() data = resp.json() return data[0]["sha"] if data else None def process_markdown(text, token): pattern = re.compile( r"https://github\.com/(?P[^/]+)/(?P[^/]+)/blob/" r"(?P[0-9a-f]{7,40})/(?P[^#)\s]+)" ) cache = {} def replacer(m): print(f"Checking {m.group(0)}") owner, repo, oldsha, path = m.group("owner","repo","oldsha","path") key = (owner, repo, path) print(f"Key: {key}") if key not in cache: cache[key] = get_latest_sha(owner, repo, path, token) newsha = cache[key] if newsha and newsha != oldsha: print(f"Updating {m.group(0)} to {newsha}") return m.group(0).replace(oldsha, newsha) return m.group(0) return pattern.sub(replacer, text) def main(): p = argparse.ArgumentParser(description="Update GitHub blob links to latest SHAs") p.add_argument("file", nargs="?", help="Markdown file to update (defaults to stdin/stdout)") args = p.parse_args() token = os.getenv("GITHUB_TOKEN") if args.file: text = open(args.file, encoding="utf-8").read() updated = process_markdown(text, token) with open(args.file, "w", encoding="utf-8") as f: f.write(updated) else: text = sys.stdin.read() sys.stdout.write(process_markdown(text, token)) if __name__ == "__main__": main() ================================================ FILE: rust-toolchain ================================================ 1.93.1 ================================================ FILE: rustfmt.toml ================================================ use_small_heuristics = "Max" ================================================ FILE: src/bin/toydb.rs ================================================ //! The toyDB server. 
Takes configuration from a config file (default //! config/toydb.yaml) or corresponding TOYDB_ environment variables. Listens //! for SQL clients (default port 9601) and Raft connections from other toyDB //! peers (default port 9701). The Raft log and SQL database are stored at //! data/raft and data/sql by default. //! //! Use the toysql command-line client to connect to the server. #![warn(clippy::all)] use std::collections::HashMap; use std::path::Path; use clap::Parser as _; use serde::Deserialize; use toydb::Server; use toydb::errinput; use toydb::error::Result; use toydb::raft; use toydb::sql; use toydb::storage; fn main() { if let Err(error) = Command::parse().run() { eprintln!("Error: {error}") } } /// The toyDB server configuration. Can be provided via config file (default /// config/toydb.yaml) or TOYDB_ environment variables. #[derive(Debug, Deserialize)] struct Config { /// The node ID. Must be unique in the cluster. id: raft::NodeID, /// The other nodes in the cluster, and their Raft TCP addresses. peers: HashMap, /// The Raft listen address. listen_raft: String, /// The SQL listen address. listen_sql: String, /// The log level. log_level: String, /// The path to this node's data directory. The Raft log is stored in /// the file "raft", and the SQL state machine in "sql". data_dir: String, /// The Raft storage engine: bitcask or memory. storage_raft: String, /// The SQL storage engine: bitcask or memory. storage_sql: String, /// If false, don't fsync Raft log writes to disk. Disabling this /// will yield much better write performance, but may lose data on /// host crashes which compromises Raft safety guarantees. fsync: bool, /// The garbage fraction threshold at which to trigger compaction. compact_threshold: f64, /// The minimum bytes of garbage before triggering compaction. compact_min_bytes: u64, } impl Config { /// Loads the configuration from the given file. fn load(file: &str) -> Result { Ok(config::Config::builder() .set_default("id", "1")? 
.set_default("listen_sql", "localhost:9601")? .set_default("listen_raft", "localhost:9701")? .set_default("log_level", "info")? .set_default("data_dir", "data")? .set_default("storage_raft", "bitcask")? .set_default("storage_sql", "bitcask")? .set_default("fsync", true)? .set_default("compact_threshold", 0.2)? .set_default("compact_min_bytes", 1_000_000)? .add_source(config::File::with_name(file)) .add_source(config::Environment::with_prefix("TOYDB")) .build()? .try_deserialize()?) } } /// The toyDB server command. #[derive(clap::Parser)] #[command(about = "Starts a toyDB server.", version, propagate_version = true)] struct Command { /// The configuration file path. #[arg(short = 'c', long, default_value = "config/toydb.yaml")] config: String, } impl Command { /// Runs the toyDB server. fn run(self) -> Result<()> { // Load the configuration. let cfg = Config::load(&self.config)?; // Initialize logging. let loglevel = cfg.log_level.parse()?; let mut logconfig = simplelog::ConfigBuilder::new(); if loglevel != simplelog::LevelFilter::Debug { logconfig.add_filter_allow_str("toydb"); } simplelog::SimpleLogger::init(loglevel, logconfig.build())?; // Initialize the Raft log storage engine. let datadir = Path::new(&cfg.data_dir); let mut raft_log = match cfg.storage_raft.as_str() { "bitcask" | "" => { let engine = storage::BitCask::new_maybe_compact( datadir.join("raft"), cfg.compact_threshold, cfg.compact_min_bytes, )?; raft::Log::new(Box::new(engine))? } "memory" => raft::Log::new(Box::new(storage::Memory::new()))?, name => return errinput!("invalid Raft storage engine {name}"), }; raft_log.enable_fsync(cfg.fsync); // Initialize the SQL storage engine. let raft_state: Box = match cfg.storage_sql.as_str() { "bitcask" | "" => { let engine = storage::BitCask::new_maybe_compact( datadir.join("sql"), cfg.compact_threshold, cfg.compact_min_bytes, )?; Box::new(sql::engine::Raft::new_state(engine)?) 
} "memory" => Box::new(sql::engine::Raft::new_state(storage::Memory::new())?), name => return errinput!("invalid SQL storage engine {name}"), }; // Start the server. Server::new(cfg.id, cfg.peers, raft_log, raft_state)? .serve(&cfg.listen_raft, &cfg.listen_sql) } } ================================================ FILE: src/bin/toydump.rs ================================================ //! toydump is a debug tool that prints a toyDB BitCask database in //! human-readable form. It can print both the SQL database and the Raft log //! (via --raft). It only outputs live BitCask data, not garbage entries. #![warn(clippy::all)] use clap::Parser as _; use toydb::encoding::format::{self, Formatter as _}; use toydb::error::Result; use toydb::storage::{BitCask, Engine as _}; fn main() { if let Err(error) = Command::parse().run() { eprintln!("Error: {error}") } } /// The toydump command. #[derive(clap::Parser)] #[command(about = "Prints toyDB file contents.", version, propagate_version = true)] struct Command { /// The BitCask file to dump (SQL database unless --raft). file: String, /// The file is a Raft log, not SQL database. #[arg(long)] raft: bool, /// Also show raw key and value. #[arg(long)] raw: bool, } impl Command { /// Runs the command. fn run(self) -> Result<()> { let mut engine = BitCask::new(self.file.into())?; let mut scan = engine.scan(..); while let Some((key, value)) = scan.next().transpose()? { let mut string = match self.raft { true => format::Raft::::key_value(&key, &value), false => format::MVCC::::key_value(&key, &value), }; if self.raw { string = format!("{string} [{}]", format::Raw::key_value(&key, &value)) } println!("{string}"); } Ok(()) } } ================================================ FILE: src/bin/toysql.rs ================================================ //! toySQL is a command-line client for toyDB. It connects to a toyDB node //! (default localhost:9601) and executes SQL statements against it via an //! interactive shell interface. 
Command history is stored in .toysql.history. #![warn(clippy::all)] use std::path::PathBuf; use clap::Parser as _; use itertools::Itertools as _; use rustyline::error::ReadlineError; use rustyline::history::DefaultHistory; use rustyline::validate::{ValidationContext, ValidationResult, Validator}; use rustyline::{Editor, Modifiers}; use rustyline_derive::{Completer, Helper, Highlighter, Hinter}; use toydb::Client; use toydb::errinput; use toydb::error::Result; use toydb::sql::execution::StatementResult; use toydb::sql::parser::{Lexer, Token}; fn main() { if let Err(error) = Command::parse().run() { eprintln!("Error: {error}"); } } /// The toySQL command. #[derive(clap::Parser)] #[command(about = "A toyDB client.", version, propagate_version = true)] struct Command { /// A SQL statement to execute, then exit. #[arg()] statement: Option, /// Host to connect to. #[arg(short = 'H', long, default_value = "localhost")] host: String, /// Port number to connect to. #[arg(short = 'p', long, default_value = "9601")] port: u16, } impl Command { /// Runs the command. fn run(self) -> Result<()> { let mut shell = Shell::new(&self.host, self.port)?; match self.statement { Some(statement) => shell.execute(&statement), None => shell.run(), } } } /// An interactive toySQL shell. struct Shell { /// The toyDB client. client: Client, /// The Rustyline command editor. editor: Editor, /// The path to the history file, if any. history_path: Option, /// If true, SELECT column headers will be displayed. show_headers: bool, } impl Shell { /// Creates a new shell connected to the given server. fn new(host: &str, port: u16) -> Result { let client = Client::connect((host, port))?; // Set up Rustyline. Make sure multiline pastes are handled normally. 
let mut editor = Editor::new()?; editor.set_helper(Some(InputValidator)); editor.bind_sequence( rustyline::KeyEvent(rustyline::KeyCode::BracketedPasteStart, Modifiers::NONE), rustyline::Cmd::Noop, ); let history_path = std::env::var_os("HOME").map(|home| PathBuf::from(home).join(".toysql.history")); Ok(Self { client, editor, history_path, show_headers: false }) } /// Executes a SQL statement or ! command. fn execute(&mut self, input: &str) -> Result<()> { if input.starts_with('!') { self.execute_command(input) } else if !input.is_empty() { self.execute_sql(input) } else { Ok(()) } } /// Executes a toySQL ! command (e.g. !help) fn execute_command(&mut self, input: &str) -> Result<()> { let mut input = input.split_ascii_whitespace(); let Some(command) = input.next() else { return errinput!("expected command"); }; let args = input.collect_vec(); match (command, args.as_slice()) { // Toggles column headers. ("!headers", []) => { self.show_headers = !self.show_headers; match self.show_headers { true => println!("Headers enabled"), false => println!("Headers disabled"), } } ("!headers", _) => return errinput!("!headers takes no arguments"), // Displays help. ("!help", []) => println!( r#" Enter a SQL statement terminated by a semicolon (;) to execute it, or Ctrl-D to exit. The following commands are also available: !headers Toggles column headers !help This help message !status Display server status !table NAME Display a table schema !tables List tables "# ), ("!help", _) => return errinput!("!help takes no arguments"), // Displays server status. 
("!status", []) => { let status = self.client.status()?; println!( r#" Server: n{server} with Raft leader n{leader} in term {term} for {nodes} nodes Raft log: {committed} committed, {applied} applied, {raft_size} MB, {raft_garbage}% garbage ({raft_storage} engine) Replication: {raft_match} SQL storage: {sql_keys} keys, {sql_size} MB logical, {nodes}x {sql_disk_size} MB disk, {sql_garbage}% garbage ({sql_storage} engine) Transactions: {active_txns} active, {versions} total "#, server = status.server, leader = status.raft.leader, term = status.raft.term, nodes = status.raft.match_index.len(), committed = status.raft.commit_index, applied = status.raft.applied_index, raft_size = format_args!("{:.3}", status.raft.storage.size as f64 / 1_000_000.0), raft_garbage = format_args!("{:.0}", status.raft.storage.garbage_disk_percent()), raft_storage = status.raft.storage.name, raft_match = status.raft.match_index.iter().map(|(n, m)| format!("n{n}:{m}")).join(" "), sql_keys = status.mvcc.storage.keys, sql_size = format_args!("{:.3}", status.mvcc.storage.size as f64 / 1_000_000.0), sql_disk_size = format_args!("{:.3}", status.mvcc.storage.disk_size as f64 / 1_000_000.0), sql_garbage = format_args!("{:.0}", status.mvcc.storage.garbage_disk_percent()), sql_storage = status.mvcc.storage.name, active_txns = status.mvcc.active_txns, versions = status.mvcc.versions, ) } ("!status", _) => return errinput!("!status takes no arguments"), ("!table", [name]) => println!("{}", self.client.get_table(name)?), ("!table", _) => return errinput!("!table takes 1 argument"), ("!tables", []) => self.client.list_tables()?.iter().for_each(|t| println!("{t}")), ("!tables", _) => return errinput!("!tables takes no arguments"), (command, _) => return errinput!("unknown command {command}"), } Ok(()) } /// Executes a SQL statement and displays the results. fn execute_sql(&mut self, statement: &str) -> Result<()> { use StatementResult::*; match self.client.execute(statement)? 
{ Begin(state) => match state.read_only { true => println!("Began read-only transaction at version {}", state.version), false => println!("Began transaction {}", state.version), }, Commit { version } => println!("Committed transaction {version}"), Rollback { version } => println!("Rolled back transaction {version}"), Insert { count } => println!("Inserted {count} rows"), Delete { count } => println!("Deleted {count} rows"), Update { count } => println!("Updated {count} rows"), CreateTable { name } => println!("Created table {name}"), DropTable { name, existed } => match existed { true => println!("Dropped table {name}"), false => println!("Table {name} does not exist"), }, Explain(plan) => println!("{plan}"), Select { columns, rows } => { if self.show_headers { println!("{}", columns.iter().map(|c| c.as_header()).join(", ")); } for row in rows { println!("{}", row.iter().join(", ")); } } } Ok(()) } /// Prompts the user for input. Returns None if the shell should close. fn prompt(&mut self) -> rustyline::Result { let prompt = match self.client.txn() { Some(txn) if txn.read_only => format!("toydb@{}> ", txn.version), Some(txn) => format!("toydb:{}> ", txn.version), None => "toydb> ".to_string(), }; self.editor.readline(&prompt) } /// Runs the interactive shell. fn run(&mut self) -> Result<()> { // Load the history file, if any. if let Some(history_path) = &self.history_path { match self.editor.load_history(history_path) { Ok(()) => {} Err(ReadlineError::Io(error)) if error.kind() == std::io::ErrorKind::NotFound => {} Err(error) => return Err(error.into()), } } // Print welcome message. let server = self.client.status()?.server; println!("Connected to toyDB node n{server}. Enter !help for instructions."); // Prompt for commands and execute them. 
loop { let input = match self.prompt() { Ok(input) => input.trim().to_string(), Err(ReadlineError::Interrupted) => continue, Err(ReadlineError::Eof) => break, Err(error) => return Err(error.into()), }; self.editor.add_history_entry(&input)?; if let Err(error) = self.execute(&input) { eprintln!("Error: {error}"); }; } // Save the history file. if let Some(history_path) = &self.history_path { self.editor.save_history(history_path)?; } Ok(()) } } /// A Rustyline helper for multiline editing. After a new line is entered, it /// determines whether the input makes up a complete SQL statement that should /// be submitted to the server (i.e. it's terminated by ;), or wait for further /// input. #[derive(Completer, Helper, Highlighter, Hinter)] struct InputValidator; impl Validator for InputValidator { fn validate(&self, ctx: &mut ValidationContext) -> rustyline::Result { let input = ctx.input(); // Empty lines and ! commands are ready. if input.is_empty() || input.starts_with('!') || input == ";" { return Ok(ValidationResult::Valid(None)); } // For SQL statements, just look for any semicolon or lexer error, and // rely on the server for further validation and error handling. if Lexer::new(input).any(|r| matches!(r, Ok(Token::Semicolon) | Err(_))) { return Ok(ValidationResult::Valid(None)); } // Otherwise, wait for more input. Ok(ValidationResult::Incomplete) } fn validate_while_typing(&self) -> bool { false // only check after completed lines } } ================================================ FILE: src/bin/workload.rs ================================================ //! Runs toyDB workload benchmarks. By default, it assumes a running 5-node //! cluster as launched via cluster/run.sh, but this can be modified via -H. //! For example, a read-only workload can be run as: //! //! cargo run --release --bin workload -- read //! //! See --help for a list of available workloads and arguments. 
#![warn(clippy::all)] use std::cmp::min; use std::collections::HashSet; use std::io::Write as _; use std::time::{Duration, Instant}; use clap::Parser; use hdrhistogram::Histogram; use itertools::Itertools as _; use rand::SeedableRng as _; use rand::distr::Distribution as _; use rand::rngs::StdRng; use rand::seq::IndexedRandom as _; use toydb::error::Result; use toydb::sql::types::{Row, Rows}; use toydb::{Client, StatementResult}; fn main() { let Command { runner, subcommand } = Command::parse(); let result = match subcommand { Subcommand::Read(read) => runner.run(read), Subcommand::Write(write) => runner.run(write), Subcommand::Bank(bank) => runner.run(bank), }; if let Err(error) = result { eprintln!("Error: {error}") } } /// Handles command-line parsing. #[derive(clap::Parser)] #[command(about = "Runs toyDB workload benchmarks.", version, propagate_version = true)] struct Command { #[command(flatten)] runner: Runner, #[command(subcommand)] subcommand: Subcommand, } #[derive(clap::Subcommand)] enum Subcommand { Read(Read), Write(Write), Bank(Bank), } /// Runs a workload benchmark. #[derive(clap::Args)] struct Runner { /// Hosts to connect to (optionally with port number). #[arg( short = 'H', long, value_delimiter = ',', default_value = "localhost:9601,localhost:9602,localhost:9603,localhost:9604,localhost:9605" )] hosts: Vec, /// Number of concurrent workers to spawn. #[arg(short, long, default_value = "16")] concurrency: usize, /// Number of transactions to execute. #[arg(short = 'n', long, default_value = "100000")] count: usize, /// Seed to use for random number generation. #[arg(short, long, default_value = "16791084677885396490")] seed: u64, } impl Runner { /// Runs the specified workload. fn run(self, workload: W) -> Result<()> { let mut rng = StdRng::seed_from_u64(self.seed); let mut client = Client::connect(&self.hosts[0])?; // Set up a histogram recording txn latencies as nanoseconds. The // buckets range from 0.001s to 10s. 
let mut hist = Histogram::::new_with_bounds(1_000, 10_000_000_000, 3)?.into_sync(); // Prepare the dataset. print!("Preparing initial dataset... "); std::io::stdout().flush()?; let start = Instant::now(); workload.prepare(&mut client, &mut rng)?; println!("done ({:.3}s)", start.elapsed().as_secs_f64()); // Spawn workers, round robin across hosts. std::thread::scope(|s| -> Result<()> { print!("Spawning {} workers... ", self.concurrency); std::io::stdout().flush()?; let start = Instant::now(); let (work_tx, work_rx) = crossbeam::channel::bounded(self.concurrency); let (done_tx, done_rx) = crossbeam::channel::bounded::<()>(0); for addr in self.hosts.iter().cycle().take(self.concurrency) { let mut client = Client::connect(addr)?; let mut recorder = hist.recorder(); let work_rx = work_rx.clone(); let done_tx = done_tx.clone(); s.spawn(move || -> Result<()> { while let Ok(item) = work_rx.recv() { let start = Instant::now(); client.with_retry(|client| W::execute(client, &item))?; recorder.record(start.elapsed().as_nanos() as u64)?; } drop(done_tx); // disconnects done_rx once all workers exit Ok(()) }); } drop(done_tx); // drop local copy println!("done ({:.3}s)", start.elapsed().as_secs_f64()); // Spawn work generator. { println!("Running workload {}...", workload); let generator = workload.generate(rng)?.take(self.count); s.spawn(move || -> Result<()> { for item in generator { work_tx.send(item)?; } Ok(()) }); } // Periodically print stats until all workers are done. let start = Instant::now(); let ticker = crossbeam::channel::tick(Duration::from_secs(1)); println!(); println!("Time Progress Txns Rate p50 p90 p99 pMax"); while let Err(crossbeam::channel::TryRecvError::Empty) = done_rx.try_recv() { crossbeam::select! 
{ recv(ticker) -> _ => {}, recv(done_rx) -> _ => {}, } let duration = start.elapsed().as_secs_f64(); hist.refresh_timeout(Duration::from_secs(1)); println!( "{:<8} {:>5.1}% {:>7} {:>6.0}/s {:>6.1}ms {:>6.1}ms {:>6.1}ms {:>6.1}ms", format!("{:.1}s", duration), hist.len() as f64 / self.count as f64 * 100.0, hist.len(), hist.len() as f64 / duration, Duration::from_nanos(hist.value_at_quantile(0.5)).as_secs_f64() * 1000.0, Duration::from_nanos(hist.value_at_quantile(0.9)).as_secs_f64() * 1000.0, Duration::from_nanos(hist.value_at_quantile(0.99)).as_secs_f64() * 1000.0, Duration::from_nanos(hist.max()).as_secs_f64() * 1000.0, ); } Ok(()) })?; // Verify the final dataset. println!(); print!("Verifying dataset... "); std::io::stdout().flush()?; let start = Instant::now(); workload.verify(&mut client, self.count)?; println!("done ({:.3}s)", start.elapsed().as_secs_f64()); Ok(()) } } /// A workload. trait Workload: std::fmt::Display { /// A work item. type Item: Send; /// Prepares the workload by creating initial tables and data. fn prepare(&self, client: &mut Client, rng: &mut StdRng) -> Result<()>; /// Generates work items as an iterator. fn generate(&self, rng: StdRng) -> Result + Send + 'static>; /// Executes a single work item. This will automatically be retried on /// certain errors, and must use a transaction where appropriate. fn execute(client: &mut Client, item: &Self::Item) -> Result<()>; /// Verifies the dataset after the workload has completed. fn verify(&self, _client: &mut Client, _txns: usize) -> Result<()> { Ok(()) } } /// A read-only workload. Creates an id,value table and populates it with the /// given row count and value size. Then runs batches of random primary key /// lookups (SELECT * FROM read WHERE id = 1 OR id = 2 ...). #[derive(clap::Args, Clone)] #[command(about = "A read-only workload using primary key lookups")] struct Read { /// Total number of rows in data set. 
#[arg(short, long, default_value = "1000")] rows: u64, /// Row value size (excluding primary key). #[arg(short, long, default_value = "64")] size: usize, /// Number of rows to fetch in a single select. #[arg(short, long, default_value = "1")] batch: usize, } impl std::fmt::Display for Read { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "read (rows={} size={} batch={})", self.rows, self.size, self.batch) } } impl Workload for Read { type Item = HashSet; fn prepare(&self, client: &mut Client, rng: &mut StdRng) -> Result<()> { client.execute("BEGIN")?; client.execute(r#"DROP TABLE IF EXISTS "read""#)?; client.execute(r#"CREATE TABLE "read" (id INT PRIMARY KEY, value STRING NOT NULL)"#)?; let chars = &mut rand::distr::Alphanumeric.sample_iter(rng).map(|b| b as char); let rows = (1..=self.rows).map(|id| (id, chars.take(self.size).collect::())); let chunks = rows.chunks(100); let queries = chunks.into_iter().map(|chunk| { format!( r#"INSERT INTO "read" (id, value) VALUES ({})"#, chunk.map(|(id, value)| format!("{}, '{}'", id, value)).join("), (") ) }); for query in queries { client.execute(&query)?; } client.execute("COMMIT")?; Ok(()) } fn generate(&self, rng: StdRng) -> Result + 'static> { Ok(ReadGenerator { batch: self.batch, dist: rand::distr::Uniform::new(1, self.rows + 1)?, rng, }) } fn execute(client: &mut Client, item: &Self::Item) -> Result<()> { let batch_size = item.len(); let query = format!( r#"SELECT * FROM "read" WHERE {}"#, item.iter().map(|id| format!("id = {}", id)).join(" OR ") ); let rows: Rows = client.execute(&query)?.try_into()?; assert_eq!(rows.count(), batch_size, "Unexpected row count"); Ok(()) } fn verify(&self, client: &mut Client, _: usize) -> Result<()> { let count: i64 = client.execute(r#"SELECT COUNT(*) FROM "read""#)?.try_into()?; assert_eq!(count, self.rows as i64, "Unexpected row count"); Ok(()) } } /// A Read workload generator, yielding batches of random, unique primary keys. 
struct ReadGenerator { batch: usize, rng: StdRng, dist: rand::distr::Uniform, } impl Iterator for ReadGenerator { type Item = ::Item; fn next(&mut self) -> Option { let mut ids = HashSet::new(); for id in self.dist.sample_iter(&mut self.rng) { ids.insert(id); if ids.len() >= self.batch { break; } } Some(ids) } } /// A write-only workload. Creates an id,value table, and writes rows with /// sequential primary keys and the given value size, in the given batch size /// (INSERT INTO write (id, value) VALUES ...). The number of rows written /// is given by Runner.count * Write.batch. #[derive(clap::Args, Clone)] #[command(about = "A write-only workload writing sequential rows")] struct Write { /// Row value size (excluding primary key). #[arg(short, long, default_value = "64")] size: usize, /// Number of rows to write in a single insert query. #[arg(short, long, default_value = "1")] batch: usize, } impl std::fmt::Display for Write { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "write (size={} batch={})", self.size, self.batch) } } impl Workload for Write { type Item = Vec<(u64, String)>; fn prepare(&self, client: &mut Client, _: &mut StdRng) -> Result<()> { client.execute("BEGIN")?; client.execute(r#"DROP TABLE IF EXISTS "write""#)?; client.execute(r#"CREATE TABLE "write" (id INT PRIMARY KEY, value STRING NOT NULL)"#)?; client.execute("COMMIT")?; Ok(()) } fn generate(&self, rng: StdRng) -> Result + 'static> { Ok(WriteGenerator { next_id: 1, size: self.size, batch: self.batch, rng }) } fn execute(client: &mut Client, item: &Self::Item) -> Result<()> { let batch_size = item.len(); let query = format!( r#"INSERT INTO "write" (id, value) VALUES {}"#, item.iter().map(|(id, value)| format!("({}, '{}')", id, value)).join(", ") ); if let StatementResult::Insert { count } = client.execute(&query)? 
{ assert_eq!(count as usize, batch_size, "Unexpected row count"); } else { panic!("Unexpected result") } Ok(()) } fn verify(&self, client: &mut Client, txns: usize) -> Result<()> { let count: i64 = client.execute(r#"SELECT COUNT(*) FROM "write""#)?.try_into()?; assert_eq!(count as usize, txns * self.batch, "Unexpected row count"); Ok(()) } } /// A Write workload generator, yielding batches of sequential primary keys and /// random rows. struct WriteGenerator { next_id: u64, size: usize, batch: usize, rng: StdRng, } impl Iterator for WriteGenerator { type Item = ::Item; fn next(&mut self) -> Option { let chars = &mut rand::distr::Alphanumeric.sample_iter(&mut self.rng).map(|b| b as char); let mut rows = Vec::with_capacity(self.batch); while rows.len() < self.batch { rows.push((self.next_id, chars.take(self.size).collect())); self.next_id += 1; } Some(rows) } } /// A bank workload. Creates a set of customers and accounts, and makes random /// transfers between them. Specifically, it picks two random customers A and B, /// and then finds A's highest-balance account and B's lowest-balance account, /// and transfers a random amount without overdrawing the account. This /// somewhat convoluted scheme is used to make the workload slightly less /// trivial, including joins, ordering, and secondary indexes. #[derive(clap::Args, Clone)] #[command(about = "A bank workload, making transfers between customer accounts")] struct Bank { /// Number of customers. #[arg(short, long, default_value = "100")] customers: u64, /// Number of accounts per customer. #[arg(short, long, default_value = "10")] accounts: u64, /// Initial account balance. #[arg(short, long, default_value = "100")] balance: u64, /// Max amount to transfer. 
#[arg(short, long, default_value = "50")] max_transfer: u64, } impl std::fmt::Display for Bank { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "bank (customers={} accounts={})", self.customers, self.accounts) } } impl Workload for Bank { type Item = (u64, u64, u64); // from,to,amount fn prepare(&self, client: &mut Client, rng: &mut StdRng) -> Result<()> { let petnames = petname::Petnames::default(); client.execute("BEGIN")?; client.execute("DROP TABLE IF EXISTS account")?; client.execute("DROP TABLE IF EXISTS customer")?; client.execute( "CREATE TABLE customer ( id INTEGER PRIMARY KEY, name STRING NOT NULL )", )?; client.execute( "CREATE TABLE account ( id INTEGER PRIMARY KEY, customer_id INTEGER NOT NULL INDEX REFERENCES customer, balance INTEGER NOT NULL )", )?; client.execute(&format!( "INSERT INTO customer VALUES {}", (1..=self.customers) .map(|id| { let name = [ *petnames.adverbs.choose(rng).expect("no adverb"), *petnames.adjectives.choose(rng).expect("no adjective"), *petnames.nouns.choose(rng).expect("no noun"), ] .join(" "); (id, name) }) .map(|(id, name)| format!("({}, '{}')", id, name)) .join(", ") ))?; client.execute(&format!( "INSERT INTO account VALUES {}", (1..=self.customers) .flat_map(|c| (1..=self.accounts).map(move |a| (c, (c - 1) * self.accounts + a))) .map(|(c, a)| format!("({}, {}, {})", a, c, self.balance)) .join(", ") ))?; client.execute("COMMIT")?; Ok(()) } fn generate(&self, rng: StdRng) -> Result + 'static> { let customers = self.customers; let max_transfer = self.max_transfer; // Generate random u64s, then pick random from,to,amount as the // remainder of the max customer and amount. Ok(rand::distr::Uniform::new_inclusive(0, u64::MAX)? 
.sample_iter(rng) .tuples() .map(move |(a, b, c)| (a % customers + 1, b % customers + 1, c % max_transfer + 1)) .filter(|(from, to, _)| from != to)) } fn execute(client: &mut Client, item: &Self::Item) -> Result<()> { let &(from, to, mut amount) = item; client.execute("BEGIN")?; let row: Row = client .execute(&format!( "SELECT a.id, a.balance FROM account a JOIN customer c ON a.customer_id = c.id WHERE c.id = {} ORDER BY a.balance DESC LIMIT 1", from ))? .try_into()?; let mut row = row.into_iter(); let from_account: i64 = row.next().unwrap().try_into()?; let from_balance: i64 = row.next().unwrap().try_into()?; amount = min(amount, from_balance as u64); let to_account: i64 = client .execute(&format!( "SELECT a.id, a.balance FROM account a JOIN customer c ON a.customer_id = c.id WHERE c.id = {} ORDER BY a.balance ASC LIMIT 1", to ))? .try_into()?; client.execute(&format!( "UPDATE account SET balance = balance - {} WHERE id = {}", amount, from_account, ))?; client.execute(&format!( "UPDATE account SET balance = balance + {} WHERE id = {}", amount, to_account, ))?; client.execute("COMMIT")?; Ok(()) } fn verify(&self, client: &mut Client, _: usize) -> Result<()> { let balance: i64 = client.execute("SELECT SUM(balance) FROM account")?.try_into()?; assert_eq!(balance as u64, self.customers * self.accounts * self.balance); let negative: i64 = client.execute("SELECT COUNT(*) FROM account WHERE balance < 0")?.try_into()?; assert_eq!(negative, 0); Ok(()) } } ================================================ FILE: src/client.rs ================================================ use std::io::{BufReader, BufWriter, Write as _}; use std::net::{TcpStream, ToSocketAddrs}; use std::time::Duration; use rand::RngExt as _; use crate::encoding::Value as _; use crate::errdata; use crate::error::{Error, Result}; use crate::server::{Request, Response, Status}; use crate::sql::execution::StatementResult; use crate::sql::types::Table; use crate::storage::mvcc; /// A toyDB client. 
Connects to a server via TCP and submits SQL statements and /// other requests. pub struct Client { /// Inbound response stream. reader: BufReader, /// Outbound request stream. writer: BufWriter, /// The current transaction, if any. txn: Option, } impl Client { /// Connects to a toyDB server, creating a new client. pub fn connect(addr: impl ToSocketAddrs) -> Result { let socket = TcpStream::connect(addr)?; let reader = BufReader::new(socket.try_clone()?); let writer = BufWriter::new(socket); Ok(Self { reader, writer, txn: None }) } /// Sends a request to the server, returning the response. fn request(&mut self, request: Request) -> Result { request.encode_into(&mut self.writer)?; self.writer.flush()?; Result::decode_from(&mut self.reader)? } /// Executes a SQL statement. pub fn execute(&mut self, statement: &str) -> Result { let result = match self.request(Request::Execute(statement.to_string()))? { Response::Execute(result) => result, response => return errdata!("unexpected response {response:?}"), }; // Update the transaction state. match &result { StatementResult::Begin(state) => self.txn = Some(state.clone()), StatementResult::Commit { .. } => self.txn = None, StatementResult::Rollback { .. } => self.txn = None, _ => {} } Ok(result) } /// Fetches a table schema. pub fn get_table(&mut self, table: &str) -> Result { match self.request(Request::GetTable(table.to_string()))? { Response::GetTable(table) => Ok(table), response => errdata!("unexpected response: {response:?}"), } } /// Lists database tables. pub fn list_tables(&mut self) -> Result> { match self.request(Request::ListTables)? { Response::ListTables(tables) => Ok(tables), response => errdata!("unexpected response: {response:?}"), } } /// Returns server status. pub fn status(&mut self) -> Result { match self.request(Request::Status)? { Response::Status(status) => Ok(status), response => errdata!("unexpected response: {response:?}"), } } /// Returns the transaction state. 
pub fn txn(&self) -> Option<&mvcc::TransactionState> { self.txn.as_ref() } /// Runs the given closure, automatically retrying serialization and abort /// errors. If a transaction is open following an error, it is automatically /// rolled back. It is the caller's responsibility to use a transaction in /// the closure where appropriate (i.e. when it is not idempotent). pub fn with_retry(&mut self, f: impl Fn(&mut Client) -> Result) -> Result { const MAX_RETRIES: u32 = 10; const MIN_WAIT: u64 = 10; const MAX_WAIT: u64 = 2_000; let mut retries: u32 = 0; loop { match f(self) { Ok(result) => return Ok(result), Err(Error::Serialization | Error::Abort) if retries < MAX_RETRIES => { if self.txn().is_some() { self.execute("ROLLBACK")?; } // Use exponential backoff starting at MIN_WAIT doubling up // to MAX_WAIT, but randomize the wait time in this interval // to reduce the chance of collisions. let mut wait = MAX_WAIT.min(MIN_WAIT * 2_u64.pow(retries)); wait = rand::rng().random_range(MIN_WAIT..=wait); std::thread::sleep(Duration::from_millis(wait)); retries += 1; } Err(error) => { if self.txn().is_some() { self.execute("ROLLBACK").ok(); // ignore rollback error } return Err(error); } } } } } ================================================ FILE: src/encoding/bincode.rs ================================================ //! Bincode is used to encode values, both in key/value stores and the toyDB //! network protocol. It is a Rust-specific encoding that depends on the //! internal data structures being stable, but it's sufficient for toyDB. See: //! //! //! This module wraps the [`bincode`] crate and uses the standard config. use std::io::{Read, Write}; use serde::de::DeserializeOwned; use serde::{Deserialize, Serialize}; use crate::error::{Error, Result}; /// Use the standard Bincode configuration. const CONFIG: bincode::config::Configuration = bincode::config::standard(); /// Serializes a value using Bincode. 
pub fn serialize<T: Serialize>(value: &T) -> Vec<u8> {
    // Panic on failure, as this is a problem with the data structure.
    bincode::serde::encode_to_vec(value, CONFIG).expect("value must be serializable")
}

/// Deserializes a value using Bincode. The value may borrow from the input.
pub fn deserialize<'de, T: Deserialize<'de>>(bytes: &'de [u8]) -> Result<T> {
    // The decoder also returns the number of bytes read, which is discarded.
    Ok(bincode::serde::borrow_decode_from_slice(bytes, CONFIG)?.0)
}

/// Serializes a value to a writer using Bincode.
pub fn serialize_into<W: Write, T: Serialize>(mut writer: W, value: &T) -> Result<()> {
    bincode::serde::encode_into_std_write(value, &mut writer, CONFIG)?;
    Ok(())
}

/// Deserializes a value from a reader using Bincode.
pub fn deserialize_from<R: Read, T: DeserializeOwned>(mut reader: R) -> Result<T> {
    Ok(bincode::serde::decode_from_std_read(&mut reader, CONFIG)?)
}

/// Deserializes a value from a reader using Bincode, or returns None if the
/// reader is closed.
pub fn maybe_deserialize_from<R: Read, T: DeserializeOwned>(mut reader: R) -> Result<Option<T>> {
    match bincode::serde::decode_from_std_read(&mut reader, CONFIG) {
        Ok(t) => Ok(Some(t)),
        // An EOF or connection reset is treated as a closed reader.
        Err(bincode::error::DecodeError::Io { inner, .. })
            if inner.kind() == std::io::ErrorKind::UnexpectedEof
                || inner.kind() == std::io::ErrorKind::ConnectionReset =>
        {
            Ok(None)
        }
        Err(err) => Err(Error::from(err)),
    }
}

================================================
FILE: src/encoding/format.rs
================================================
//! Formats raw keys and values, recursively where necessary. Handles Raft,
//! MVCC, SQL, and raw binary data.

use std::collections::BTreeSet;
use std::marker::PhantomData;

use itertools::Itertools as _;
use regex::Regex;

use super::{Key as _, Value as _, bincode};
use crate::raft;
use crate::sql;
use crate::storage::mvcc;

/// Formats encoded keys and values.
pub trait Formatter {
    /// Formats a key.
    fn key(key: &[u8]) -> String;

    /// Formats a value. Also takes the key to determine the kind of value.
    fn value(key: &[u8], value: &[u8]) -> String;

    /// Formats a key/value pair.
fn key_value(key: &[u8], value: &[u8]) -> String { Self::key_maybe_value(key, Some(value)) } /// Formats a key/value pair, where the value may not exist. fn key_maybe_value(key: &[u8], value: Option<&[u8]>) -> String { let fmtkey = Self::key(key); let fmtvalue = value.map_or("None".to_string(), |v| Self::value(key, v)); format!("{fmtkey} → {fmtvalue}") } } /// Formats raw byte slices without any decoding. pub struct Raw; impl Raw { /// Formats raw bytes as escaped ASCII strings. pub fn bytes(bytes: &[u8]) -> String { let escaped = bytes.iter().copied().flat_map(std::ascii::escape_default).collect_vec(); format!("\"{}\"", String::from_utf8_lossy(&escaped)) } } impl Formatter for Raw { fn key(key: &[u8]) -> String { Self::bytes(key) } fn value(_key: &[u8], value: &[u8]) -> String { Self::bytes(value) } } /// Formats Raft log entries. Dispatches to F to format each Raft command. pub struct Raft(PhantomData); impl Raft { /// Formats a Raft entry. pub fn entry(entry: &raft::Entry) -> String { let fmtcommand = entry.command.as_deref().map_or("None".to_string(), |c| F::value(&[], c)); format!("{}@{} {fmtcommand}", entry.index, entry.term) } } impl Formatter for Raft { fn key(key: &[u8]) -> String { let Ok(key) = raft::Key::decode(key) else { return Raw::key(key); // invalid key }; format!("raft:{key:?}") } fn value(key: &[u8], value: &[u8]) -> String { let Ok(key) = raft::Key::decode(key) else { return Raw::value(key, value); // invalid key }; match key { raft::Key::CommitIndex => { match bincode::deserialize::<(raft::Index, raft::Term)>(value) { Ok((index, term)) => format!("{index}@{term}"), Err(_) => Raw::bytes(value), } } raft::Key::TermVote => { match bincode::deserialize::<(raft::Term, Option)>(value) { Ok((term, vote)) => format!( "term={term} vote={}", vote.map_or("None".to_string(), |v| v.to_string()), ), Err(_) => Raw::bytes(value), } } raft::Key::Entry(_) => match bincode::deserialize::(value) { Ok(entry) => Self::entry(&entry), Err(_) => Raw::bytes(value), }, 
}
    }
}

/// Formats MVCC keys/values. Dispatches to F to format the inner key/value.
pub struct MVCC<F: Formatter>(PhantomData<F>);

impl<F: Formatter> Formatter for MVCC<F> {
    fn key(key: &[u8]) -> String {
        let Ok(key) = mvcc::Key::decode(key) else {
            return Raw::key(key); // invalid key
        };
        match key {
            mvcc::Key::TxnWrite(version, innerkey) => {
                format!("mvcc:TxnWrite({version}, {})", F::key(&innerkey))
            }
            mvcc::Key::Version(innerkey, version) => {
                format!("mvcc:Version({}, {version})", F::key(&innerkey))
            }
            mvcc::Key::Unversioned(innerkey) => {
                format!("mvcc:Unversioned({})", F::key(&innerkey))
            }
            mvcc::Key::NextVersion | mvcc::Key::TxnActive(_) | mvcc::Key::TxnActiveSnapshot(_) => {
                format!("mvcc:{key:?}")
            }
        }
    }

    fn value(key: &[u8], value: &[u8]) -> String {
        let Ok(key) = mvcc::Key::decode(key) else {
            return Raw::bytes(value); // invalid key
        };
        // Values that fail to decode fall back to raw byte formatting.
        match key {
            mvcc::Key::NextVersion => {
                let Ok(version) = bincode::deserialize::<mvcc::Version>(value) else {
                    return Raw::bytes(value);
                };
                version.to_string()
            }
            mvcc::Key::TxnActiveSnapshot(_) => {
                let Ok(active) = bincode::deserialize::<BTreeSet<mvcc::Version>>(value) else {
                    return Raw::bytes(value);
                };
                format!("{{{}}}", active.iter().join(","))
            }
            mvcc::Key::TxnActive(_) | mvcc::Key::TxnWrite(_, _) => Raw::bytes(value),
            // Versioned values are optional; None is rendered as `None`.
            mvcc::Key::Version(userkey, _) => match bincode::deserialize(value) {
                Ok(Some(value)) => F::value(&userkey, value),
                Ok(None) => "None".to_string(),
                Err(_) => Raw::bytes(value),
            },
            mvcc::Key::Unversioned(userkey) => F::value(&userkey, value),
        }
    }
}

/// Formats SQL keys/values.
pub struct SQL;

impl SQL {
    /// Formats a list of SQL values, comma-separated.
    fn values(values: impl IntoIterator<Item = sql::types::Value>) -> String {
        values.into_iter().join(",")
    }

    /// Formats a table schema on a single line.
    fn schema(table: sql::types::Table) -> String {
        // Compile the regex once rather than on every call.
        static NEWLINES: std::sync::LazyLock<Regex> =
            std::sync::LazyLock::new(|| Regex::new(r#"\n\s*"#).expect("invalid regex"));
        // Put it all on a single line.
        NEWLINES.replace_all(&table.to_string(), " ").into_owned()
    }
}

impl Formatter for SQL {
    fn key(key: &[u8]) -> String {
        // Special-case the Raft applied index key.
if key == sql::engine::Raft::APPLIED_INDEX_KEY { return String::from_utf8_lossy(key).into_owned(); } let Ok(key) = sql::engine::Key::decode(key) else { return Raw::key(key); // invalid key }; match key { sql::engine::Key::Table(name) => format!("sql:Table({name})"), sql::engine::Key::Index(table, column, value) => { format!("sql:Index({table}.{column}, {value})") } sql::engine::Key::Row(table, id) => { format!("sql:Row({table}, {id})") } } } fn value(key: &[u8], value: &[u8]) -> String { // Special-case the applied_index key. if key == sql::engine::Raft::APPLIED_INDEX_KEY && let Ok(applied_index) = bincode::deserialize::(value) { return applied_index.to_string(); } let Ok(key) = sql::engine::Key::decode(key) else { return Raw::key(value); }; match key { sql::engine::Key::Table(_) => { let Ok(table) = bincode::deserialize(value) else { return Raw::bytes(value); }; Self::schema(table) } sql::engine::Key::Row(_, _) => { let Ok(row) = bincode::deserialize::(value) else { return Raw::bytes(value); }; Self::values(row) } sql::engine::Key::Index(_, _, _) => { let Ok(index) = bincode::deserialize::>(value) else { return Raw::bytes(value); }; Self::values(index) } } } } /// Formats SQL Raft write commands, from the Raft log. pub struct SQLCommand; impl Formatter for SQLCommand { fn key(_key: &[u8]) -> String { // There is no key, since these are wrapped in a Raft log entry. panic!("SQL commands don't have a key"); } fn value(_key: &[u8], value: &[u8]) -> String { let Ok(write) = sql::engine::Write::decode(value) else { return Raw::bytes(value); }; let txn = match &write { sql::engine::Write::Begin => None, sql::engine::Write::Commit(txn) | sql::engine::Write::Rollback(txn) | sql::engine::Write::Delete { txn, .. } | sql::engine::Write::Insert { txn, .. } | sql::engine::Write::Update { txn, .. } | sql::engine::Write::CreateTable { txn, .. } | sql::engine::Write::DropTable { txn, .. 
} => Some(txn),
        };
        // Prefix read-write transactions with their version, e.g. "t3 ".
        let fmttxn = txn
            .filter(|t| !t.read_only)
            .map(|t| format!("t{} ", t.version))
            .unwrap_or_default();

        let fmtcommand = match write {
            sql::engine::Write::Begin => "BEGIN".to_string(),
            sql::engine::Write::Commit(_) => "COMMIT".to_string(),
            sql::engine::Write::Rollback(_) => "ROLLBACK".to_string(),
            sql::engine::Write::Delete { table, ids, .. } => {
                format!("DELETE {table} {}", ids.iter().join(","))
            }
            sql::engine::Write::Insert { table, rows, .. } => {
                format!(
                    "INSERT {table} {}",
                    rows.into_iter().map(|row| format!("({})", SQL::values(row))).join(" ")
                )
            }
            sql::engine::Write::Update { table, rows, .. } => format!(
                "UPDATE {table} {}",
                rows.into_iter().map(|(id, row)| format!("{id}→({})", SQL::values(row))).join(" ")
            ),
            sql::engine::Write::CreateTable { schema, .. } => SQL::schema(schema),
            sql::engine::Write::DropTable { table, .. } => format!("DROP TABLE {table}"),
        };
        format!("{fmttxn}{fmtcommand}")
    }
}

================================================
FILE: src/encoding/keycode.rs
================================================
//! Keycode is a lexicographical order-preserving binary encoding for use with
//! keys in key/value stores. It is designed for simplicity, not efficiency
//! (i.e. it does not use varints or other compression methods).
//!
//! Ordering is important because it allows limited scans across specific parts
//! of the keyspace, e.g. scanning an individual table or using an index range
//! predicate like `WHERE id < 100`. It also avoids sorting in some cases where
//! the keys are already in the desired order, e.g. in the Raft log.
//!
//! The encoding is not self-describing: the caller must provide a concrete type
//! to decode into, and the binary key must conform to its structure.
//!
//! Keycode supports a subset of primitive data types, encoded as follows:
//!
//! * [`bool`]: `0x00` for `false`, `0x01` for `true`.
//! * [`u64`]: big-endian binary representation.
* [`i64`]: big-endian binary, sign bit flipped. //! * [`f64`]: big-endian binary, sign bit flipped, all flipped if negative. //! * [`Vec`]: `0x00` escaped as `0x00ff`, terminated with `0x0000`. //! * [`String`]: like [`Vec`]. //! * Sequences: concatenation of contained elements, with no other structure. //! * Enum: the variant's index as [`u8`], then the content sequence. //! * [`crate::sql::types::Value`]: like any other enum. //! //! The canonical key representation is an enum. For example: //! //! ``` //! #[derive(Debug, Deserialize, Serialize)] //! enum Key { //! Foo, //! Bar(String), //! Baz(bool, u64, #[serde(with = "serde_bytes")] Vec), //! } //! ``` //! //! Unfortunately, byte strings such as `Vec` must be wrapped with //! [`serde_bytes::ByteBuf`] or use the `#[serde(with="serde_bytes")]` //! attribute. See . use std::ops::Bound; use itertools::Either; use serde::de::{ Deserialize, DeserializeSeed, EnumAccess, IntoDeserializer as _, SeqAccess, VariantAccess, Visitor, }; use serde::ser::{Impossible, Serialize, SerializeSeq, SerializeTuple, SerializeTupleVariant}; use crate::errdata; use crate::error::{Error, Result}; /// Serializes a key to a binary Keycode representation. /// /// In the common case, the encoded key is borrowed for a storage engine call /// and then thrown away. We could avoid a bunch of allocations by taking a /// reusable byte vector to encode into and return a reference to it, but we /// keep it simple. pub fn serialize(key: &T) -> Vec { let mut serializer = Serializer { output: Vec::new() }; // Panic on failure, as this is a problem with the data structure. key.serialize(&mut serializer).expect("key must be serializable"); serializer.output } /// Deserializes a key from a binary Keycode representation. 
pub fn deserialize<'a, T: Deserialize<'a>>(input: &'a [u8]) -> Result { let mut deserializer = Deserializer::from_bytes(input); let t = T::deserialize(&mut deserializer)?; if !deserializer.input.is_empty() { return errdata!( "unexpected trailing bytes {:x?} at end of key {input:x?}", deserializer.input, ); } Ok(t) } /// Generates a key range for a key prefix, used e.g. for prefix scans. /// /// The exclusive end bound is generated by adding 1 to the value of the last /// byte. If the last byte(s) is 0xff (so adding 1 would overflow), we instead /// find the latest non-0xff byte, increment that, and truncate the rest. If all /// bytes are 0xff, we scan to the end of the range, since there can't be other /// prefixes after it. pub fn prefix_range(prefix: &[u8]) -> (Bound>, Bound>) { let start = Bound::Included(prefix.to_vec()); let end = match prefix.iter().rposition(|&b| b != 0xff) { Some(i) => Bound::Excluded( prefix.iter().take(i).copied().chain(std::iter::once(prefix[i] + 1)).collect(), ), None => Bound::Unbounded, }; (start, end) } /// Serializes keys as binary byte vectors. struct Serializer { output: Vec, } impl serde::ser::Serializer for &mut Serializer { type Ok = (); type Error = Error; type SerializeSeq = Self; type SerializeTuple = Self; type SerializeTupleVariant = Self; type SerializeTupleStruct = Impossible<(), Error>; type SerializeMap = Impossible<(), Error>; type SerializeStruct = Impossible<(), Error>; type SerializeStructVariant = Impossible<(), Error>; /// bool simply uses 1 for true and 0 for false. fn serialize_bool(self, v: bool) -> Result<()> { self.output.push(if v { 1 } else { 0 }); Ok(()) } fn serialize_i8(self, _: i8) -> Result<()> { unimplemented!() } fn serialize_i16(self, _: i16) -> Result<()> { unimplemented!() } fn serialize_i32(self, _: i32) -> Result<()> { unimplemented!() } /// i64 uses the big-endian two's complement encoding, but flips the /// left-most sign bit such that negative numbers are ordered before /// positive numbers. 
/// /// The relative ordering of the remaining bits is already correct: -1, the /// largest negative integer, is encoded as 01111111...11111111, ordered /// after all other negative integers but before positive integers. fn serialize_i64(self, v: i64) -> Result<()> { let mut bytes = v.to_be_bytes(); bytes[0] ^= 1 << 7; // flip sign bit self.output.extend(bytes); Ok(()) } fn serialize_u8(self, _: u8) -> Result<()> { unimplemented!() } fn serialize_u16(self, _: u16) -> Result<()> { unimplemented!() } fn serialize_u32(self, _: u32) -> Result<()> { unimplemented!() } /// u64 simply uses the big-endian encoding. fn serialize_u64(self, v: u64) -> Result<()> { self.output.extend(v.to_be_bytes()); Ok(()) } fn serialize_f32(self, _: f32) -> Result<()> { unimplemented!() } /// f64 is encoded in big-endian IEEE 754 form, but it flips the sign bit to /// order positive numbers after negative numbers, and also flips all other /// bits for negative numbers to order them from smallest to largest. NaN is /// ordered at the end. fn serialize_f64(self, v: f64) -> Result<()> { let mut bytes = v.to_be_bytes(); match v.is_sign_negative() { false => bytes[0] ^= 1 << 7, // positive, flip sign bit true => bytes.iter_mut().for_each(|b| *b = !*b), // negative, flip all bits } self.output.extend(bytes); Ok(()) } fn serialize_char(self, _: char) -> Result<()> { unimplemented!() } // Strings are encoded like bytes. fn serialize_str(self, v: &str) -> Result<()> { self.serialize_bytes(v.as_bytes()) } // Byte slices are terminated by 0x0000, escaping 0x00 as 0x00ff. This // ensures that we can detect the end, and that for two overlapping slices, // the shorter one orders before the longer one. // // We can't use e.g. length prefix encoding, since it doesn't sort correctly. 
fn serialize_bytes(self, v: &[u8]) -> Result<()> {
        let bytes = v
            .iter()
            .flat_map(|&byte| match byte {
                0x00 => Either::Left([0x00, 0xff].into_iter()),
                byte => Either::Right([byte].into_iter()),
            })
            .chain([0x00, 0x00]);
        self.output.extend(bytes);
        Ok(())
    }

    fn serialize_none(self) -> Result<()> {
        unimplemented!()
    }

    fn serialize_some<T: Serialize + ?Sized>(self, _: &T) -> Result<()> {
        unimplemented!()
    }

    fn serialize_unit(self) -> Result<()> {
        unimplemented!()
    }

    fn serialize_unit_struct(self, _: &'static str) -> Result<()> {
        unimplemented!()
    }

    /// Enum variants are serialized using their index, as a single byte.
    /// Errors if the index doesn't fit in a byte.
    fn serialize_unit_variant(self, _: &'static str, index: u32, _: &'static str) -> Result<()> {
        self.output.push(index.try_into()?);
        Ok(())
    }

    fn serialize_newtype_struct<T: Serialize + ?Sized>(
        self,
        _: &'static str,
        _: &T,
    ) -> Result<()> {
        unimplemented!()
    }

    /// Newtype variants are serialized using the variant index and inner type.
    fn serialize_newtype_variant<T: Serialize + ?Sized>(
        self,
        name: &'static str,
        index: u32,
        variant: &'static str,
        value: &T,
    ) -> Result<()> {
        self.serialize_unit_variant(name, index, variant)?;
        value.serialize(self)
    }

    /// Sequences are serialized as the concatenation of the serialized elements.
    fn serialize_seq(self, _: Option<usize>) -> Result<Self::SerializeSeq> {
        Ok(self)
    }

    /// Tuples are serialized as the concatenation of the serialized elements.
    fn serialize_tuple(self, _: usize) -> Result<Self::SerializeTuple> {
        Ok(self)
    }

    fn serialize_tuple_struct(
        self,
        _: &'static str,
        _: usize,
    ) -> Result<Self::SerializeTupleStruct> {
        unimplemented!()
    }

    /// Tuple variants are serialized using the variant index and the
    /// concatenation of the serialized elements.
fn serialize_tuple_variant(
        self,
        name: &'static str,
        index: u32,
        variant: &'static str,
        _: usize,
    ) -> Result<Self::SerializeTupleVariant> {
        self.serialize_unit_variant(name, index, variant)?;
        Ok(self)
    }

    fn serialize_map(self, _: Option<usize>) -> Result<Self::SerializeMap> {
        unimplemented!()
    }

    fn serialize_struct(self, _: &'static str, _: usize) -> Result<Self::SerializeStruct> {
        unimplemented!()
    }

    fn serialize_struct_variant(
        self,
        _: &'static str,
        _: u32,
        _: &'static str,
        _: usize,
    ) -> Result<Self::SerializeStructVariant> {
        unimplemented!()
    }
}

/// Sequences simply concatenate the serialized elements, with no external structure.
impl SerializeSeq for &mut Serializer {
    type Ok = ();
    type Error = Error;

    fn serialize_element<T: Serialize + ?Sized>(&mut self, value: &T) -> Result<()> {
        value.serialize(&mut **self)
    }

    fn end(self) -> Result<()> {
        Ok(())
    }
}

/// Tuples, like sequences, simply concatenate the serialized elements.
impl SerializeTuple for &mut Serializer {
    type Ok = ();
    type Error = Error;

    fn serialize_element<T: Serialize + ?Sized>(&mut self, value: &T) -> Result<()> {
        value.serialize(&mut **self)
    }

    fn end(self) -> Result<()> {
        Ok(())
    }
}

/// Tuple variants, like tuples, simply concatenate the serialized elements.
/// The variant index has already been written by serialize_tuple_variant.
impl SerializeTupleVariant for &mut Serializer {
    type Ok = ();
    type Error = Error;

    fn serialize_field<T: Serialize + ?Sized>(&mut self, value: &T) -> Result<()> {
        value.serialize(&mut **self)
    }

    fn end(self) -> Result<()> {
        Ok(())
    }
}

/// Deserializes keys from byte slices into a given type. The format is not
/// self-describing, so the caller must provide a concrete type to deserialize
/// into.
pub struct Deserializer<'de> {
    /// The remaining, not yet consumed input.
    input: &'de [u8],
}

impl<'de> Deserializer<'de> {
    /// Creates a deserializer for a byte slice.
    pub fn from_bytes(input: &'de [u8]) -> Self {
        Deserializer { input }
    }

    /// Chops off and returns the next len bytes of the byte slice, or errors if
    /// there aren't enough bytes left.
fn take_bytes(&mut self, len: usize) -> Result<&[u8]> { if self.input.len() < len { return errdata!("insufficient bytes, expected {len} bytes for {:x?}", self.input); } let bytes = &self.input[..len]; self.input = &self.input[len..]; Ok(bytes) } /// Decodes and chops off the next encoded byte slice. fn decode_next_bytes(&mut self) -> Result> { let mut decoded = Vec::new(); let mut iter = self.input.iter().enumerate(); let taken = loop { match iter.next() { Some((_, 0x00)) => match iter.next() { Some((i, 0x00)) => break i + 1, // terminator Some((_, 0xff)) => decoded.push(0x00), // escaped 0x00 _ => return errdata!("invalid escape sequence"), }, Some((_, b)) => decoded.push(*b), None => return errdata!("unexpected end of input"), } }; self.input = &self.input[taken..]; Ok(decoded) } } /// For details on serialization formats, see Serializer. impl<'de> serde::de::Deserializer<'de> for &mut Deserializer<'de> { type Error = Error; fn deserialize_any>(self, _: V) -> Result { panic!("must provide type, Keycode is not self-describing") } fn deserialize_bool>(self, visitor: V) -> Result { visitor.visit_bool(match self.take_bytes(1)?[0] { 0x00 => false, 0x01 => true, b => return errdata!("invalid boolean value {b}"), }) } fn deserialize_i8>(self, _: V) -> Result { unimplemented!() } fn deserialize_i16>(self, _: V) -> Result { unimplemented!() } fn deserialize_i32>(self, _: V) -> Result { unimplemented!() } fn deserialize_i64>(self, visitor: V) -> Result { let mut bytes = self.take_bytes(8)?.to_vec(); bytes[0] ^= 1 << 7; // flip sign bit visitor.visit_i64(i64::from_be_bytes(bytes.as_slice().try_into()?)) } fn deserialize_u8>(self, _: V) -> Result { unimplemented!() } fn deserialize_u16>(self, _: V) -> Result { unimplemented!() } fn deserialize_u32>(self, _: V) -> Result { unimplemented!() } fn deserialize_u64>(self, visitor: V) -> Result { visitor.visit_u64(u64::from_be_bytes(self.take_bytes(8)?.try_into()?)) } fn deserialize_f32>(self, _: V) -> Result { unimplemented!() } 
fn deserialize_f64>(self, visitor: V) -> Result { let mut bytes = self.take_bytes(8)?.to_vec(); match bytes[0] >> 7 { 0 => bytes.iter_mut().for_each(|b| *b = !*b), // negative, flip all bits 1 => bytes[0] ^= 1 << 7, // positive, flip sign bit _ => panic!("bits can only be 0 or 1"), } visitor.visit_f64(f64::from_be_bytes(bytes.as_slice().try_into()?)) } fn deserialize_char>(self, _: V) -> Result { unimplemented!() } fn deserialize_str>(self, visitor: V) -> Result { let bytes = self.decode_next_bytes()?; visitor.visit_str(&String::from_utf8(bytes)?) } fn deserialize_string>(self, visitor: V) -> Result { let bytes = self.decode_next_bytes()?; visitor.visit_string(String::from_utf8(bytes)?) } fn deserialize_bytes>(self, visitor: V) -> Result { let bytes = self.decode_next_bytes()?; visitor.visit_bytes(&bytes) } fn deserialize_byte_buf>(self, visitor: V) -> Result { let bytes = self.decode_next_bytes()?; visitor.visit_byte_buf(bytes) } fn deserialize_option>(self, _: V) -> Result { unimplemented!() } fn deserialize_unit>(self, _: V) -> Result { unimplemented!() } fn deserialize_unit_struct>(self, _: &'static str, _: V) -> Result { unimplemented!() } fn deserialize_newtype_struct>( self, _: &'static str, _: V, ) -> Result { unimplemented!() } fn deserialize_seq>(self, visitor: V) -> Result { visitor.visit_seq(self) } fn deserialize_tuple>(self, _: usize, visitor: V) -> Result { visitor.visit_seq(self) } fn deserialize_tuple_struct>( self, _: &'static str, _: usize, _: V, ) -> Result { unimplemented!() } fn deserialize_map>(self, _: V) -> Result { unimplemented!() } fn deserialize_struct>( self, _: &'static str, _: &'static [&'static str], _: V, ) -> Result { unimplemented!() } fn deserialize_enum>( self, _: &'static str, _: &'static [&'static str], visitor: V, ) -> Result { visitor.visit_enum(self) } fn deserialize_identifier>(self, _: V) -> Result { unimplemented!() } fn deserialize_ignored_any>(self, _: V) -> Result { unimplemented!() } } /// Sequences are simply 
deserialized until the byte slice is exhausted. impl<'de> SeqAccess<'de> for Deserializer<'de> { type Error = Error; fn next_element_seed>(&mut self, seed: T) -> Result> { if self.input.is_empty() { return Ok(None); } seed.deserialize(self).map(Some) } } /// Enum variants are deserialized by their index. impl<'de> EnumAccess<'de> for &mut Deserializer<'de> { type Error = Error; type Variant = Self; fn variant_seed>(self, seed: V) -> Result<(V::Value, Self::Variant)> { let index = self.take_bytes(1)?[0] as u32; let value: Result<_> = seed.deserialize(index.into_deserializer()); Ok((value?, self)) } } /// Enum variant contents are deserialized as sequences. impl<'de> VariantAccess<'de> for &mut Deserializer<'de> { type Error = Error; fn unit_variant(self) -> Result<()> { Ok(()) } fn newtype_variant_seed>(self, seed: T) -> Result { seed.deserialize(&mut *self) } fn tuple_variant>(self, _: usize, visitor: V) -> Result { visitor.visit_seq(self) } fn struct_variant>(self, _: &'static [&'static str], _: V) -> Result { unimplemented!() } } #[cfg(test)] mod tests { use std::borrow::Cow; use std::f64::consts::PI; use paste::paste; use serde::{Deserialize, Serialize}; use serde_bytes::ByteBuf; use super::*; use crate::sql::types::Value; #[derive(Debug, Deserialize, Serialize, PartialEq)] enum Key<'a> { Unit, NewType(String), Tuple(bool, #[serde(with = "serde_bytes")] Vec, u64), Cow( #[serde(with = "serde_bytes")] #[serde(borrow)] Cow<'a, [u8]>, bool, #[serde(borrow)] Cow<'a, str>, ), } /// Assert that serializing a value yields the expected byte sequence (as a /// hex-encoded string), and that deserializing it yields the original value. macro_rules! 
test_serialize_deserialize {
        ( $( $name:ident: $input:expr => $expect:literal, )* ) => {
        $(
            #[test]
            fn $name() -> Result<()> {
                let mut input = $input;
                let expect = $expect;
                let output = serialize(&input);
                assert_eq!(hex::encode(&output), expect, "encode failed");

                let expect = input;
                input = deserialize(&output)?; // reuse input variable for proper type
                assert_eq!(input, expect, "decode failed");
                Ok(())
            }
        )*
        };
    }

    /// Assert that deserializing invalid inputs results in errors. Takes byte
    /// slices (as hex-encoded strings) and the type to deserialize into.
    /// The generated tests are named `<name>_deserialize_error` and expect a
    /// panic, which the `unwrap()` of the error (or an unimplemented
    /// deserializer method) provides.
    macro_rules! test_deserialize_error {
        ( $( $name:ident: $input:literal as $type:ty, )* ) => {
        paste! {
            $(
                #[test]
                #[should_panic]
                fn [< $name _deserialize_error >]() {
                    let bytes = hex::decode($input).unwrap();
                    deserialize::<$type>(&bytes).unwrap();
                }
            )*
        }
        };
    }

    // Assert that serializing a value results in an error. The generated tests
    // are named `<name>_serialize_error` and expect `serialize` to panic.
    macro_rules! test_serialize_error {
        ( $( $name:ident: $input:expr, )* ) => {
        paste! {
            $(
                #[test]
                #[should_panic]
                fn [< $name _serialize_error >]() {
                    let input = $input;
                    serialize(&input);
                }
            )*
        }
        };
    }

    test_serialize_deserialize! {
        bool_false: false => "00",
        bool_true: true => "01",

        f64_min: f64::MIN => "0010000000000000",
        f64_neg_inf: f64::NEG_INFINITY => "000fffffffffffff",
        f64_neg_pi: -PI => "3ff6de04abbbd2e7",
        f64_neg_zero: -0f64 => "7fffffffffffffff",
        f64_zero: 0f64 => "8000000000000000",
        f64_pi: PI => "c00921fb54442d18",
        f64_max: f64::MAX => "ffefffffffffffff",
        f64_inf: f64::INFINITY => "fff0000000000000",
        // We don't test NAN here, since NAN != NAN.
i64_min: i64::MIN => "0000000000000000",
        i64_neg_65535: -65535i64 => "7fffffffffff0001",
        i64_neg_1: -1i64 => "7fffffffffffffff",
        i64_0: 0i64 => "8000000000000000",
        i64_1: 1i64 => "8000000000000001",
        i64_65535: 65535i64 => "800000000000ffff",
        i64_max: i64::MAX => "ffffffffffffffff",

        u64_min: u64::MIN => "0000000000000000",
        u64_1: 1_u64 => "0000000000000001",
        u64_65535: 65535_u64 => "000000000000ffff",
        u64_max: u64::MAX => "ffffffffffffffff",

        bytes: ByteBuf::from(vec![0x01, 0xff]) => "01ff0000",
        bytes_empty: ByteBuf::new() => "0000",
        bytes_escape: ByteBuf::from(vec![0x00, 0x01, 0x02]) => "00ff01020000",

        string: "foo".to_string() => "666f6f0000",
        string_empty: "".to_string() => "0000",
        string_escape: "foo\x00bar".to_string() => "666f6f00ff6261720000",
        string_utf8: "👋".to_string() => "f09f918b0000",

        tuple: (true, u64::MAX, ByteBuf::from(vec![0x00, 0x01])) => "01ffffffffffffffff00ff010000",
        array_bool: [false, true, false] => "000100",
        vec_bool: vec![false, true, false] => "000100",
        vec_u64: vec![u64::MIN, u64::MAX, 65535_u64] => "0000000000000000ffffffffffffffff000000000000ffff",

        enum_unit: Key::Unit => "00",
        enum_newtype: Key::NewType("foo".to_string()) => "01666f6f0000",
        enum_tuple: Key::Tuple(false, vec![0x00, 0x01], u64::MAX) => "020000ff010000ffffffffffffffff",
        enum_cow: Key::Cow(vec![0x00, 0x01].into(), false, String::from("foo").into()) => "0300ff01000000666f6f0000",
        enum_cow_borrow: Key::Cow([0x00, 0x01].as_slice().into(), false, "foo".into()) => "0300ff01000000666f6f0000",

        value_null: Value::Null => "00",
        value_bool: Value::Boolean(true) => "0101",
        value_int: Value::Integer(-1) => "027fffffffffffffff",
        value_float: Value::Float(PI) => "03c00921fb54442d18",
        value_string: Value::String("foo".to_string()) => "04666f6f0000",
    }

    // Types that Keycode does not support must fail to serialize.
    test_serialize_error! {
        char: 'a',
        f32: 0f32,
        i8: 0i8,
        i16: 0i16,
        i32: 0i32,
        i128: 0i128,
        u8: 0u8,
        u16: 0u16,
        u32: 0u32,
        u128: 0u128,
        some: Some(true),
        none: Option::<bool>::None,
        vec_u8: vec![0u8],
    }

    test_deserialize_error!
{
        bool_empty: "" as bool,
        bool_2: "02" as bool,
        char: "61" as char,
        f32: "00000000" as f32,
        i8: "00" as i8,
        i16: "0000" as i16,
        i32: "00000000" as i32,
        i128: "00000000000000000000000000000000" as i128,
        u16: "0000" as u16,
        u32: "00000000" as u32,
        u64_partial: "0000" as u64,
        u128: "00000000000000000000000000000000" as u128,
        option: "00" as Option::<bool>,
        string_utf8_invalid: "c0" as String,
        tuple_partial: "0001" as (bool, bool, bool),
        vec_u8: "0000" as Vec<u8>,
    }
}

================================================
FILE: src/encoding/mod.rs
================================================
//! Binary data encodings.
//!
//! * keycode: used for keys in the key/value store.
//! * bincode: used for values in the key/value store and network protocols.

pub mod bincode;
pub mod format;
pub mod keycode;

use std::cmp::{Eq, Ord};
use std::collections::{BTreeSet, HashSet};
use std::hash::Hash;
use std::io::{Read, Write};

use serde::de::DeserializeOwned;
use serde::{Deserialize, Serialize};

use crate::error::Result;

/// Adds automatic Keycode encode/decode methods to key enums. These are used
/// as keys in the key/value store.
pub trait Key<'de>: Serialize + Deserialize<'de> {
    /// Decodes a key from a byte slice using Keycode.
    fn decode(bytes: &'de [u8]) -> Result<Self> {
        keycode::deserialize(bytes)
    }

    /// Encodes a key to a byte vector using Keycode.
    ///
    /// In the common case, the encoded key is borrowed for a storage engine
    /// call and then thrown away. We could avoid a bunch of allocations by
    /// taking a reusable byte vector to encode into and return a reference to
    /// it, but we keep it simple.
    fn encode(&self) -> Vec<u8> {
        keycode::serialize(self)
    }
}

/// Adds automatic Bincode encode/decode methods to value types. These are used
/// for values in key/value storage engines, and also for e.g. network protocol
/// messages and other values.
pub trait Value: Serialize + DeserializeOwned {
    /// Decodes a value from a byte slice using Bincode.
fn decode(bytes: &[u8]) -> Result { bincode::deserialize(bytes) } /// Decodes a value from a reader using Bincode. fn decode_from(reader: R) -> Result { bincode::deserialize_from(reader) } /// Decodes a value from a reader using Bincode, or returns None if the /// reader is closed. fn maybe_decode_from(reader: R) -> Result> { bincode::maybe_deserialize_from(reader) } /// Encodes a value to a byte vector using Bincode. fn encode(&self) -> Vec { bincode::serialize(self) } /// Encodes a value into a writer using Bincode. fn encode_into(&self, writer: W) -> Result<()> { bincode::serialize_into(writer, self) } } /// Blanket implementations for various types wrapping a value type. impl Value for Option {} impl Value for Result {} impl Value for Vec {} impl Value for (V1, V2) {} impl Value for HashSet {} impl Value for BTreeSet {} ================================================ FILE: src/error.rs ================================================ use std::fmt::Display; use serde::{Deserialize, Serialize}; /// toyDB errors. #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] pub enum Error { /// The operation was aborted and must be retried. This typically happens /// with e.g. Raft leader changes. This is used instead of implementing /// complex retry logic and replay protection in Raft. Abort, /// Invalid data, typically decoding errors or unexpected internal values. InvalidData(String), /// Invalid user input, typically parser or query errors. InvalidInput(String), /// An IO error. IO(String), /// A write was attempted in a read-only transaction. ReadOnly, /// A write transaction conflicted with a different writer and lost. The /// transaction must be retried. 
Serialization, } impl std::error::Error for Error {} impl Display for Error { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { match self { Error::Abort => write!(f, "operation aborted"), Error::InvalidData(msg) => write!(f, "invalid data: {msg}"), Error::InvalidInput(msg) => write!(f, "invalid input: {msg}"), Error::IO(msg) => write!(f, "io error: {msg}"), Error::ReadOnly => write!(f, "read-only transaction"), Error::Serialization => write!(f, "serialization failure, retry transaction"), } } } impl Error { /// Returns whether the error is considered deterministic. Raft state /// machine application needs to know whether a command failure is /// deterministic on the input command -- if it is, the command can be /// considered applied and the error returned to the client, but otherwise /// the state machine must panic to prevent node divergence. pub fn is_deterministic(&self) -> bool { match self { // Aborts don't happen during application, only leader changes. But // we consider them non-deterministic in case an abort should happen // unexpectedly below Raft. Error::Abort => false, // Possible data corruption local to this node. Error::InvalidData(_) => false, // Input errors are (likely) deterministic. They might not be in // case data was corrupted in flight, but we ignore this case. Error::InvalidInput(_) => true, // IO errors are typically local to the node (e.g. faulty disk). Error::IO(_) => false, // Write commands in read-only transactions are deterministic. Error::ReadOnly => true, // Write conflicts are deterministic. Error::Serialization => true, } } } /// Constructs an Error::InvalidData for the given format string. #[macro_export] macro_rules! errdata { ($($args:tt)*) => { $crate::error::Error::InvalidData(format!($($args)*)).into() }; } /// Constructs an Error::InvalidInput for the given format string. #[macro_export] macro_rules! 
errinput { ($($args:tt)*) => { $crate::error::Error::InvalidInput(format!($($args)*)).into() }; } /// A toyDB Result returning Error. pub type Result = std::result::Result; impl From for Result { fn from(error: Error) -> Self { Err(error) } } impl serde::de::Error for Error { fn custom(msg: T) -> Self { Error::InvalidData(msg.to_string()) } } impl serde::ser::Error for Error { fn custom(msg: T) -> Self { Error::InvalidData(msg.to_string()) } } impl From for Error { fn from(err: bincode::error::DecodeError) -> Self { Error::InvalidData(err.to_string()) } } impl From for Error { fn from(err: bincode::error::EncodeError) -> Self { Error::InvalidData(err.to_string()) } } impl From for Error { fn from(err: config::ConfigError) -> Self { Error::InvalidInput(err.to_string()) } } impl From for Error { fn from(err: crossbeam::channel::RecvError) -> Self { Error::IO(err.to_string()) } } impl From> for Error { fn from(err: crossbeam::channel::SendError) -> Self { Error::IO(err.to_string()) } } impl From for Error { fn from(err: crossbeam::channel::TryRecvError) -> Self { Error::IO(err.to_string()) } } impl From> for Error { fn from(err: crossbeam::channel::TrySendError) -> Self { Error::IO(err.to_string()) } } impl From for Error { fn from(err: hdrhistogram::CreationError) -> Self { panic!("{err}") // faulty code } } impl From for Error { fn from(err: hdrhistogram::RecordError) -> Self { Error::InvalidInput(err.to_string()) } } impl From for Error { fn from(err: log::ParseLevelError) -> Self { Error::InvalidInput(err.to_string()) } } impl From for Error { fn from(err: log::SetLoggerError) -> Self { panic!("{err}") // faulty code } } impl From for Error { fn from(err: rand::distr::uniform::Error) -> Self { Error::InvalidInput(err.to_string()) } } impl From for Error { fn from(err: regex::Error) -> Self { panic!("{err}") // faulty code } } impl From for Error { fn from(err: rustyline::error::ReadlineError) -> Self { Error::IO(err.to_string()) } } impl From for Error { fn 
from(err: std::array::TryFromSliceError) -> Self { Error::InvalidData(err.to_string()) } } impl From for Error { fn from(err: std::io::Error) -> Self { Error::IO(err.to_string()) } } impl From for Error { fn from(err: std::num::ParseFloatError) -> Self { Error::InvalidInput(err.to_string()) } } impl From for Error { fn from(err: std::num::ParseIntError) -> Self { Error::InvalidInput(err.to_string()) } } impl From for Error { fn from(err: std::num::TryFromIntError) -> Self { Error::InvalidData(err.to_string()) } } impl From for Error { fn from(err: std::string::FromUtf8Error) -> Self { Error::InvalidData(err.to_string()) } } impl From> for Error { fn from(err: std::sync::PoisonError) -> Self { // This only happens when a different thread panics while holding a // mutex. This should be fatal, so we panic here too. panic!("{err}") } } ================================================ FILE: src/lib.rs ================================================ #![warn(clippy::all)] #![allow(clippy::large_enum_variant)] #![allow(clippy::module_inception)] #![allow(clippy::type_complexity)] pub mod client; pub mod encoding; pub mod error; pub mod raft; pub mod server; pub mod sql; pub mod storage; pub use client::Client; pub use server::Server; pub use sql::execution::StatementResult; ================================================ FILE: src/raft/log.rs ================================================ use std::ops::{Bound, RangeBounds}; use serde::{Deserialize, Serialize}; use super::{NodeID, Term}; use crate::encoding::{self, Key as _, Value as _, bincode}; use crate::error::Result; use crate::storage; /// A log index (entry position). Starts at 1. 0 indicates no index. pub type Index = u64; /// A log entry containing a state machine command. #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] pub struct Entry { /// The entry index. /// /// We could omit the index in the encoded value, since it's also stored in /// the key, but we keep it simple. 
pub index: Index, /// The term in which the entry was added. pub term: Term, /// The state machine command. None (noop) commands are used during leader /// election to commit old entries, see section 5.4.2 in the Raft paper. pub command: Option>, } impl encoding::Value for Entry {} /// A log storage key. #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] pub enum Key { /// A log entry, storing the term and command. Entry(Index), /// Stores the current term and vote (if any). TermVote, /// Stores the current commit index (if any). CommitIndex, } impl encoding::Key<'_> for Key {} /// The Raft log stores a sequence of arbitrary commands (typically writes) that /// are replicated across nodes and applied sequentially to the local state /// machine. Each entry contains an index, command, and the term in which the /// leader proposed it. Commands may be noops (None), which are added when a /// leader is elected (see section 5.4.2 in the Raft paper). For example: /// /// Index | Term | Command /// ------|------|------------------------------------------------------ /// 1 | 1 | None /// 2 | 1 | CREATE TABLE table (id INT PRIMARY KEY, value STRING) /// 3 | 1 | INSERT INTO table VALUES (1, 'foo') /// 4 | 2 | None /// 5 | 2 | UPDATE table SET value = 'bar' WHERE id = 1 /// 6 | 2 | DELETE FROM table WHERE id = 1 /// /// Note that this is for illustration only, and the actual toyDB Raft commands /// are not SQL statements but lower-level write operations. /// /// A key/value store is used to store the log entries on disk, keyed by index, /// along with a few other metadata keys (e.g. who we voted for in this term). /// /// In the steady state, the log is append-only: when a client submits a /// command, the leader appends it to its own log (via [`Log::append`]) and /// replicates it to followers who append it to their logs (via /// [`Log::splice`]). 
When an index has been replicated to a majority of nodes /// it becomes committed, making the log immutable up to that index and /// guaranteeing that all nodes will eventually contain it. Nodes keep track of /// the commit index via [`Log::commit`] and apply committed commands to the /// state machine. /// /// However, uncommitted entries can be replaced or removed. A leader may append /// entries to its log, but then be unable to reach consensus on them (e.g. /// because it is unable to communicate with a majority of nodes). If a /// different leader is elected and writes different commands to those same /// indexes, then the uncommitted entries will be replaced with entries from the /// new leader once the old leader (or a follower) discovers it. /// /// The Raft log has the following invariants: /// /// * Entry indexes are contiguous starting at 1 (no index gaps). /// * Entry terms never decrease from the previous entry. /// * Entry terms are at or below the current term. /// * Appended entries are durable (flushed to disk). /// * Appended entries use the current term. /// * Committed entries are never changed or removed (no log truncation). /// * Committed entries will eventually be replicated to all nodes. /// * Entries with the same index/term contain the same command. /// * If two logs contain a matching index/term, all previous entries /// are identical (see section 5.3 in the Raft paper). pub struct Log { /// The underlying storage engine. Uses a trait object instead of generics, /// to allow runtime selection of the engine and avoid propagating the /// generic type parameters throughout Raft. pub engine: Box, /// The current term. term: Term, /// Our leader vote in the current term, if any. vote: Option, /// The index of the last stored entry. last_index: Index, /// The term of the last stored entry. last_term: Term, /// The index of the last committed entry. commit_index: Index, /// The term of the last committed entry. 
commit_term: Term, /// If true, fsync entries to disk when appended. This is mandated by Raft, /// but comes with a hefty performance penalty (especially since we don't /// optimize for it by batching entries before fsyncing). Disabling it will /// yield much better write performance, but may lose data on crashes, which /// in some scenarios can cause log entries to become "uncommitted" and /// state machines to diverge. fsync: bool, } impl Log { /// Initializes a log using the given storage engine. pub fn new(mut engine: Box) -> Result { // Load some initial in-memory state from disk. let (term, vote) = engine .get(&Key::TermVote.encode())? .map(|v| bincode::deserialize(&v)) .transpose()? .unwrap_or((0, None)); let (last_index, last_term) = engine .scan_dyn(( Bound::Included(Key::Entry(0).encode()), Bound::Included(Key::Entry(u64::MAX).encode()), )) .last() .transpose()? .map(|(_, v)| Entry::decode(&v)) .transpose()? .map(|e| (e.index, e.term)) .unwrap_or((0, 0)); let (commit_index, commit_term) = engine .get(&Key::CommitIndex.encode())? .map(|v| bincode::deserialize(&v)) .transpose()? .unwrap_or((0, 0)); let fsync = true; // fsync by default Ok(Self { engine, term, vote, last_index, last_term, commit_index, commit_term, fsync }) } /// Controls whether to fsync writes. Disabling this may violate Raft /// guarantees, see comment on fsync attribute. pub fn enable_fsync(&mut self, fsync: bool) { self.fsync = fsync } /// Returns the commit index and term. pub fn get_commit_index(&self) -> (Index, Term) { (self.commit_index, self.commit_term) } /// Returns the last log index and term. pub fn get_last_index(&self) -> (Index, Term) { (self.last_index, self.last_term) } /// Returns the current term (0 if none) and vote. pub fn get_term_vote(&self) -> (Term, Option) { (self.term, self.vote) } /// Stores the current term and cast vote (if any). Enforces that the term /// does not regress, and that we only vote for one node in a term. 
append() /// will use this term, and splice() can't write entries beyond it. pub fn set_term_vote(&mut self, term: Term, vote: Option) -> Result<()> { assert!(term > 0, "can't set term 0"); assert!(term >= self.term, "term regression {} → {}", self.term, term); assert!(term > self.term || self.vote.is_none() || vote == self.vote, "can't change vote"); if term == self.term && vote == self.vote { return Ok(()); } self.engine.set(&Key::TermVote.encode(), bincode::serialize(&(term, vote)))?; // Always fsync, even with Log::fsync = false. Term changes are rare, so // this doesn't materially affect performance, and double voting could // lead to multiple leaders and split brain which is really bad. self.engine.flush()?; self.term = term; self.vote = vote; Ok(()) } /// Appends a command to the log at the current term, and flushes it to /// disk, returning its index. None implies a noop command, typically after /// Raft leader changes. pub fn append(&mut self, command: Option>) -> Result { assert!(self.term > 0, "can't append entry in term 0"); let entry = Entry { index: self.last_index + 1, term: self.term, command }; self.engine.set(&Key::Entry(entry.index).encode(), entry.encode())?; if self.fsync { self.engine.flush()?; } self.last_index = entry.index; self.last_term = entry.term; Ok(entry.index) } /// Commits entries up to and including the given index. The index must /// exist and be at or after the current commit index. pub fn commit(&mut self, index: Index) -> Result { let term = match self.get(index)? 
{ Some(entry) if entry.index < self.commit_index => { panic!("commit index regression {} → {}", self.commit_index, entry.index); } Some(entry) if entry.index == self.commit_index => return Ok(index), Some(entry) => entry.term, None => panic!("commit index {index} does not exist"), }; self.engine.set(&Key::CommitIndex.encode(), bincode::serialize(&(index, term)))?; // NB: the commit index doesn't need to be fsynced, since the entries // are fsynced and the commit index can be recovered from the quorum. self.commit_index = index; self.commit_term = term; Ok(index) } /// Fetches an entry at an index, or None if it does not exist. pub fn get(&mut self, index: Index) -> Result> { self.engine.get(&Key::Entry(index).encode())?.map(|v| Entry::decode(&v)).transpose() } /// Checks if the log contains an entry with the given index and term. pub fn has(&mut self, index: Index, term: Term) -> Result { // Fast path: check against last_index. This is the common case when // followers process appends or heartbeats. if index == 0 || index > self.last_index { return Ok(false); } if (index, term) == (self.last_index, self.last_term) { return Ok(true); } Ok(self.get(index)?.map(|e| e.term == term).unwrap_or(false)) } /// Returns an iterator over log entries in the given index range. 
pub fn scan(&mut self, range: impl RangeBounds) -> Iterator<'_> { let from = match range.start_bound() { Bound::Excluded(&index) => Bound::Excluded(Key::Entry(index).encode()), Bound::Included(&index) => Bound::Included(Key::Entry(index).encode()), Bound::Unbounded => Bound::Included(Key::Entry(0).encode()), }; let to = match range.end_bound() { Bound::Excluded(&index) => Bound::Excluded(Key::Entry(index).encode()), Bound::Included(&index) => Bound::Included(Key::Entry(index).encode()), Bound::Unbounded => Bound::Included(Key::Entry(Index::MAX).encode()), }; Iterator::new(self.engine.scan_dyn((from, to))) } /// Returns an iterator over entries that are ready to apply, starting after /// the current applied index up to the commit index. pub fn scan_apply(&mut self, applied_index: Index) -> Iterator<'_> { // NB: we don't assert that commit_index >= applied_index, because the // local commit index is not flushed to durable storage -- if lost on // restart, it can be recovered from the logs of a quorum. if applied_index >= self.commit_index { return Iterator::new(Box::new(std::iter::empty())); } self.scan(applied_index + 1..=self.commit_index) } /// Splices a set of entries into the log and flushes it to disk. New /// indexes will be appended. Overlapping indexes with the same term must be /// equal and will be ignored. Overlapping indexes with different terms will /// truncate the existing log at the first conflict and then splice the new /// entries. /// /// The entries must have contiguous indexes and equal/increasing terms, and /// the first entry must be in the range [1,last_index+1] with a term at or /// above the previous (base) entry's term and at or below the current term. pub fn splice(&mut self, entries: Vec) -> Result { let (Some(first), Some(last)) = (entries.first(), entries.last()) else { return Ok(self.last_index); // empty input is noop }; // Check that the entries are well-formed. 
assert!(first.index > 0 && first.term > 0, "spliced entry has index or term 0",); assert!( entries.windows(2).all(|w| w[0].index + 1 == w[1].index), "spliced entries are not contiguous" ); assert!( entries.windows(2).all(|w| w[0].term <= w[1].term), "spliced entries have term regression", ); // Check that the entries connect to the existing log (if any), and that the // term doesn't regress. assert!(last.term <= self.term, "splice term {} beyond current {}", last.term, self.term); match self.get(first.index - 1)? { Some(base) if first.term < base.term => { panic!("splice term regression {} → {}", base.term, first.term) } Some(_) => {} None if first.index == 1 => {} None => panic!("first index {} must touch existing log", first.index), } // Skip entries that are already in the log. let mut entries = entries.as_slice(); let mut scan = self.scan(first.index..=last.index); while let Some(entry) = scan.next().transpose()? { // [0] is ok, because the scan has the same size as entries. assert!(entry.index == entries[0].index, "index mismatch at {entry:?}"); if entry.term != entries[0].term { break; } assert!(entry.command == entries[0].command, "command mismatch at {entry:?}"); entries = &entries[1..]; } drop(scan); // If all entries already exist then we're done. let Some(first) = entries.first() else { return Ok(self.last_index); }; // Write the entries that weren't already in the log, and remove the // tail of the old log if any. We can't write below the commit index, // since these entries must be immutable. assert!(first.index > self.commit_index, "spliced entries below commit index"); for entry in entries { self.engine.set(&Key::Entry(entry.index).encode(), entry.encode())?; } for index in last.index + 1..=self.last_index { self.engine.delete(&Key::Entry(index).encode())?; } if self.fsync { self.engine.flush()?; } self.last_index = last.index; self.last_term = last.term; Ok(self.last_index) } /// Returns log engine status. 
pub fn status(&mut self) -> Result { self.engine.status() } } /// A log entry iterator. pub struct Iterator<'a> { inner: Box, } impl<'a> Iterator<'a> { fn new(inner: Box) -> Self { Self { inner } } } impl std::iter::Iterator for Iterator<'_> { type Item = Result; fn next(&mut self) -> Option { self.inner.next().map(|r| r.and_then(|(_, v)| Entry::decode(&v))) } } /// Most Raft tests are Goldenscripts under src/raft/testscripts. #[cfg(test)] mod tests { use std::error::Error; use std::fmt::Write as _; use std::result::Result; use crossbeam::channel::Receiver; use itertools::Itertools as _; use regex::Regex; use tempfile::TempDir; use test_each_file::test_each_path; use super::*; use crate::encoding::format::{self, Formatter as _}; use crate::storage::engine::test as testengine; // Run goldenscript tests in src/raft/testscripts/log. test_each_path! { in "src/raft/testscripts/log" as scripts => test_goldenscript } fn test_goldenscript(path: &std::path::Path) { goldenscript::run(&mut TestRunner::new(), path).expect("goldenscript failed") } /// Runs Raft log goldenscript tests. For available commands, see run(). struct TestRunner { log: Log, op_rx: Receiver, #[allow(dead_code)] tempdir: TempDir, } impl TestRunner { fn new() -> Self { // Use both a BitCask and a Memory engine, and mirror operations // across them. Emit write events to op_tx. let (op_tx, op_rx) = crossbeam::channel::unbounded(); let tempdir = TempDir::with_prefix("toydb").expect("tempdir failed"); let bitcask = storage::BitCask::new(tempdir.path().join("bitcask")).expect("bitcask failed"); let memory = storage::Memory::new(); let engine = testengine::Emit::new(testengine::Mirror::new(bitcask, memory), op_tx); let log = Log::new(Box::new(engine)).expect("log failed"); Self { log, op_rx, tempdir } } /// Parses an index@term pair. 
fn parse_index_term(s: &str) -> Result<(Index, Term), Box> { let re = Regex::new(r"^(\d+)@(\d+)$").expect("invalid regex"); let groups = re.captures(s).ok_or_else(|| format!("invalid index/term {s}"))?; let index = groups.get(1).unwrap().as_str().parse()?; let term = groups.get(2).unwrap().as_str().parse()?; Ok((index, term)) } /// Parses an index range, in Rust range syntax (e.g. `..`, `2..`, `..3`, /// `2..=4`). A given start is inclusive; a given end is inclusive only /// with `..=`; omitted bounds are unbounded. The pattern is anchored at /// both ends, so input with trailing garbage is rejected as an invalid /// range rather than being silently truncated. fn parse_index_range(s: &str) -> Result, Box> { use std::ops::Bound; let mut bound = (Bound::::Unbounded, Bound::::Unbounded); let re = Regex::new(r"^(\d+)?\.\.(=)?(\d+)?$").expect("invalid regex"); let groups = re.captures(s).ok_or_else(|| format!("invalid range {s}"))?; if let Some(start) = groups.get(1) { bound.0 = Bound::Included(start.as_str().parse()?); } if let Some(end) = groups.get(3) { let end = end.as_str().parse()?; if groups.get(2).is_some() { bound.1 = Bound::Included(end) } else { bound.1 = Bound::Excluded(end) } } Ok(bound) } } impl goldenscript::Runner for TestRunner { fn run(&mut self, command: &goldenscript::Command) -> Result> { let mut output = String::new(); let mut tags = command.tags.clone(); match command.name.as_str() { // append [COMMAND] "append" => { let mut args = command.consume_args(); let command = args.next_pos().map(|a| a.value.as_bytes().to_vec()); args.reject_rest()?; let index = self.log.append(command)?; let entry = self.log.get(index)?.expect("entry not found"); let fmtentry = format::Raft::::entry(&entry); writeln!(output, "append → {fmtentry}")?; } // commit INDEX "commit" => { let mut args = command.consume_args(); let index = args.next_pos().ok_or("index not given")?.parse()?; args.reject_rest()?; let index = self.log.commit(index)?; let entry = self.log.get(index)?.expect("entry not found"); let fmtentry = format::Raft::::entry(&entry); writeln!(output, "commit → {fmtentry}")?; } // dump "dump" => { command.consume_args().reject_rest()?; let range = (std::ops::Bound::Unbounded, std::ops::Bound::Unbounded); let mut scan = 
self.log.engine.scan_dyn(range); while let Some((key, value)) = scan.next().transpose()? { let fmtkv = format::Raft::::key_value(&key, &value); let rawkv = format::Raw::key_value(&key, &value); writeln!(output, "{fmtkv} [{rawkv}]")?; } } // get INDEX... "get" => { let mut args = command.consume_args(); let indexes: Vec = args.rest_pos().iter().map(|a| a.parse()).try_collect()?; args.reject_rest()?; for index in indexes { let entry = self.log.get(index)?; let fmtentry = entry .as_ref() .map(format::Raft::::entry) .unwrap_or("None".to_string()); writeln!(output, "{fmtentry}")?; } } // get_term "get_term" => { command.consume_args().reject_rest()?; let (term, vote) = self.log.get_term_vote(); let vote = vote.map(|v| v.to_string()).unwrap_or("None".to_string()); writeln!(output, "term={term} vote={vote}")?; } // has INDEX@TERM... "has" => { let mut args = command.consume_args(); let indexes: Vec<(Index, Term)> = args .rest_pos() .iter() .map(|a| Self::parse_index_term(&a.value)) .try_collect()?; args.reject_rest()?; for (index, term) in indexes { let has = self.log.has(index, term)?; writeln!(output, "{has}")?; } } // reload "reload" => { command.consume_args().reject_rest()?; // To get owned access to the inner engine, temporarily // replace it with an empty memory engine. let engine = std::mem::replace(&mut self.log.engine, Box::new(storage::Memory::new())); self.log = Log::new(engine)?; } // scan [RANGE] "scan" => { let mut args = command.consume_args(); let range = Self::parse_index_range( args.next_pos().map_or("..", |a| a.value.as_str()), )?; args.reject_rest()?; let mut scan = self.log.scan(range); while let Some(entry) = scan.next().transpose()? 
{ let fmtentry = format::Raft::::entry(&entry); writeln!(output, "{fmtentry}")?; } } // scan_apply APPLIED_INDEX "scan_apply" => { let mut args = command.consume_args(); let applied_index = args.next_pos().ok_or("applied index not given")?.parse()?; args.reject_rest()?; let mut scan = self.log.scan_apply(applied_index); while let Some(entry) = scan.next().transpose()? { let fmtentry = format::Raft::::entry(&entry); writeln!(output, "{fmtentry}")?; } } // set_term TERM [VOTE] "set_term" => { let mut args = command.consume_args(); let term = args.next_pos().ok_or("term not given")?.parse()?; let vote = args.next_pos().map(|a| a.parse()).transpose()?; args.reject_rest()?; self.log.set_term_vote(term, vote)?; } // splice [INDEX@TERM=COMMAND...] "splice" => { let mut args = command.consume_args(); let mut entries = Vec::new(); for arg in args.rest_key() { let (index, term) = Self::parse_index_term(arg.key.as_deref().unwrap())?; let command = match arg.value.as_str() { "" => None, value => Some(value.as_bytes().to_vec()), }; entries.push(Entry { index, term, command }); } args.reject_rest()?; let index = self.log.splice(entries)?; let entry = self.log.get(index)?.expect("entry not found"); let fmtentry = format::Raft::::entry(&entry); writeln!(output, "splice → {fmtentry}")?; } // status [engine=BOOL] "status" => { let mut args = command.consume_args(); let engine = args.lookup_parse("engine")?.unwrap_or(false); args.reject_rest()?; let (term, vote) = self.log.get_term_vote(); let (last_index, last_term) = self.log.get_last_index(); let (commit_index, commit_term) = self.log.get_commit_index(); let vote = vote.map(|id| id.to_string()).unwrap_or("None".to_string()); write!( output, "term={term} last={last_index}@{last_term} commit={commit_index}@{commit_term} vote={vote}", )?; if engine { write!(output, " engine={:#?}", self.log.status()?)?; } writeln!(output)?; } name => return Err(format!("unknown command {name}").into()), } // If requested, output engine operations. 
if tags.remove("ops") { while let Ok(op) = self.op_rx.try_recv() { match op { testengine::Operation::Delete { key } => { let fmtkey = format::Raft::::key(&key); let rawkey = format::Raw::key(&key); writeln!(output, "engine delete {fmtkey} [{rawkey}]")? } testengine::Operation::Flush => writeln!(output, "engine flush")?, testengine::Operation::Set { key, value } => { let fmtkv = format::Raft::::key_value(&key, &value); let rawkv = format::Raw::key_value(&key, &value); writeln!(output, "engine set {fmtkv} [{rawkv}]")? } } } } if let Some(tag) = tags.iter().next() { return Err(format!("unknown tag {tag}").into()); } Ok(output) } /// Discards any engine operations still queued in op_rx, i.e. those that /// run() didn't already emit via the [ops] tag. Always returns empty output. fn end_command(&mut self, _: &goldenscript::Command) -> Result> { // Drain any remaining engine operations. while self.op_rx.try_recv().is_ok() {} Ok(String::new()) } } } ================================================ FILE: src/raft/message.rs ================================================ use std::collections::BTreeMap; use serde::{Deserialize, Serialize}; use super::{Entry, Index, NodeID, Term}; use crate::encoding; use crate::error::Result; use crate::storage; /// A message envelope specifying the sender and receiver. #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] pub struct Envelope { /// The sender. pub from: NodeID, /// The sender's current term. pub term: Term, /// The recipient. pub to: NodeID, /// The message. pub message: Message, } impl encoding::Value for Envelope {} /// A message sent between Raft nodes. Messages are sent asynchronously (i.e. /// they are not request/response) and may be dropped or reordered. /// /// In practice, they are sent across a TCP connection and crossbeam channel /// which ensures messages are not dropped or reordered as long as the /// connection remains intact. A message and its response are sent across /// separate TCP connections (outbound from the respective sender). 
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] pub enum Message { /// Candidates campaign for leadership by soliciting votes from peers. /// Votes will only be granted if the candidate's log is at least as /// up-to-date as the voter. Campaign { /// The index of the candidate's last log entry. last_index: Index, /// The term of the candidate's last log entry. last_term: Term, }, /// Followers may vote for a single candidate per term, but only if the /// candidate's log is at least as up-to-date as the follower. Candidates /// implicitly vote for themselves. CampaignResponse { /// If true, the follower granted the candidate a vote. A false response /// isn't necessary, but is emitted for clarity. vote: bool, }, /// Leaders send periodic heartbeats. This serves several purposes: /// /// * Inform nodes about the leader, and prevent elections. /// * Detect lost appends and reads, as a retry mechanism. /// * Advance followers' commit indexes, so they can apply entries. /// /// The Raft paper does not have a distinct heartbeat message, and instead /// uses an empty AppendEntries RPC, but we choose to add one for better /// separation of concerns. Heartbeat { /// The index of the leader's last log entry. The term is the leader's /// current term, since it appends a noop entry on election win. The /// follower compares this to its own log to determine if it's /// up-to-date. last_index: Index, /// The index of the leader's last committed log entry. Followers use /// this to advance their commit index and apply entries. It's only safe /// to commit this if the local log matches last_index, such that the /// follower's log is identical to the leader at the commit index. commit_index: Index, /// The leader's latest read sequence number in this term. read_seq: ReadSequence, }, /// Followers respond to leader heartbeats if they still consider it leader. HeartbeatResponse { /// If non-zero, the heartbeat's last_index which was matched in the /// follower's log. 
Otherwise, the follower is either divergent or /// lagging behind the leader. match_index: Index, /// The heartbeat's read sequence number. read_seq: ReadSequence, }, /// Leaders replicate log entries to followers by appending to their logs /// after the given base entry. /// /// If the base entry matches the follower's log then their logs are /// identical up to it (see section 5.3 in the Raft paper), and the entries /// can be appended -- possibly replacing conflicting entries. Otherwise, /// the append is rejected and the leader must retry an earlier base index /// until a common base is found. /// /// Empty append messages (no entries) are used to probe follower logs for /// a common match index in the case of divergent logs, restarted nodes, or /// dropped messages. This is typically done by sending probes with a /// decrementing base index until a match is found, at which point the /// subsequent entries can be sent. Append { /// The index of the log entry to append after. base_index: Index, /// The term of the base entry. base_term: Term, /// Log entries to append. Must start at base_index + 1. entries: Vec, }, /// Followers accept or reject appends from the leader depending on whether /// the base entry matches their log. AppendResponse { /// If non-zero, the follower appended entries up to this index. The /// entire log up to this index is consistent with the leader. If no /// entries were sent (a probe), this will be the matching base index. match_index: Index, /// If non-zero, the follower rejected an append at this base index /// because the base index/term did not match its log. If the follower's /// log is shorter than the base index, the reject index will be lowered /// to the index after its last local index, to avoid probing each /// missing index. reject_index: Index, }, /// Leaders need to confirm they are still the leader before serving reads, /// to guarantee linearizability in case a different leader has been /// established elsewhere. 
Read requests are served once the sequence number /// has been confirmed by a quorum. Read { seq: ReadSequence }, /// Followers confirm leadership at the read sequence numbers. ReadResponse { seq: ReadSequence }, /// A client request. This can be submitted to the leader, or to a follower /// which will forward it to its leader. If there is no leader, or the /// leader or term changes, the request is aborted with an Error::Abort /// ClientResponse and the client must retry. ClientRequest { /// The request ID. Must be globally unique for the request duration. id: RequestID, /// The request itself. request: Request, }, /// A client response. ClientResponse { /// The ID of the original ClientRequest. id: RequestID, /// The response, or an error. response: Result, }, } /// A client request ID. Must be globally unique while in flight. /// /// For simplicity, a random UUIDv4 is used. We could incorporate the /// node/process/MAC ID and timestamp for better collision avoidance (e.g. via /// UUIDv6) but it doesn't matter at this scale. pub type RequestID = uuid::Uuid; /// A read sequence number, used to confirm leadership for linearizable reads. pub type ReadSequence = u64; /// A client request, typically passed through to the state machine. #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] pub enum Request { /// A state machine read command, executed via `State::read`. This is not /// replicated, and only evaluated on the leader. Read(Vec), /// A state machine write command, executed via `State::apply`. This is /// replicated across all nodes, and must produce a deterministic result. Write(Vec), /// Requests Raft cluster status from the leader. Status, } impl encoding::Value for Request {} /// A client response. This will be wrapped in a Result for error handling. #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] pub enum Response { /// A state machine read result. Read(Vec), /// A state machine write result. 
Write(Vec), /// The current Raft leader status. Status(Status), } impl encoding::Value for Response {} /// Raft cluster status. Generated by the leader. #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] pub struct Status { /// The current Raft leader, which generated this status. pub leader: NodeID, /// The current Raft term. pub term: Term, /// The match indexes of all nodes, indicating replication progress. Uses a /// BTreeMap for test determinism. pub match_index: BTreeMap, /// The current commit index. pub commit_index: Index, /// The current applied index. pub applied_index: Index, /// The log storage engine status. pub storage: storage::Status, } ================================================ FILE: src/raft/mod.rs ================================================ //! Implements the Raft distributed consensus protocol. //! //! For details, see Diego Ongaro's original writings: //! //! * Raft paper: //! * Raft thesis: //! * Raft website: //! //! Raft is a protocol for a group of computers to agree on some data -- or more //! simply, to replicate the data. It is broadly equivalent to [Paxos] and //! [Viewstamped Replication], but more prescriptive and simpler to understand. //! //! Raft has three main properties: //! //! * Fault tolerance: the system tolerates node failures as long as a majority //! of nodes (>50%) remain operational. //! //! * Linearizability (aka strong consistency): once a client write has been //! accepted, it is visible to all clients -- they never see outdated data. //! //! * Durability: a write is never lost as long as a majority of nodes remain. //! //! It does this by electing a single leader node which serves client requests //! and replicates writes to other nodes. Requests are executed once they have //! been confirmed by a strict majority of nodes (a quorum). If a leader fails, //! a new leader is elected. Clusters have 3 or more nodes, since a two-node //! 
cluster can't tolerate failures (1/2 is not a majority and would lead to //! split brain). //! //! Notably, Raft does not provide horizontal scalability. Client requests are //! processed by a single leader node which can quickly become a bottleneck, and //! each node stores a complete copy of the entire dataset. Systems often handle //! this by sharding the data into multiple Raft clusters and using a //! distributed transaction protocol across them, but this is out of scope here. //! //! toyDB follows the Raft paper fairly closely, but, like most implementations, //! takes some minor artistic liberties. //! //! [Paxos]: https://www.microsoft.com/en-us/research/uploads/prod/2016/12/paxos-simple-Copy.pdf //! [Viewstamped Replication]: https://pmg.csail.mit.edu/papers/vr-revisited.pdf //! //! RAFT LOG AND STATE MACHINE //! ========================== //! //! Raft maintains an ordered command log containing arbitrary write commands //! submitted by clients. It attempts to reach consensus on this log by //! replicating it to a majority of nodes. If successful, the log is considered //! committed and immutable up to that point. //! //! Once committed, the commands in the log are applied sequentially to a local //! state machine on each node. Raft itself doesn't care what the state machine //! and commands are -- in toyDB's case it's a SQL database, but it could be //! anything. Raft simply passes opaque commands to an opaque state machine. //! //! Each log entry contains an index, the leader's term (see next section), and //! the command. For example, a naïve illustration of a toyDB Raft log might be: //! //! Index | Term | Command //! ------|------|------------------------------------------------------ //! 1 | 1 | CREATE TABLE table (id INT PRIMARY KEY, value STRING) //! 2 | 1 | INSERT INTO table VALUES (1, 'foo') //! 3 | 2 | UPDATE table SET value = 'bar' WHERE id = 1 //! 4 | 2 | DELETE FROM table WHERE id = 1 //! //! 
The state machine must be deterministic, such that all nodes will reach the //! same identical state. Raft will apply the same commands in the same order //! independently on all nodes, but if the commands have non-deterministic //! behavior such as random number generation or communication with external //! systems it can lead to state divergence causing different results. //! //! In toyDB, the Raft log is managed by `Log` and stored locally in a //! `storage::Engine`. The state machine interface is the `State` trait. See //! their documentation for more details. //! //! LEADER ELECTION //! =============== //! //! Raft nodes can be in one of three states (or roles): follower, candidate, //! and leader. toyDB models these as `Node::Follower`, `Node::Candidate`, and //! `Node::Leader`. //! //! * Follower: replicates log entries from a leader. May not know a leader yet. //! * Candidate: campaigns for leadership in an election. //! * Leader: processes client requests and replicates writes to followers. //! //! Raft fundamentally relies on a single guarantee: there can be at most one //! _valid_ leader at any point in time (old, since-replaced leaders may think //! they're still a leader, e.g. during a network partition, but they won't be //! able to do anything). It enforces this through the leader election protocol. //! //! Raft divides time into terms, which are monotonically increasing numbers. //! Higher terms always take priority over lower terms. There can be at most one //! leader in a term, and it can't change. Nodes keep track of their last known //! term and store it on disk (see `Log.set_term_vote()`). Messages between nodes are //! tagged with the current term (as `Envelope.term`) -- old terms are ignored, //! and future terms cause the node to become a follower in that term. //! //! Nodes start out as leaderless followers. If they receive a message from a //! leader (in a current or future term), they follow it. Otherwise, they wait //!
out the election timeout (a few seconds), become candidates, and hold a //! leader election. //! //! Candidates increase their term by 1 and send `Message::Campaign` to all //! nodes, requesting their vote. Nodes respond with `Message::CampaignResponse` //! saying whether a vote was granted. A node can only grant a single vote in a //! term (stored to disk via `Log.set_term_vote()`), on a first-come, first-served //! basis, and candidates implicitly vote for themselves. //! //! When a candidate receives a majority of votes (>50%), it becomes leader. It //! sends a `Message::Heartbeat` to all nodes asserting its leadership, and all //! nodes become followers when they receive it (regardless of who they voted //! for). Leaders continue to send periodic heartbeats every second or so. The //! new leader also appends an empty entry to its log in order to safely commit //! all entries from previous terms (Raft paper section 5.4.2). //! //! The new leader must have all committed entries in its log (or the cluster //! would lose data). To ensure this, there is one additional condition for //! granting a vote: the candidate's log must be at least as up-to-date as the //! voter. Because an entry must be replicated to a majority before being //! committed, this ensures a candidate can only win a majority of votes if its //! log is up-to-date with all committed entries (Raft paper section 5.4.1). //! //! It's possible that no candidate wins an election, for example due to a tie //! or a majority of nodes being offline. After an election timeout passes, //! candidates will again bump their term and start a new election, until a //! leader can be established. To avoid frequent ties, nodes use different, //! randomized election timeouts (Raft paper section 5.2). //! //! Similarly, if a follower doesn't hear from a leader in an election timeout //! interval, it will become a candidate and hold another election. The periodic //!
leader heartbeats prevent this as long as the leader is running and //! connected. A node that becomes disconnected from the leader will continually //! hold new elections by itself until the network heals, at which point a new //! election will be held in its term (disrupting the current leader). //! //! REPLICATION AND CONSENSUS //! ========================= //! //! When the leader receives a client write request, it appends the command to //! its local log via `Log.append()`, and sends the log entry to all peers in //! a `Message::Append`. Followers will attempt to durably append the entry to //! their local logs and respond with `Message::AppendResponse`. //! //! Once a majority have acknowledged the append, the leader commits the entry //! via `Log.commit()` and applies it to its local state machine, returning the //! result to the client. It will inform followers about the commit in the next //! heartbeat as `Message::Heartbeat.commit_index` so they can apply it too, but //! this is not necessary for correctness (they will commit and apply it if they //! become leader, otherwise they have no need for applying it). //! //! Followers may not be able to append the entry to their log -- they may be //! unreachable, lag behind the leader, or have divergent logs (see Raft paper //! section 5.3). The `Append` contains the index and term of the log entry //! immediately before the replicated entry as `base_index` and `base_term`. An //! index/term pair uniquely identifies a command, and if two logs have the same //! index/term pair then the logs are identical up to and including that entry //! (Raft paper section 5.3). If the base index/term matches the follower's log, //! it appends the entry (potentially replacing any conflicting entries), //! otherwise it rejects it. //! //! When a follower rejects an append, the leader must try to find a common log //! entry that exists in both its and the follower's log where it can resume //! replication. 
It does this by sending `Message::Append` probes only //! containing a base index/term but no entries -- it will continue to probe //! decreasing indexes one by one until the follower responds with a match, then //! send an `Append` with the missing entries (Raft paper section 5.3). It keeps //! track of each follower's `match_index` and `next_index` in a `Progress` //! struct to manage this. //! //! In case `Append` messages or responses are lost, leaders also send their //! `last_index` and term in each `Heartbeat`. If followers don't have that //! index/term pair in their log, they'll say so in the `HeartbeatResponse` and //! the leader can begin probing their logs as with append rejections. //! //! CLIENT REQUESTS //! =============== //! //! Client requests are submitted as `Message::ClientRequest` to the local Raft //! node. They are only processed on the leader, but followers will proxy them //! to the leader (Raft thesis section 6.2). To avoid complications with message //! replays (Raft thesis section 6.3), requests are not retried internally, and //! are explicitly aborted with `Error::Abort` on leader/term changes as well as //! elections. //! //! Write requests, `Request::Write`, are appended to the Raft log and //! replicated. The leader keeps track of the request and its log index in a //! `Write` struct. Once the command is committed and applied to the local state //! machine, the leader looks up the write request by its log index and sends //! the result to the client. Deterministic errors (e.g. foreign key violations) //! are also returned to the client, but non-deterministic errors (e.g. IO //! errors) must panic the node to avoid state divergence. //! //! Read requests, `Request::Read`, are only executed on the leader and don't //! need to be replicated via the Raft log. However, to ensure linearizability, //! the leader has to confirm with a quorum that it's actually still the leader. //! 
Otherwise, it's possible that a new leader has been elected elsewhere and //! executed writes without us knowing about it. It does this by assigning an //! incrementing sequence number to each read, keeping track of the request in a //! `Read` struct, and immediately sending a `Read` message with the latest //! sequence number. Followers respond with the sequence number, and once a //! quorum have confirmed a sequence number the read is executed and the result //! returned to the client. //! //! IMPLEMENTATION CAVEATS //! ====================== //! //! For simplicity, toyDB implements the bare minimum for a functional and //! correct Raft protocol, and omits several advanced mechanisms that would be //! needed for a real production system. In particular: //! //! * No leases: for linearizability, every read request requires the leader to //! confirm with followers that it's still the leader. This could be avoided //! with a leader lease for a predefined time interval (Raft paper section 8, //! Raft thesis section 6.3). //! //! * No cluster membership changes: to add or remove nodes, the entire cluster //! must be stopped and restarted with the new configuration, otherwise it //! risks multiple leaders (Raft paper section 6). //! //! * No snapshots: new or lagging nodes must be caught up by replicating and //! replaying the entire log, instead of sending a state machine snapshot //! (Raft paper section 7). //! //! * No log truncation: because snapshots aren't supported, the entire Raft //! log must be retained forever in order to catch up new/lagging nodes, //! leading to excessive storage use (Raft paper section 7). //! //! * No pre-vote or check-quorum: a node that's partially partitioned (can //! reach some but not all nodes) can cause persistent unavailability with //! spurious elections or heartbeats. A node rejoining after a partition can //! also temporarily disrupt a leader. This requires additional pre-vote and //! 
check-quorum protocol extensions (Raft thesis section 4.2.3 and 9.6). //! //! * No request retries: client requests will not be retried on leader changes //! or message loss, and will be aggressively aborted, to ignore problems //! related to message replay (Raft thesis section 6.3). //! //! * No reject hints: if a follower has a divergent log, the leader will probe //! entries one by one until a match is found. The replication protocol could //! instead be extended with rejection hints (Raft paper section 5.3). mod log; mod message; mod node; mod state; use std::ops::Range; use std::time::Duration; pub use log::{Entry, Index, Key, Log}; pub use message::{Envelope, Message, ReadSequence, Request, RequestID, Response, Status}; pub use node::{Node, NodeID, Options, Term, Ticks}; pub use state::State; /// The interval between Raft ticks, the Raft unit of time. pub const TICK_INTERVAL: Duration = Duration::from_millis(100); /// The default interval between leader heartbeats in ticks. Used as the /// default for `Options::heartbeat_interval`. const HEARTBEAT_INTERVAL: Ticks = 4; /// The default election timeout range in ticks. To avoid election ties, a node /// chooses a random value in this interval. The range is half-open, so the end /// value itself is never chosen. Used as the default for /// `Options::election_timeout_range`. const ELECTION_TIMEOUT_RANGE: Range = 10..20; /// The default maximum number of log entries to send in a single append /// message. Used as the default for `Options::max_append_entries`. const MAX_APPEND_ENTRIES: usize = 100; ================================================ FILE: src/raft/node.rs ================================================ use std::cmp::{max, min}; use std::collections::{HashMap, HashSet, VecDeque}; use std::ops::Range; use crossbeam::channel::Sender; use itertools::Itertools as _; use log::{debug, info}; use rand::RngExt as _; use super::log::{Index, Log}; use super::message::{Envelope, Message, ReadSequence, Request, RequestID, Response, Status}; use super::state::State; use super::{ELECTION_TIMEOUT_RANGE, HEARTBEAT_INTERVAL, MAX_APPEND_ENTRIES}; use crate::errinput; use crate::error::{Error, Result}; /// A node ID, unique within a cluster. Assigned manually when started.
pub type NodeID = u8; /// A leader term number. Increases monotonically on elections. pub type Term = u64; /// A logical clock interval as number of ticks. pub type Ticks = u8; /// Raft node options. #[derive(Clone, Debug, PartialEq)] pub struct Options { /// The number of ticks between leader heartbeats. pub heartbeat_interval: Ticks, /// The range of randomized election timeouts for followers and candidates, /// in ticks. Each election timeout is drawn at random from this range. pub election_timeout_range: Range, /// Maximum number of entries to send in a single Append message. pub max_append_entries: usize, } impl Default for Options { fn default() -> Self { Self { heartbeat_interval: HEARTBEAT_INTERVAL, election_timeout_range: ELECTION_TIMEOUT_RANGE, max_append_entries: MAX_APPEND_ENTRIES, } } } /// A Raft node with a dynamic role. This implements the Raft distributed /// consensus protocol, see the `raft` module documentation for more info. /// /// The node is driven synchronously by processing inbound messages via `step()` /// and by advancing time via `tick()`. These methods consume the node and /// return a new one with a possibly different role. Outbound messages are sent /// via the given `tx` channel, and must be delivered to peers or clients. /// /// This enum is the public interface to the node, with a closed set of roles. /// It wraps the `RawNode<Role>` types, which implement the actual node logic. /// The enum allows ergonomic use across role transitions since it can represent /// all roles, e.g.: `node = node.step(msg)?`. pub enum Node { /// A candidate campaigns for leadership. Candidate(RawNode), /// A follower replicates entries from a leader. Follower(RawNode), /// A leader processes client requests and replicates entries to followers. Leader(RawNode), } impl Node { /// Creates a new Raft node. It starts as a leaderless follower, waiting to /// hear from a leader or otherwise transitioning to candidate and /// campaigning for leadership.
In the case of a single-node cluster (no /// peers), the node immediately transitions to leader when created. /// /// Returns an error (via `errinput!`) if `peers` contains the node's own `id`. pub fn new( id: NodeID, peers: HashSet, log: Log, state: Box, tx: Sender, opts: Options, ) -> Result { let node = RawNode::new(id, peers, log, state, tx, opts)?; // If this is a single-node cluster, become leader immediately. if node.cluster_size() == 1 { return Ok(node.into_candidate()?.into_leader()?.into()); } Ok(node.into()) } /// Returns the node's ID. pub fn id(&self) -> NodeID { match self { Self::Candidate(node) => node.id, Self::Follower(node) => node.id, Self::Leader(node) => node.id, } } /// Returns the node's term. pub fn term(&self) -> Term { match self { Self::Candidate(node) => node.term(), Self::Follower(node) => node.term(), Self::Leader(node) => node.term(), } } /// Processes an inbound message, returning the node in its resulting role. /// /// Panics if the message is addressed to another node, or if the sender is /// neither a known peer nor this node itself. pub fn step(self, msg: Envelope) -> Result { let peers = match &self { Self::Candidate(node) => &node.peers, Self::Follower(node) => &node.peers, Self::Leader(node) => &node.peers, }; assert_eq!(msg.to, self.id(), "message to other node: {msg:?}"); assert!(peers.contains(&msg.from) || msg.from == self.id(), "unknown sender: {msg:?}"); debug!("Stepping {msg:?}"); match self { Self::Candidate(node) => node.step(msg), Self::Follower(node) => node.step(msg), Self::Leader(node) => node.step(msg), } } /// Advances time by a tick, returning the node in its resulting role. pub fn tick(self) -> Result { match self { Self::Candidate(node) => node.tick(), Self::Follower(node) => node.tick(), Self::Leader(node) => node.tick(), } } } impl From> for Node { fn from(node: RawNode) -> Self { Node::Candidate(node) } } impl From> for Node { fn from(node: RawNode) -> Self { Node::Follower(node) } } impl From> for Node { fn from(node: RawNode) -> Self { Node::Leader(node) } } /// Marker trait for a Raft role: leader, follower, or candidate. pub trait Role {} /// A Raft node with role R. /// /// This implements the typestate pattern, where individual node states (roles) /// are encoded as `RawNode<Role>`.
See http://cliffle.com/blog/rust-typestate/. pub struct RawNode { /// The node ID. Must be unique in the cluster. id: NodeID, /// The IDs of the other nodes in the cluster. Does not change while /// running. Can change on restart, but all nodes must have the same set of /// nodes, otherwise it can result in multiple leaders (split brain). peers: HashSet, /// The Raft log, which stores client commands to be executed. log: Log, /// The Raft state machine, which executes client commands from the log. state: Box, /// Channel for sending outbound messages to other nodes. tx: Sender, /// Node options. opts: Options, /// Role-specific state. role: R, } impl RawNode { /// Helper for role transitions. fn into_role(self, role: T) -> RawNode { RawNode { id: self.id, peers: self.peers, log: self.log, state: self.state, tx: self.tx, opts: self.opts, role, } } /// Returns the node's current term. fn term(&self) -> Term { self.log.get_term_vote().0 } /// Returns the cluster size as number of nodes. fn cluster_size(&self) -> usize { self.peers.len() + 1 } /// Returns the cluster quorum size (strict majority). fn quorum_size(&self) -> usize { self.cluster_size() / 2 + 1 } /// Returns the quorum value of the given unsorted vector, i.e. the largest /// value that at least a quorum of the elements are greater than or equal to /// (this is the median for odd cluster sizes). The vector must have the same /// length as the cluster size, otherwise this panics. fn quorum_value(&self, mut values: Vec) -> T { assert_eq!(values.len(), self.cluster_size(), "vector size must match cluster size"); *values.select_nth_unstable_by(self.quorum_size() - 1, |a, b| a.cmp(b).reverse()).1 } /// Generates a random election timeout. fn random_election_timeout(&self) -> Ticks { rand::rng().random_range(self.opts.election_timeout_range.clone()) } /// Sends a message to the given recipient. fn send(&self, to: NodeID, message: Message) -> Result<()> { Self::send_via(&self.tx, Envelope { from: self.id, to, term: self.term(), message }) } /// Sends a message via the given channel. This avoids borrowing self, to /// allow sending while holding partial borrows of self.
fn send_via(tx: &Sender, msg: Envelope) -> Result<()> { debug!("Sending {msg:?}"); Ok(tx.send(msg)?) } /// Broadcasts a message to all peers. fn broadcast(&self, message: Message) -> Result<()> { // Send in increasing ID order for test determinism. for id in self.peers.iter().copied().sorted() { self.send(id, message.clone())?; } Ok(()) } } /// A follower replicates log entries from a leader and forwards client requests /// to it. Nodes start as leaderless followers, until they either discover a /// leader or hold an election. pub struct Follower { /// The leader, or None if we're a leaderless follower. leader: Option, /// The number of ticks since the last message from the leader. leader_seen: Ticks, /// The leader_seen timeout before triggering an election. election_timeout: Ticks, // Local client requests that have been forwarded to the leader. These are // aborted on leader/term changes. forwarded: HashSet, } impl Follower { /// Creates a new follower role. fn new(leader: Option, election_timeout: Ticks) -> Self { Self { leader, leader_seen: 0, election_timeout, forwarded: HashSet::new() } } } impl Role for Follower {} impl RawNode { /// Creates a new node as a leaderless follower. fn new( id: NodeID, peers: HashSet, log: Log, state: Box, tx: Sender, opts: Options, ) -> Result { if peers.contains(&id) { return errinput!("node ID {id} can't be in peers"); } let role = Follower::new(None, 0); let mut node = Self { id, peers, log, state, tx, opts, role }; node.role.election_timeout = node.random_election_timeout(); // Apply any pending entries following restart. State machine writes are // not flushed to durable storage, so a tail of writes may be lost if // the host crashes or restarts. The Raft log is durable, so we can // always recover the state from it. We reapply any missing entries here // if that should happen. node.maybe_apply()?; Ok(node) } /// Transitions the follower into a candidate, by campaigning for /// leadership in a new term. 
fn into_candidate(mut self) -> Result> { // Abort any forwarded requests. These must be retried with new leader. self.abort_forwarded()?; // Apply any pending log entries, so that we're caught up if we win. self.maybe_apply()?; // Become candidate and campaign. let election_timeout = self.random_election_timeout(); let mut node = self.into_role(Candidate::new(election_timeout)); node.campaign()?; let (term, vote) = node.log.get_term_vote(); assert!(node.role.votes.contains(&node.id), "candidate did not vote for self"); assert_ne!(term, 0, "candidate can't have term 0"); assert_eq!(vote, Some(node.id), "log vote does not match self"); Ok(node) } /// Transitions the follower into either a leaderless follower in a new term /// (e.g. if someone holds a new election) or a follower of a current leader. fn into_follower(mut self, term: Term, leader: Option) -> Result> { assert_ne!(term, 0, "can't become follower in term 0"); // Abort any forwarded requests. These must be retried with new leader. self.abort_forwarded()?; if let Some(leader) = leader { // We found a leader in the current term. assert!(self.peers.contains(&leader), "leader is not a peer"); assert_eq!(self.role.leader, None, "already have leader in term"); assert_eq!(term, self.term(), "can't follow leader in different term"); info!("Following leader {leader} in term {term}"); self.role = Follower::new(Some(leader), self.role.election_timeout); } else { // We found a new term, but we don't know who the leader is yet. // We'll find out if we step a message from it. assert_ne!(term, self.term(), "can't become leaderless follower in current term"); info!("Discovered new term {term}"); self.log.set_term_vote(term, None)?; self.role = Follower::new(None, self.random_election_timeout()); } Ok(self) } /// Processes an inbound message. fn step(mut self, msg: Envelope) -> Result { // Past term: outdated peer, drop the message. 
if msg.term < self.term() { debug!("Dropping message from past term: {msg:?}"); return Ok(self.into()); } // Future term: newer leader or candidate, become leaderless follower // and step the message. if msg.term > self.term() { return self.into_follower(msg.term, None)?.step(msg); } // Record when we last saw a message from the leader (if any). if Some(msg.from) == self.role.leader { self.role.leader_seen = 0 } match msg.message { // The leader sends periodic heartbeats. If we don't have a leader // yet, follow it. If the commit_index advances, apply commands. Message::Heartbeat { last_index, commit_index, read_seq } => { assert!(commit_index <= last_index, "commit_index after last_index"); // Make sure the heartbeat is from our leader, or follow it. match self.role.leader { Some(leader) => assert_eq!(msg.from, leader, "multiple leaders in term"), None => self = self.into_follower(msg.term, Some(msg.from))?, } // Check if our log matches the leader's log up to last_index, // and respond to the heartbeat. last_index always has the // leader's term, since it only appends entries in its term. let match_index = if self.log.has(last_index, msg.term)? { last_index } else { 0 }; self.send(msg.from, Message::HeartbeatResponse { match_index, read_seq })?; // Advance the commit index and apply entries. We can only do // this if we matched the leader's last_index, which implies // that the logs are identical up to match_index. This also // implies that the commit_index is present in our log. if match_index != 0 && commit_index > self.log.get_commit_index().0 { self.log.commit(commit_index)?; self.maybe_apply()?; } } // Append log entries from the leader to the local log. Message::Append { base_index, base_term, entries } => { if let Some(first) = entries.first() { assert_eq!(base_index, first.index - 1, "base index mismatch"); } // Make sure the append is from our leader, or follow it. 
match self.role.leader { Some(leader) => assert_eq!(msg.from, leader, "multiple leaders in term"), None => self = self.into_follower(msg.term, Some(msg.from))?, } // If the base entry matches our log, append the entries. if base_index == 0 || self.log.has(base_index, base_term)? { let match_index = entries.last().map(|e| e.index).unwrap_or(base_index); self.log.splice(entries)?; self.send(msg.from, Message::AppendResponse { match_index, reject_index: 0 })?; } else { // Otherwise, reject the base index. If the local log is // shorter than the base index, lower the reject index to // skip all missing entries. let reject_index = min(base_index, self.log.get_last_index().0 + 1); self.send(msg.from, Message::AppendResponse { reject_index, match_index: 0 })?; } } // Confirm the leader's read sequence number. Message::Read { seq } => { // Make sure the read is from our leader, or follow it. match self.role.leader { Some(leader) => assert_eq!(msg.from, leader, "multiple leaders in term"), None => self = self.into_follower(msg.term, Some(msg.from))?, } // Confirm the read. self.send(msg.from, Message::ReadResponse { seq })?; } // A candidate is requesting our vote. We only grant one per term. Message::Campaign { last_index, last_term } => { // Don't vote if we already voted for someone else in this term. // We can repeat our vote for the same node though. if let (_, Some(vote)) = self.log.get_term_vote() && msg.from != vote { self.send(msg.from, Message::CampaignResponse { vote: false })?; return Ok(self.into()); } // Only vote if the candidate's log is at least as up-to-date as // ours: its last entry must have a higher term, or the same term // and an equal or higher index. // At least one node in any quorum must have all committed // entries, and this ensures we'll only elect a leader that has // all committed entries. See section 5.4.1 in the Raft paper.
let (log_index, log_term) = self.log.get_last_index(); if log_term > last_term || log_term == last_term && log_index > last_index { self.send(msg.from, Message::CampaignResponse { vote: false })?; return Ok(self.into()); } // Grant the vote. info!("Voting for {} in term {} election", msg.from, msg.term); self.log.set_term_vote(msg.term, Some(msg.from))?; self.send(msg.from, Message::CampaignResponse { vote: true })?; } // Forward client requests to the leader, or abort them if there is // none. These will not be retried, the client should use timeouts // instead. Local client requests use our node ID as the sender. Message::ClientRequest { id, request: _ } => { assert_eq!(msg.from, self.id, "client request from other node"); if let Some(leader) = self.role.leader { debug!("Forwarding request to leader {leader}: {msg:?}"); self.role.forwarded.insert(id); self.send(leader, msg.message)? } else { let response = Err(Error::Abort); self.send(msg.from, Message::ClientResponse { id, response })? } } // Client responses from the leader are passed on to the client. Message::ClientResponse { id, response } => { assert_eq!(Some(msg.from), self.role.leader, "client response from non-leader"); if self.role.forwarded.remove(&id) { self.send(self.id, Message::ClientResponse { id, response })?; } } // We may receive a vote after we lost an election, ignore it. Message::CampaignResponse { .. } => {} // We're not leader this term, so we shouldn't see these. Message::HeartbeatResponse { .. } | Message::AppendResponse { .. } | Message::ReadResponse { .. } => { panic!("follower received unexpected message {msg:?}") } }; Ok(self.into()) } /// Processes a logical clock tick. fn tick(mut self) -> Result { // Campaign if we haven't heard from the leader in a while. self.role.leader_seen += 1; if self.role.leader_seen >= self.role.election_timeout { return Ok(self.into_candidate()?.into()); } Ok(self.into()) } /// Aborts all forwarded requests (e.g. on term/leader changes). 
fn abort_forwarded(&mut self) -> Result<()> {
    // Take the forwarded set (leaving it empty) and send an abort response,
    // addressed to our own node ID, for every request ID that was in it.
    // Sort by ID for test determinism.
    for id in std::mem::take(&mut self.role.forwarded).into_iter().sorted() {
        debug!("Aborting forwarded request {id}");
        self.send(self.id, Message::ClientResponse { id, response: Err(Error::Abort) })?;
    }
    Ok(())
}

/// Applies any pending log entries, scanning from the state machine's
/// current applied index. The apply results are discarded.
fn maybe_apply(&mut self) -> Result<()> {
    let mut iter = self.log.scan_apply(self.state.get_applied_index());
    while let Some(entry) = iter.next().transpose()? {
        debug!("Applying {entry:?}");
        // Throw away the result, since only the leader responds to clients.
        // This includes errors -- any non-deterministic errors (e.g. IO
        // errors) must panic instead to avoid node divergence.
        _ = self.state.apply(entry);
    }
    Ok(())
}
}

/// A candidate is campaigning to become a leader.
pub struct Candidate {
    /// Votes received (including our own).
    votes: HashSet,
    /// Ticks elapsed since election start.
    election_duration: Ticks,
    /// Election timeout, in ticks.
    election_timeout: Ticks,
}

impl Candidate {
    /// Creates a new candidate role with no votes and a zero election
    /// duration.
    fn new(election_timeout: Ticks) -> Self {
        Self { votes: HashSet::new(), election_duration: 0, election_timeout }
    }
}

impl Role for Candidate {}

impl RawNode {
    /// Transitions the candidate to a follower. We either lost the election and
    /// follow the winner, or we discovered a new term and step into it as a
    /// leaderless follower.
    ///
    /// Panics if a leader is given for a term other than the current one, or
    /// if no leader is given but the term equals the current one.
    fn into_follower(mut self, term: Term, leader: Option) -> Result> {
        let election_timeout = self.random_election_timeout();
        if let Some(leader) = leader {
            // We lost the election, follow the winner.
            assert_eq!(term, self.term(), "can't follow leader in different term");
            info!("Lost election, following leader {leader} in term {term}");
            Ok(self.into_role(Follower::new(Some(leader), election_timeout)))
        } else {
            // We found a new term, but we don't necessarily know who the leader
            // is yet. We'll find out when we step a message from it.
            assert_ne!(term, self.term(), "can't become leaderless follower in current term");
            info!("Discovered new term {term}");
            // Store the new term with no vote cast.
            self.log.set_term_vote(term, None)?;
            Ok(self.into_role(Follower::new(None, election_timeout)))
        }
    }

    /// Transitions the candidate to a leader. We won the election.
    ///
    /// Panics if the stored term is 0 or the stored vote is not for ourself.
    fn into_leader(self) -> Result> {
        let (term, vote) = self.log.get_term_vote();
        assert_ne!(term, 0, "leaders can't have term 0");
        assert_eq!(vote, Some(self.id), "leader did not vote for self");

        info!("Won election for term {term}, becoming leader");
        let peers = self.peers.clone();
        let (last_index, _) = self.log.get_last_index();
        let mut node = self.into_role(Leader::new(peers, last_index));

        // Propose an empty command when assuming leadership, to disambiguate
        // previous entries in the log. See section 5.4.2 in the Raft paper.
        // We do this prior to the heartbeat, to avoid a wasted replication
        // roundtrip if the heartbeat response indicates the peer is behind.
        node.propose(None)?;
        node.maybe_commit_and_apply()?;
        node.heartbeat()?;

        Ok(node)
    }

    /// Processes an inbound message. Messages from past terms are dropped,
    /// messages from future terms make us a leaderless follower which then
    /// steps the message, and messages in the current term are handled below.
    fn step(mut self, msg: Envelope) -> Result {
        // Past term: outdated peer, drop the message.
        if msg.term < self.term() {
            debug!("Dropping message from past term: {msg:?}");
            return Ok(self.into());
        }
        // Future term: newer leader or candidate, become leaderless follower
        // and step the message.
        if msg.term > self.term() {
            return self.into_follower(msg.term, None)?.step(msg);
        }

        match msg.message {
            // If we received a vote, record it. If the vote gives us quorum,
            // assume leadership.
            Message::CampaignResponse { vote: true } => {
                self.role.votes.insert(msg.from);
                if self.role.votes.len() >= self.quorum_size() {
                    return Ok(self.into_leader()?.into());
                }
            }

            // We didn't get the vote. :(
            Message::CampaignResponse { vote: false } => {}

            // Don't grant votes for other candidates.
            Message::Campaign { .. } => {
                self.send(msg.from, Message::CampaignResponse { vote: false })?
            }

            // If we hear from a leader in this term, we lost the election.
            // Follow it and step the message.
            Message::Heartbeat { .. } | Message::Append { .. } | Message::Read { .. } => {
                return self.into_follower(msg.term, Some(msg.from))?.step(msg);
            }

            // Abort client requests while campaigning. The client must retry.
            Message::ClientRequest { id, request: _ } => {
                self.send(msg.from, Message::ClientResponse { id, response: Err(Error::Abort) })?;
            }

            // We're not a leader in this term, nor are we forwarding requests,
            // so we shouldn't see these.
            Message::HeartbeatResponse { .. }
            | Message::AppendResponse { .. }
            | Message::ReadResponse { .. }
            | Message::ClientResponse { .. } => panic!("unexpected message {msg:?}"),
        }
        Ok(self.into())
    }

    /// Processes a logical clock tick.
    fn tick(mut self) -> Result {
        // If no one won this election, start a new one after a while.
        self.role.election_duration += 1;
        if self.role.election_duration >= self.role.election_timeout {
            self.campaign()?;
        }
        Ok(self.into())
    }

    /// Hold a new election by increasing the term, voting for ourself, and
    /// soliciting votes from all peers.
    fn campaign(&mut self) -> Result<()> {
        let term = self.term() + 1;
        info!("Starting new election for term {term}");
        // Replace the role with a fresh one: this clears any votes from the
        // previous election, zeroes the election duration, and picks a new
        // random election timeout.
        self.role = Candidate::new(self.random_election_timeout());
        self.role.votes.insert(self.id); // vote for ourself
        self.log.set_term_vote(term, Some(self.id))?;

        let (last_index, last_term) = self.log.get_last_index();
        self.broadcast(Message::Campaign { last_index, last_term })
    }
}

/// A leader serves client requests and replicates the log to followers.
/// If the leader loses leadership, all client requests are aborted.
pub struct Leader {
    /// Follower replication progress.
    progress: HashMap,
    /// Tracks pending write requests by log index. Added when the write is
    /// proposed and appended to the leader's log, and removed when the command
    /// is applied to the state machine, returning the result to the client.
    writes: HashMap,
    /// Tracks pending read requests. For linearizability, read requests are
    /// assigned a sequence number and only executed once a quorum of nodes have
    /// confirmed that we're still the leader. Otherwise, an old leader could
    /// serve stale reads if a new leader has been elected elsewhere.
    reads: VecDeque,
    /// The read sequence number used for the last read. Initialized to 0 in
    /// this term, and incremented for every read command.
    read_seq: ReadSequence,
    /// Number of ticks since last heartbeat.
    since_heartbeat: Ticks,
}

/// Per-follower replication progress (in this term).
struct Progress {
    /// The highest index where the follower's log is known to match the leader.
    /// Initialized to 0, increases monotonically.
    match_index: Index,
    /// The next index to replicate to the follower. Initialized to
    /// last_index+1, decreased when probing log mismatches. Always in
    /// the range [match_index+1, last_index+1].
    ///
    /// Entries not yet sent are in the range [next_index, last_index].
    /// Entries not acknowledged are in the range [match_index+1, next_index).
    next_index: Index,
    /// The last read sequence number confirmed by this follower. To avoid stale
    /// reads on leader changes, a read is only served once its sequence number
    /// is confirmed by a quorum.
    read_seq: ReadSequence,
}

impl Progress {
    /// Attempts to advance a follower's match index, returning true if it did.
    /// If next_index is below it, it is advanced to the following index.
    fn advance(&mut self, match_index: Index) -> bool {
        // A match index at or below the current one is not an advance.
        if match_index <= self.match_index {
            return false;
        }
        self.match_index = match_index;
        // Keep next_index strictly above match_index.
        self.next_index = max(self.next_index, match_index + 1);
        true
    }

    /// Attempts to advance a follower's read_seq, returning true if it did.
    fn advance_read(&mut self, read_seq: ReadSequence) -> bool {
        if read_seq <= self.read_seq {
            return false;
        }
        self.read_seq = read_seq;
        true
    }

    /// Attempts to regress a follower's next index to the given index, returning
    /// true if it did. Won't regress below match_index + 1.
fn regress_next(&mut self, next_index: Index) -> bool {
    // Nothing to do if the requested index isn't below the current
    // next_index, or if next_index is already at its floor of match_index + 1.
    if next_index >= self.next_index || self.next_index <= self.match_index + 1 {
        return false;
    }
    // Clamp the regression so next_index stays above match_index.
    self.next_index = max(next_index, self.match_index + 1);
    true
}
}

/// A pending client write request.
struct Write {
    /// The node which submitted the write.
    from: NodeID,
    /// The write request ID.
    id: RequestID,
}

/// A pending client read request.
struct Read {
    /// The sequence number of this read.
    seq: ReadSequence,
    /// The node which submitted the read.
    from: NodeID,
    /// The read request ID.
    id: RequestID,
    /// The read command.
    command: Vec,
}

impl Leader {
    /// Creates a new leader role. Every peer's progress starts with
    /// next_index = last_index + 1, match_index = 0, and read_seq = 0. There
    /// are no pending writes or reads.
    fn new(peers: HashSet, last_index: Index) -> Self {
        let next_index = last_index + 1;
        let progress = peers
            .into_iter()
            .map(|p| (p, Progress { next_index, match_index: 0, read_seq: 0 }))
            .collect();
        Self {
            progress,
            writes: HashMap::new(),
            reads: VecDeque::new(),
            read_seq: 0,
            since_heartbeat: 0,
        }
    }
}

impl Role for Leader {}

impl RawNode {
    /// Transitions the leader into a follower. This can only happen if we
    /// discover a new term, so we become a leaderless follower. Stepping the
    /// received message may then follow a new leader, if there is one.
    ///
    /// Panics if the given term is not later than the current term.
    fn into_follower(mut self, term: Term) -> Result> {
        assert!(term > self.term(), "leader can only become follower in later term");

        info!("Discovered new term {term}");

        // Abort in-flight requests. The client must retry. Sort the requests
        // by ID for test determinism.
        for write in std::mem::take(&mut self.role.writes).into_values().sorted_by_key(|w| w.id) {
            let response = Err(Error::Abort);
            self.send(write.from, Message::ClientResponse { id: write.id, response })?;
        }
        for read in std::mem::take(&mut self.role.reads).into_iter().sorted_by_key(|r| r.id) {
            let response = Err(Error::Abort);
            self.send(read.from, Message::ClientResponse { id: read.id, response })?;
        }

        // Store the new term with no vote cast.
        self.log.set_term_vote(term, None)?;
        let election_timeout = self.random_election_timeout();
        Ok(self.into_role(Follower::new(None, election_timeout)))
    }

    /// Processes an inbound message. Messages from past terms are dropped,
    /// messages from future terms make us a leaderless follower which then
    /// steps the message, and messages in the current term are handled below.
    fn step(mut self, msg: Envelope) -> Result {
        // Past term: outdated peer, drop the message.
        if msg.term < self.term() {
            debug!("Dropping message from past term: {msg:?}");
            return Ok(self.into());
        }
        // Future term: become leaderless follower and step the message.
        if msg.term > self.term() {
            return self.into_follower(msg.term)?.step(msg);
        }

        match msg.message {
            // A follower received our heartbeat and confirms our leadership.
            // We may be able to execute new reads, and we may find that the
            // follower's log is lagging and requires us to catch it up.
            Message::HeartbeatResponse { match_index, read_seq } => {
                let (last_index, _) = self.log.get_last_index();
                assert!(match_index <= last_index, "future match index");
                assert!(read_seq <= self.role.read_seq, "future read sequence number");

                // If the read sequence number advances, try to execute reads.
                if self.progress(msg.from).advance_read(read_seq) {
                    self.maybe_read()?;
                }

                // If the follower didn't match our last index, an append to it
                // must have failed (or it's catching up). Probe it to discover
                // a matching entry and start replicating. Move next_index back
                // to last_index since the follower just told us it doesn't have
                // it (or a previous last_index).
                if match_index == 0 {
                    self.progress(msg.from).regress_next(last_index);
                    self.maybe_send_append(msg.from, true)?;
                }

                // If the follower's match index advances, an append response
                // got lost. Try to commit and apply.
                //
                // We don't need to eagerly send any pending entries, since any
                // proposals made after this heartbeat was sent should have been
                // eagerly replicated in steady state. If not, the next
                // heartbeat will trigger a probe above.
                if self.progress(msg.from).advance(match_index) {
                    self.maybe_commit_and_apply()?;
                }
            }

            // A follower appended our log entries (or a probe found a match).
            // Record its progress and attempt to commit and apply.
            Message::AppendResponse { match_index, reject_index: 0 } if match_index > 0 => {
                let (last_index, _) = self.log.get_last_index();
                assert!(match_index <= last_index, "future match index");

                if self.progress(msg.from).advance(match_index) {
                    self.maybe_commit_and_apply()?;
                }

                // Eagerly send any further pending entries. This may be a
                // successful probe response, or the peer may be lagging and
                // we're catching it up one MAX_APPEND_ENTRIES batch at a time.
                self.maybe_send_append(msg.from, false)?;
            }

            // A follower confirmed our read sequence number. If it advances,
            // try to execute reads.
            Message::ReadResponse { seq } => {
                if self.progress(msg.from).advance_read(seq) {
                    self.maybe_read()?;
                }
            }

            // A follower rejected an append because the base entry in
            // reject_index did not match its log. Probe the previous entry by
            // sending an empty append until we find a common base.
            //
            // This linear probing can be slow with long divergent logs, but we
            // keep it simple. See also section 5.3 in the Raft paper.
            Message::AppendResponse { reject_index, match_index: 0 } if reject_index > 0 => {
                let (last_index, _) = self.log.get_last_index();
                assert!(reject_index <= last_index, "future reject index");

                // If the rejected base index is at or below the match index,
                // the rejection is stale and can be ignored.
                if reject_index <= self.progress(msg.from).match_index {
                    return Ok(self.into());
                }

                // Probe below the reject index, if we haven't already moved
                // next_index below it. This avoids sending duplicate probes
                // (heartbeats will trigger retries if they're lost).
                if self.progress(msg.from).regress_next(reject_index) {
                    self.maybe_send_append(msg.from, true)?;
                }
            }

            // AppendResponses must set either match_index or reject_index.
            Message::AppendResponse { .. } => panic!("invalid message {msg:?}"),

            // A client submitted a write request. Propose it, and wait until
            // it's replicated and applied to the state machine before returning
            // the response to the client.
            Message::ClientRequest { id, request: Request::Write(command) } => {
                let index = self.propose(Some(command))?;
                self.role.writes.insert(index, Write { from: msg.from, id });
                // In a single-node cluster, try to commit and apply right away.
                if self.cluster_size() == 1 {
                    self.maybe_commit_and_apply()?;
                }
            }

            // A client submitted a read request. To ensure linearizability, we
            // must confirm that we are still the leader by sending the read's
            // sequence number and wait for quorum confirmation.
            Message::ClientRequest { id, request: Request::Read(command) } => {
                self.role.read_seq += 1;
                let read = Read { seq: self.role.read_seq, from: msg.from, id, command };
                self.role.reads.push_back(read);
                self.broadcast(Message::Read { seq: self.role.read_seq })?;
                // In a single-node cluster, try to execute the read right away.
                if self.cluster_size() == 1 {
                    self.maybe_read()?;
                }
            }

            // A client submitted a status command.
            Message::ClientRequest { id, request: Request::Status } => {
                let response = self.status().map(Response::Status);
                self.send(msg.from, Message::ClientResponse { id, response })?;
            }

            // Don't grant any votes (we've already voted for ourself).
            Message::Campaign { .. } => {
                self.send(msg.from, Message::CampaignResponse { vote: false })?
            }

            // Votes can come in after we won the election, ignore them.
            Message::CampaignResponse { .. } => {}

            // There can't be another leader in this term.
            Message::Heartbeat { .. } | Message::Append { .. } | Message::Read { .. } => {
                panic!("saw other leader {} in term {}", msg.from, msg.term);
            }

            // Leaders don't proxy client requests.
            Message::ClientResponse { .. } => panic!("unexpected message {msg:?}"),
        }

        Ok(self.into())
    }

    /// Processes a logical clock tick.
    fn tick(mut self) -> Result {
        // Send periodic heartbeats.
        self.role.since_heartbeat += 1;
        if self.role.since_heartbeat >= self.opts.heartbeat_interval {
            self.heartbeat()?;
        }
        Ok(self.into())
    }

    /// Broadcasts a heartbeat to all peers, carrying our last index, commit
    /// index, and latest read sequence number, and resets the heartbeat tick
    /// counter.
    ///
    /// Panics if the last log entry's term is not the current term.
    fn heartbeat(&mut self) -> Result<()> {
        let (last_index, last_term) = self.log.get_last_index();
        let (commit_index, _) = self.log.get_commit_index();
        let read_seq = self.role.read_seq;

        assert_eq!(last_term, self.term(), "leader's last_term not in current term");

        self.role.since_heartbeat = 0;
        self.broadcast(Message::Heartbeat { last_index, commit_index, read_seq })
    }

    /// Proposes a command for consensus by appending it to our log and
    /// replicating it to peers. If successful, it will eventually be committed
    /// and applied to the state machine. Returns the log index of the
    /// appended entry.
    fn propose(&mut self, command: Option>) -> Result {
        let index = self.log.append(command)?;
        // Peers are visited in sorted ID order.
        for peer in self.peers.iter().copied().sorted() {
            // Eagerly send the entry to the peer if it's in steady state and
            // we've sent all previous entries. Otherwise, the peer is lagging
            // and we're probing past entries for a match.
            if index == self.progress(peer).next_index {
                self.maybe_send_append(peer, false)?;
            }
        }
        Ok(index)
    }

    /// Commits new entries that have been replicated to a quorum and applies
    /// them to the state machine, returning results to clients. Returns the
    /// new commit index, or the previous one if nothing was committed.
    fn maybe_commit_and_apply(&mut self) -> Result {
        // Determine the new commit index by quorum. Our own last index counts
        // alongside the followers' match indexes.
        let (last_index, _) = self.log.get_last_index();
        let commit_index = self.quorum_value(
            self.role.progress.values().map(|p| p.match_index).chain([last_index]).collect(),
        );

        // If the commit index doesn't advance, do nothing. We don't assert on
        // this, since the quorum value may regress e.g. following a restart or
        // leader change where followers are initialized with match index 0.
        let (old_index, old_term) = self.log.get_commit_index();
        if commit_index <= old_index {
            return Ok(old_index);
        }

        // We can only safely commit an entry from our own term (see section
        // 5.4.2 in Raft paper).
        match self.log.get(commit_index)? {
            Some(entry) if entry.term == self.term() => {}
            Some(_) => return Ok(old_index),
            None => panic!("commit index {commit_index} missing"),
        }

        // Commit entries.
        self.log.commit(commit_index)?;

        // Apply entries and respond to clients.
        let term = self.term();
        let mut iter = self.log.scan_apply(self.state.get_applied_index());
        while let Some(entry) = iter.next().transpose()? {
            debug!("Applying {entry:?}");
            // Look up (and remove) any pending write registered for this index
            // before the entry is moved into apply().
            let write = self.role.writes.remove(&entry.index);
            let result = self.state.apply(entry);

            if let Some(Write { id, from: to }) = write {
                let message = Message::ClientResponse { id, response: result.map(Response::Write) };
                Self::send_via(&self.tx, Envelope { from: self.id, term, to, message })?;
            }
        }
        drop(iter);

        // If the commit term changed, there may be pending reads waiting for us
        // to commit and apply an entry from our own term. Execute them.
        if old_term != self.term() {
            self.maybe_read()?;
        }

        Ok(commit_index)
    }

    /// Executes any ready read requests, where a quorum have confirmed that
    /// we're still the leader for the read sequences.
    fn maybe_read(&mut self) -> Result<()> {
        if self.role.reads.is_empty() {
            return Ok(());
        }

        // It's only safe to read if we've committed and applied an entry from
        // our own term (the leader appends an entry when elected). Otherwise we
        // may be behind on application and serve stale reads.
        let (commit_index, commit_term) = self.log.get_commit_index();
        let applied_index = self.state.get_applied_index();
        if commit_term < self.term() || applied_index < commit_index {
            return Ok(());
        }

        // Determine the maximum read sequence confirmed by quorum. Our own
        // latest read sequence counts alongside the followers' confirmations.
        let quorum_read_seq = self.quorum_value(
            self.role.progress.values().map(|p| p.read_seq).chain([self.role.read_seq]).collect(),
        );

        // Execute ready reads. The VecDeque is ordered by read_seq, so we
        // can keep pulling until we hit quorum_read_seq.
        while let Some(read) = self.role.reads.front() {
            if read.seq > quorum_read_seq {
                break;
            }
            let read = self.role.reads.pop_front().unwrap();
            let response = self.state.read(read.command).map(Response::Read);
            self.send(read.from, Message::ClientResponse { id: read.id, response })?;
        }

        Ok(())
    }

    /// Sends a batch of pending log entries to a follower, in the
    /// [next_index,last_index] range. Limited by max_append_entries.
    ///
    /// If probe is true, we're trying to find a log index on the follower where
    /// it matches our log. To do this, we send an empty append probe with
    /// base_index of next_index-1. If the follower confirms the base_index
    /// matches its log, the actual entries are sent next -- otherwise,
    /// next_index is decremented and another probe is sent until a match is
    /// found. See section 5.3 in the Raft paper.
    ///
    /// The probe is skipped if the follower is up-to-date (according to
    /// match_index and last_index). If the probe's base_index has already been
    /// confirmed via match_index, an actual append is sent instead.
    ///
    /// Panics if the peer is unknown or its progress violates the
    /// match_index/next_index invariants asserted below.
    fn maybe_send_append(&mut self, peer: NodeID, mut probe: bool) -> Result<()> {
        let (last_index, _) = self.log.get_last_index();
        let progress = self.role.progress.get_mut(&peer).expect("unknown node");
        assert_ne!(progress.next_index, 0, "invalid next_index");
        assert!(progress.next_index > progress.match_index, "invalid next_index <= match_index");
        assert!(progress.match_index <= last_index, "invalid match_index > last_index");
        assert!(progress.next_index <= last_index + 1, "invalid next_index > last_index + 1");

        // If the peer is caught up, there's no point sending an append.
        if progress.match_index == last_index {
            return Ok(());
        }

        // If a probe was requested, but the base_index has already been
        // confirmed via match_index, there is no point in probing. Just send
        // the entries instead.
        probe = probe && progress.next_index > progress.match_index + 1;

        // If there are no pending entries, and this is not a probe, there's
        // nothing more to send until we get a response from the follower.
        if progress.next_index > last_index && !probe {
            return Ok(());
        }

        // Fetch the base and entries.
        let (base_index, base_term) = match progress.next_index {
            0 => panic!("next_index=0 for node {peer}"),
            1 => (0, 0), // first entry, there is no base
            next => self.log.get(next - 1)?.map(|e| (e.index, e.term)).expect("missing base entry"),
        };

        // A probe carries no entries, only the base.
        let entries = match probe {
            false => self
                .log
                .scan(progress.next_index..)
                .take(self.opts.max_append_entries)
                .try_collect()?,
            true => Vec::new(),
        };

        // Optimistically assume the entries will be accepted by the follower,
        // and bump next_index to avoid resending them until a response.
        if let Some(last) = entries.last() {
            progress.next_index = last.index + 1;
        }

        debug!("Replicating {} entries with base {base_index} to {peer}", entries.len());
        self.send(peer, Message::Append { base_index, base_term, entries })
    }

    /// Generates cluster status. The match index map covers every follower
    /// plus ourself, where our own entry is our last log index.
    fn status(&mut self) -> Result {
        Ok(Status {
            leader: self.id,
            term: self.term(),
            match_index: self
                .role
                .progress
                .iter()
                .map(|(id, p)| (*id, p.match_index))
                .chain(std::iter::once((self.id, self.log.get_last_index().0)))
                .collect(),
            commit_index: self.log.get_commit_index().0,
            applied_index: self.state.get_applied_index(),
            storage: self.log.status()?,
        })
    }

    /// Returns a mutable borrow of a node's progress. Convenience method.
    /// Panics if the node ID is unknown.
    fn progress(&mut self, id: NodeID) -> &mut Progress {
        self.role.progress.get_mut(&id).expect("unknown node")
    }
}

/// Most Raft tests are Goldenscripts under src/raft/testscripts.
#[cfg(test)]
mod tests {
    use std::borrow::Borrow;
    use std::error::Error;
    use std::fmt::Write as _;
    use std::path::Path;
    use std::result::Result;

    use crossbeam::channel::Receiver;
    use tempfile::TempDir;
    use test_case::test_case;
    use test_each_file::test_each_path;
    use uuid::Uuid;

    use super::*;
    use crate::encoding::{Key as _, Value as _, bincode};
    use crate::raft::Entry;
    use crate::raft::state::test::{self as teststate, KVCommand, KVResponse};
    use crate::storage;
    use crate::storage::engine::test as testengine;

    // Run goldenscript tests in src/raft/testscripts/node.
    test_each_path! { in "src/raft/testscripts/node" as scripts => test_goldenscript }

    /// Runs a single goldenscript file against a fresh TestRunner, panicking
    /// if the run fails.
    fn test_goldenscript(path: &Path) {
        goldenscript::run(&mut TestRunner::new(), path).expect("goldenscript failed")
    }

    /// Tests RawNode.quorum_size() and cluster_size(). Each case maps a
    /// cluster size to the expected quorum size.
    #[test_case(1 => 1)]
    #[test_case(2 => 2)]
    #[test_case(3 => 2)]
    #[test_case(4 => 3)]
    #[test_case(5 => 3)]
    #[test_case(6 => 4)]
    #[test_case(7 => 4)]
    #[test_case(8 => 5)]
    fn quorum_size(size: usize) -> usize {
        // Node 1 plus peers 2..=size gives a cluster of the requested size.
        let node = RawNode::new_noop(1, (2..=size as NodeID).collect());
        assert_eq!(node.cluster_size(), size);
        node.quorum_size()
    }

    /// Tests RawNode.quorum_value(). The cluster is sized to match the
    /// number of values given.
    #[test_case(vec![1] => 1)]
    #[test_case(vec![1,3,2] => 2)]
    #[test_case(vec![4,1,3,2] => 2)]
    #[test_case(vec![1,1,1,2,2] => 1)]
    #[test_case(vec![1,1,2,2,2] => 2)]
    fn quorum_value(values: Vec) -> i8 {
        let size = values.len();
        let node = RawNode::new_noop(1, (2..=size as NodeID).collect());
        assert_eq!(node.cluster_size(), size);
        node.quorum_value(values)
    }

    /// Test helpers for RawNode.
    impl RawNode {
        /// Creates a noop node, with a noop state machine and transport.
fn new_noop(id: NodeID, peers: HashSet) -> Self {
    // In-memory log storage, a noop state machine, and a channel whose
    // receiving end is discarded. Default options.
    let log = Log::new(Box::new(storage::Memory::new())).expect("log failed");
    let state = teststate::Noop::new();
    let (tx, _) = crossbeam::channel::unbounded();
    RawNode::new(id, peers, log, state, tx, Options::default()).expect("node failed")
}
}

/// Helper macro which calls a closure on the inner RawNode. The same closure
/// expression is passed to every Node variant's arm, via a local helper
/// function `with`. Three forms are provided: by value, `ref`, and `ref mut`.
macro_rules! with_rawnode {
    // Node is moved.
    ($node:expr, $closure:expr) => {{
        fn with(node: RawNode, f: impl FnOnce(RawNode) -> T) -> T {
            f(node)
        }
        match $node {
            Node::Candidate(node) => with(node, $closure),
            Node::Follower(node) => with(node, $closure),
            Node::Leader(node) => with(node, $closure),
        }
    }};
    // Node is borrowed (ref).
    (ref $node:expr, $closure:expr) => {{
        fn with(node: &RawNode, f: impl FnOnce(&RawNode) -> T) -> T {
            f(node)
        }
        match $node {
            &Node::Candidate(ref node) => with(node, $closure),
            &Node::Follower(ref node) => with(node, $closure),
            &Node::Leader(ref node) => with(node, $closure),
        }
    }};
    // Node is mutably borrowed (ref mut).
    (ref mut $node:expr, $closure:expr) => {{
        fn with(node: &mut RawNode, f: impl FnOnce(&mut RawNode) -> T) -> T {
            f(node)
        }
        match $node {
            &mut Node::Candidate(ref mut node) => with(node, $closure),
            &mut Node::Follower(ref mut node) => with(node, $closure),
            &mut Node::Leader(ref mut node) => with(node, $closure),
        }
    }};
}

/// Test helpers for Node.
impl Node {
    /// Consumes the node, returning its log and state machine.
    fn dismantle(self) -> (Log, Box) {
        with_rawnode!(self, |n| (n.log, n.state))
    }

    /// Returns the state machine's applied index.
    fn get_applied_index(&self) -> Index {
        with_rawnode!(ref self, |n| n.state.get_applied_index())
    }

    /// Returns the log's commit index and term.
    fn get_commit_index(&self) -> (Index, Term) {
        with_rawnode!(ref self, |n| n.log.get_commit_index())
    }

    /// Returns the log's last index and term.
    fn get_last_index(&self) -> (Index, Term) {
        with_rawnode!(ref self, |n| n.log.get_last_index())
    }

    /// Returns the log's current term and vote.
    fn get_term_vote(&self) -> (Term, Option) {
        with_rawnode!(ref self, |n| n.log.get_term_vote())
    }

    /// Returns a clone of the node's options.
    fn options(&self) -> Options {
        with_rawnode!(ref self, |n| n.opts.clone())
    }

    /// Returns a clone of the node's peer set.
    fn peers(&self) -> HashSet {
        with_rawnode!(ref self, |n| n.peers.clone())
    }

    /// Executes a read command directly against the node's state machine.
    fn read(&self, command: Vec) -> crate::error::Result> {
        with_rawnode!(ref self, |n| n.state.read(command))
    }

    /// Scans and collects the node's entire log.
    fn scan_log(&mut self) -> crate::error::Result> {
        with_rawnode!(ref mut self, |n| n.log.scan(..).collect())
    }
}

/// Runs Raft goldenscript tests. See run() for available commands.
struct TestRunner {
    /// IDs of all cluster nodes, in order.
    ids: Vec,
    /// The cluster nodes, keyed by node ID.
    nodes: HashMap,
    /// Outbound send queues from each node.
    nodes_rx: HashMap>,
    /// Inbound receive queues to each node, to be stepped.
    nodes_pending: HashMap>,
    /// Applied log entries for each node, after state machine application.
    applied_rx: HashMap>,
    /// Network partitions (sender → receivers). A symmetric (bidirectional)
    /// partition needs an entry from each side.
    disconnected: HashMap>,
    /// In-flight client requests.
    requests: HashMap,
    /// The request ID to use for the next client request.
    next_request_id: u64,
    /// Temporary directory (deleted when dropped).
    tempdir: TempDir,
}

impl goldenscript::Runner for TestRunner {
    /// Runs a goldenscript command, returning the output it produced.
    /// Unknown command names yield an error.
    fn run(&mut self, command: &goldenscript::Command) -> Result> {
        let mut output = String::new();
        match command.name.as_str() {
            // campaign [ID...]
            // Transition the given nodes to candidates and campaign.
            "campaign" => {
                let ids = self.parse_ids_or_all(&command.args)?;
                self.campaign(&ids, &mut output)?;
            }

            // cluster nodes=N [leader=ID] [heartbeat_interval=N] [election_timeout=N] [max_append_entries=N]
            // Creates a new Raft cluster.
            "cluster" => {
                let mut opts = Options::default();
                let mut args = command.consume_args();
                let nodes = args.lookup_parse("nodes")?.unwrap_or(0);
                let leader = args.lookup_parse("leader")?;
                if let Some(heartbeat_interval) = args.lookup_parse("heartbeat_interval")? {
                    opts.heartbeat_interval = heartbeat_interval;
                };
                // A fixed election timeout is expressed as a range holding
                // only that single value.
                if let Some(election_timeout) = args.lookup_parse("election_timeout")? {
                    opts.election_timeout_range = election_timeout..election_timeout + 1;
                }
                if let Some(max_append_entries) = args.lookup_parse("max_append_entries")? {
                    opts.max_append_entries = max_append_entries;
                }
                args.reject_rest()?;
                self.cluster(nodes, leader, opts, &mut output)?;
            }

            // deliver [from=ID] [ID...]
            // Delivers (steps) pending messages to the given nodes. If from
            // is given, only messages from the given node are delivered, the
            // others are left pending.
            "deliver" => {
                let mut args = command.consume_args();
                let from = args.lookup_parse("from")?;
                let ids = self.parse_ids_or_all(&args.rest())?;
                self.deliver(&ids, from, &mut output)?;
            }

            // get ID KEY
            // Sends a client request to the given node to read the given
            // key from the state machine (key/value store).
            "get" => {
                let mut args = command.consume_args();
                let id = args.next_pos().ok_or("must specify node ID")?.parse()?;
                let key = args.next_pos().ok_or("must specify key")?.value.clone();
                args.reject_rest()?;
                let request = Request::Read(KVCommand::Get { key }.encode());
                self.request(id, request, &mut output)?;
            }

            // heal [ID...]
            // Heals all network partitions for the given nodes.
            "heal" => {
                let ids = self.parse_ids_or_all(&command.args)?;
                self.heal(&ids, &mut output)?;
            }

            // heartbeat ID...
            // Sends a heartbeat from the given leader nodes.
            "heartbeat" => {
                let ids = self.parse_ids_or_all(&command.args)?;
                self.heartbeat(&ids, &mut output)?;
            }

            // log [ID...]
            // Outputs the current Raft log for the given nodes.
            "log" => {
                let ids = self.parse_ids_or_all(&command.args)?;
                self.log(&ids, &mut output)?;
            }

            // partition ID...
            // Partitions the given nodes away from the rest of the cluster.
            // They can still communicate with each other, unless they were
            // previously partitioned.
            "partition" => {
                let ids = self.parse_ids_or_error(&command.args)?;
                self.partition(&ids, &mut output)?;
            }

            // put ID KEY=VALUE
            // Sends a client request to the given node to write a key/value
            // pair to the state machine (key/value store).
            "put" => {
                let mut args = command.consume_args();
                let id = args.next_pos().ok_or("must specify node ID")?.parse()?;
                let kv = args.next_key().ok_or("must specify key/value pair")?.clone();
                let (key, value) = (kv.key.unwrap(), kv.value);
                args.reject_rest()?;
                let request = Request::Write(KVCommand::Put { key, value }.encode());
                self.request(id, request, &mut output)?;
            }

            // restart [commit_index=INDEX] [applied_index=INDEX] [ID...]
            // Restarts the given nodes (or all nodes). They retain their
            // log and state, unless applied_index is given (which reverts
            // the state machine to the given index, or 0 if empty).
            // commit_index may be given to regress the commit index (it
            // is not flushed to durable storage).
            "restart" => {
                let mut args = command.consume_args();
                let applied_index = args.lookup_parse("applied_index")?;
                let commit_index = args.lookup_parse("commit_index")?;
                let ids = self.parse_ids_or_all(&args.rest())?;
                self.restart(&ids, commit_index, applied_index, &mut output)?;
            }

            // stabilize [heartbeat=BOOL] [ID...]
            // Stabilizes the given nodes by repeatedly delivering messages
            // until no more messages are pending. If heartbeat is true, also
            // emits a heartbeat from the leader and restabilizes, e.g. to
            // propagate the commit index.
            "stabilize" => {
                let mut args = command.consume_args();
                let heartbeat = args.lookup_parse("heartbeat")?.unwrap_or(false);
                let ids = self.parse_ids_or_all(&args.rest())?;
                self.stabilize(&ids, heartbeat, &mut output)?;
            }

            // state [ID...]
            // Prints the current state machine contents on the given nodes.
            "state" => {
                let ids = self.parse_ids_or_all(&command.args)?;
                self.state(&ids, &mut output)?;
            }

            // status [request=BOOL] [ID...]
            // Prints the current node status of the given nodes. If request
            // is true, sends a status client request to a single node,
            // otherwise fetches status directly from each node.
            "status" => {
                let mut args = command.consume_args();
                let request = args.lookup_parse("request")?.unwrap_or(false);
                let ids = self.parse_ids_or_all(&args.rest())?;
                if request {
                    // A status request needs exactly one target node.
                    let [id] = *ids.as_slice() else {
                        return Err("request=true requires 1 node ID".into());
                    };
                    self.request(id, Request::Status, &mut output)?;
                } else {
                    self.status(&ids, &mut output)?;
                }
            }

            // step ID JSON
            // Steps a manually generated JSON message on the given node.
            "step" => {
                let mut args = command.consume_args();
                let id = args.next_pos().ok_or("node ID not given")?.parse()?;
                let raw = &args.next_pos().ok_or("message not given")?.value;
                let msg = serde_json::from_str(raw)?;
                args.reject_rest()?;
                self.transition(id, |n| n.step(msg), &mut output)?;
            }

            // tick [ID...]
            // Ticks the given nodes.
            "tick" => {
                let ids = self.parse_ids_or_all(&command.args)?;
                for id in ids {
                    self.transition(id, |n| n.tick(), &mut output)?;
                }
            }

            name => return Err(format!("unknown command {name}").into()),
        }
        Ok(output)
    }
}

impl TestRunner {
    /// Creates an empty test runner with no nodes. Request IDs start at 1,
    /// and a temporary directory prefixed "toydb" is created.
    fn new() -> Self {
        Self {
            ids: Vec::new(),
            nodes: HashMap::new(),
            nodes_rx: HashMap::new(),
            nodes_pending: HashMap::new(),
            applied_rx: HashMap::new(),
            disconnected: HashMap::new(),
            requests: HashMap::new(),
            next_request_id: 1,
            tempdir: TempDir::with_prefix("toydb").expect("tempdir failed"),
        }
    }

    /// Creates a new empty node and inserts it.
fn add_node( &mut self, id: NodeID, peers: HashSet, opts: Options, ) -> Result<(), Box> { // Use both a BitCask and a Memory engine, and mirror operations // across them, for added engine test coverage. let path = self.tempdir.path().join(format!("{id}.log")); let bitcask = storage::BitCask::new(path).expect("bitcask failed"); let memory = storage::Memory::new(); let engine = testengine::Mirror::new(bitcask, memory); let log = Log::new(Box::new(engine))?; let state = teststate::KV::new(); self.add_node_with(id, peers, log, state, opts) } /// Creates a new node with the given log and state and inserts it. fn add_node_with( &mut self, id: NodeID, peers: HashSet, log: Log, state: Box, opts: Options, ) -> Result<(), Box> { let (node_tx, node_rx) = crossbeam::channel::unbounded(); let (applied_tx, applied_rx) = crossbeam::channel::unbounded(); let state = teststate::Emit::new(state, applied_tx); self.nodes.insert(id, Node::new(id, peers, log, state, node_tx, opts)?); self.nodes_rx.insert(id, node_rx); self.nodes_pending.insert(id, Vec::new()); self.applied_rx.insert(id, applied_rx); self.disconnected.insert(id, HashSet::new()); Ok(()) } /// Transitions nodes to candidates and campaign in a new term. fn campaign(&mut self, ids: &[NodeID], output: &mut String) -> Result<(), Box> { let campaign = |node| match node { Node::Candidate(mut node) => { node.campaign()?; Ok(node.into()) } Node::Follower(node) => Ok(node.into_candidate()?.into()), Node::Leader(node) => { let term = node.term(); Ok(node.into_follower(term + 1)?.into_candidate()?.into()) } }; for id in ids.iter().copied() { self.transition(id, campaign, output)?; } Ok(()) } /// Creates a Raft cluster. 
fn cluster( &mut self, nodes: u8, leader: Option, opts: Options, output: &mut String, ) -> Result<(), Box> { if !self.ids.is_empty() { return Err("cluster already exists".into()); } if nodes == 0 { return Err("cluster can't have 0 nodes".into()); } self.ids = (1..=nodes).collect(); for id in self.ids.clone() { let peers = self.ids.iter().copied().filter(|i| i != &id).collect(); self.add_node(id, peers, opts.clone())?; } // Promote leader if requested. Suppress output. if let Some(id) = leader { let quiet = &mut String::new(); let Some(Node::Follower(node)) = self.nodes.remove(&id) else { return Err(format!("invalid leader {id}").into()); }; self.nodes.insert(id, node.into_candidate()?.into_leader()?.into()); self.receive(id, quiet)?; self.stabilize(&self.ids.clone(), true, quiet)?; } // Drain any initial applied entries. for applied_rx in self.applied_rx.values_mut() { while applied_rx.try_recv().is_ok() {} } // Output final cluster status. self.status(&self.ids, output) } /// Delivers pending messages to the given nodes. If from is given, only /// delivers messages from that node. Returns the number of delivered /// messages. fn deliver( &mut self, ids: &[NodeID], from: Option, output: &mut String, ) -> Result> { // Take a snapshot of the pending queues before delivering any // messages. This avoids outbound messages in response to delivery // being delivered to higher node IDs in the same loop, which can // give unintuitive results. 
let mut step = Vec::new(); for id in ids.iter().copied() { let Some(pending) = self.nodes_pending.remove(&id) else { return Err(format!("unknown node {id}").into()); }; let (deliver, requeue) = pending.into_iter().partition(|msg| from.is_none() || from == Some(msg.from)); self.nodes_pending.insert(id, requeue); step.extend(deliver); } let delivered = step.len(); for msg in step { self.transition(msg.to, |node| node.step(msg), output)?; } Ok(delivered) } /// Heals the given partitioned nodes, restoring connectivity with all /// other nodes. fn heal(&mut self, ids: &[NodeID], output: &mut String) -> Result<(), Box> { for id in ids.iter().copied() { self.disconnected.insert(id, HashSet::new()); for peers in self.disconnected.values_mut() { peers.remove(&id); } } output.push_str(&Self::format_disconnected(&self.disconnected)); Ok(()) } /// Emits a heartbeat from the given leader nodes. fn heartbeat(&mut self, ids: &[NodeID], output: &mut String) -> Result<(), Box> { for id in ids.iter().copied() { let Some(Node::Leader(leader)) = self.nodes.get_mut(&id) else { return Err(format!("{id} is not a leader").into()); }; leader.heartbeat()?; self.receive(id, output)?; } Ok(()) } /// Outputs the current log contents for the given nodes. fn log(&mut self, ids: &[NodeID], output: &mut String) -> Result<(), Box> { for id in ids { let node = self.nodes.get_mut(id).ok_or(format!("unknown node {id}"))?; let nodefmt = Self::format_node(node); let (last_index, last_term) = node.get_last_index(); let (commit_index, commit_term) = node.get_commit_index(); let (term, vote) = node.get_term_vote(); writeln!( output, "{nodefmt} term={term} last={last_index}@{last_term} commit={commit_index}@{commit_term} vote={vote:?}", )?; for entry in node.scan_log()? { writeln!(output, "{nodefmt} entry {}", Self::format_entry(&entry))?; } } Ok(()) } /// Partitions the given nodes from all other nodes in the cluster /// (bidirectionally). 
The given nodes can communicate with each other /// unless they were previously partitioned. fn partition(&mut self, ids: &[NodeID], output: &mut String) -> Result<(), Box> { let ids = HashSet::::from_iter(ids.iter().copied()); for id in ids.iter().copied() { for peer in self.ids.iter().copied().filter(|p| !ids.contains(p)) { self.disconnected.entry(id).or_default().insert(peer); self.disconnected.entry(peer).or_default().insert(id); } } output.push_str(&Self::format_disconnected(&self.disconnected)); Ok(()) } /// Receives outbound messages from a node, prints them, and queues them /// for delivery. Returns the number of received messages. fn receive(&mut self, id: NodeID, output: &mut String) -> Result> { let rx = self.nodes_rx.get_mut(&id).ok_or(format!("unknown node {id}"))?; let mut count = 0; for msg in rx.try_iter() { count += 1; let (from, term, to) = (msg.from, msg.term, msg.to); // simplify formatting let msgfmt = Self::format_message(&msg.message); // If the peer is disconnected, drop the message and output it. if self.disconnected[&msg.from].contains(&msg.to) { writeln!( output, "n{from}@{term} ⇥ n{to} {}", Self::format_strikethrough(&msgfmt), )?; continue; } // Intercept and output client responses. if msg.from == msg.to { let Message::ClientResponse { id, response } = &msg.message else { return Err(format!("invalid self-addressed message: {msg:?}").into()); }; writeln!(output, "n{from}@{term} → c{to} {msgfmt}")?; let request = &self.requests.remove(id).ok_or("unknown request id")?; writeln!( output, "c{to}@{term} {} ⇒ {}", Self::format_request(request), Self::format_response(response), )?; continue; } // Output the message and queue it for delivery. writeln!(output, "n{from}@{term} → n{to} {msgfmt}")?; self.nodes_pending.get_mut(&msg.to).ok_or(format!("unknown node {to}"))?.push(msg); } Ok(count) } /// Submits a client request via the given node. 
fn request( &mut self, id: NodeID, request: Request, output: &mut String, ) -> Result<(), Box> { let request_id = Uuid::from_u64_pair(0, self.next_request_id); self.next_request_id += 1; self.requests.insert(request_id, request.clone()); let term = self.nodes.get(&id).ok_or(format!("unknown node {id}"))?.term(); let msg = Envelope { from: id, to: id, term, message: Message::ClientRequest { id: request_id, request }, }; writeln!(output, "c{id}@{term} → n{id} {}", Self::format_message(&msg.message))?; self.transition(id, |n| n.step(msg), output) } /// Restarts the given nodes. If commit_index or applied_index are /// given, the log commit index or state machine will regress. fn restart( &mut self, ids: &[NodeID], commit_index: Option, applied_index: Option, output: &mut String, ) -> Result<(), Box> { for id in ids.iter().copied() { let node = self.nodes.remove(&id).ok_or(format!("unknown node {id}"))?; let peers = node.peers(); let opts = node.options(); let (log, mut state) = node.dismantle(); let mut log = Log::new(log.engine)?; // reset log // If requested, regress the commit index. if let Some(commit_index) = commit_index { if commit_index > log.get_commit_index().0 { return Err(format!("commit_index={commit_index} beyond current").into()); } let commit_term = match log.get(commit_index)? { Some(e) => e.term, None if commit_index == 0 => 0, None => return Err(format!("unknown commit_index={commit_index}").into()), }; log.engine.set( &crate::raft::log::Key::CommitIndex.encode(), bincode::serialize(&(commit_index, commit_term)), )?; // Reset the log again. log = Log::new(log.engine)?; } // If requested, wipe the state machine and reapply up to the // requested applied index. if let Some(applied_index) = applied_index { if applied_index > log.get_commit_index().0 { return Err(format!("applied_index={applied_index} beyond commit").into()); } state = teststate::KV::new(); let mut scan = log.scan(..=applied_index); while let Some(entry) = scan.next().transpose()? 
{ _ = state.apply(entry); // apply errors are returned to client } assert_eq!(state.get_applied_index(), applied_index, "wrong applied index"); } // Add node, and run a noop transition to output applied entries. self.add_node_with(id, peers, log, state, opts)?; self.transition(id, Ok, output)?; } // Output restarted node status. self.status(ids, output) } /// Stabilizes the given nodes by repeatedly delivering pending messages /// until no new messages are generated. If heartbeat is true, leaders /// then emit a heartbeat and restabilize again, e.g. to propagate the /// commit index. fn stabilize( &mut self, ids: &[NodeID], heartbeat: bool, output: &mut String, ) -> Result<(), Box> { while self.deliver(ids, None, output)? > 0 {} // If requested, heartbeat the current leader (with the highest // term) and re-stabilize the nodes. if heartbeat { let leader = self .nodes .values() .sorted_by_key(|n| n.term()) .rev() .find(|n| matches!(n, Node::Leader(_))); if let Some(leader) = leader { self.heartbeat(&[leader.id()], output)?; self.stabilize(ids, false, output)?; } } Ok(()) } /// Outputs the current state machine for the given nodes. fn state(&mut self, ids: &[NodeID], output: &mut String) -> Result<(), Box> { for id in ids { let node = self.nodes.get_mut(id).ok_or(format!("unknown node {id}"))?; let nodefmt = Self::format_node(node); let applied_index = node.get_applied_index(); let raw = node.read(KVCommand::Scan.encode())?; let KVResponse::Scan(kvs) = KVResponse::decode(&raw)? else { return Err("unexpected scan response".into()); }; writeln!(output, "{nodefmt} applied={applied_index}")?; for (key, value) in kvs { writeln!(output, "{nodefmt} state {key}={value}")?; } } Ok(()) } /// Outputs status for the given nodes. 
fn status(&self, ids: &[NodeID], output: &mut String) -> Result<(), Box<dyn Error>> {
    for id in ids {
        let node = self.nodes.get(id).ok_or(format!("unknown node {id}"))?;
        let (last_index, last_term) = node.get_last_index();
        let (commit_index, commit_term) = node.get_commit_index();
        let applied_index = node.get_applied_index();
        write!(
            output,
            "{node} last={last_index}@{last_term} commit={commit_index}@{commit_term} applied={applied_index}",
            node = Self::format_node_role(node)
        )?;
        // Leaders also output follower replication progress, sorted by ID.
        if let Node::Leader(leader) = node {
            let progress = leader
                .role
                .progress
                .iter()
                .sorted_by_key(|(id, _)| *id)
                .map(|(id, pr)| format!("{id}:{}→{}", pr.match_index, pr.next_index))
                .join(" ");
            write!(output, " progress={{{progress}}}")?
        }
        output.push('\n');
    }
    Ok(())
}

/// Applies a node transition (typically a step or tick), and outputs
/// relevant changes.
///
/// The node is removed from the node map while the transition runs and
/// reinserted afterwards; if the transition errors, it is not reinserted.
fn transition(
    &mut self,
    id: NodeID,
    f: impl FnOnce(Node) -> crate::error::Result<Node>,
    output: &mut String,
) -> Result<(), Box<dyn Error>> {
    let mut node = self.nodes.remove(&id).ok_or(format!("unknown node {id}"))?;

    // Fetch pre-transition info.
    let old_noderole = Self::format_node_role(&node);
    let (old_commit_index, _) = node.get_commit_index();
    let mut old_entries = node.scan_log()?.into_iter();

    // Apply the transition.
    node = f(node)?;

    // Fetch post-transition info.
    let nodefmt = Self::format_node(&node);
    let noderole = Self::format_node_role(&node);
    let (commit_index, commit_term) = node.get_commit_index();

    // Appended entries are those following the common prefix of the old and
    // new logs, compared pairwise by term.
    let entries = node.scan_log()?.into_iter();
    let appended: Vec<Entry> = entries
        .skip_while(|e| Some(e.term) == old_entries.next().map(|e| e.term))
        .collect();

    self.nodes.insert(id, node);

    // Output relevant changes.
    if old_noderole != noderole {
        writeln!(output, "{old_noderole} ⇨ {noderole}")?
    }
    for entry in appended {
        writeln!(output, "{nodefmt} append {}", Self::format_entry(&entry))?
    }
    if old_commit_index != commit_index {
        writeln!(output, "{nodefmt} commit {commit_index}@{commit_term}")?;
    }
    for entry in self.applied_rx[&id].try_iter() {
        writeln!(output, "{nodefmt} apply {}", Self::format_entry(&entry))?
    }

    // Receive any outbound messages.
    self.receive(id, output)?;

    Ok(())
}

/// Parses node IDs from the given argument values. Errors on key/value
/// arguments. Can take both [Argument] and [&Argument].
fn parse_ids<A>(&self, args: &[A]) -> Result<Vec<NodeID>, Box<dyn Error>>
where
    A: Borrow<Argument>,
{
    let mut ids = Vec::new();
    for arg in args.iter().map(|a| a.borrow()) {
        if let Some(key) = &arg.key {
            return Err(format!("unknown argument '{key}'").into());
        }
        let id = arg.parse()?;
        if !self.nodes.contains_key(&id) {
            return Err(format!("unknown node {id}").into());
        }
        ids.push(id)
    }
    Ok(ids)
}

/// Parses node IDs from the given argument values, or returns all node
/// IDs if none were given.
fn parse_ids_or_all<A>(&self, args: &[A]) -> Result<Vec<NodeID>, Box<dyn Error>>
where
    A: Borrow<Argument>,
{
    let ids = self.parse_ids(args)?;
    if ids.is_empty() {
        return Ok(self.ids.clone());
    }
    Ok(ids)
}

/// Parses node IDs from the given argument values, or errors if none.
fn parse_ids_or_error<A>(&self, args: &[A]) -> Result<Vec<NodeID>, Box<dyn Error>>
where
    A: Borrow<Argument>,
{
    let ids = self.parse_ids(args)?;
    if ids.is_empty() {
        return Err("node ID not given".into());
    }
    Ok(ids)
}

/// Formats network partitions.
///
/// The input maps each node to the set of peers it is disconnected from.
/// Mutual disconnects are shown as `⇹`, one-way disconnects as `⇥`.
fn format_disconnected(disconnected: &HashMap<NodeID, HashSet<NodeID>>) -> String {
    // Return early if the cluster is fully connected.
    if disconnected.iter().all(|(_, peers)| peers.is_empty()) {
        return format!(
            "{} fully connected\n",
            disconnected.keys().sorted().map(|id| format!("n{id}")).join(" ")
        );
    }

    let mut output = String::new();

    // Separate symmetric and asymmetric partitions.
    let mut symmetric: HashMap<NodeID, HashSet<NodeID>> = HashMap::new();
    let mut asymmetric: HashMap<NodeID, HashSet<NodeID>> = HashMap::new();
    for (id, peers) in disconnected {
        for peer in peers {
            if disconnected[peer].contains(id) {
                symmetric.entry(*id).or_default().insert(*peer);
            } else {
                asymmetric.entry(*id).or_default().insert(*peer);
            }
        }
    }

    // Anchor the symmetric partitions at the node with the largest number
    // of disconnects, otherwise the smallest (first) ID.
    for (id, peers) in &symmetric.clone() {
        for peer in peers {
            // Recompute the peer set sizes for each iteration, since we
            // modify the peer set below.
            let len = symmetric.get(id).map(|p| p.len()).unwrap_or(0);
            let peer_len = symmetric.get(peer).map(|p| p.len()).unwrap_or(0);
            // If this peer set is the smallest (or we're the higher ID),
            // remove the entry. We may no longer be in the map.
            if (len < peer_len || len == peer_len && id > peer)
                && let Some(peers) = symmetric.get_mut(id)
            {
                peers.remove(peer);
                if peers.is_empty() {
                    symmetric.remove(id);
                }
            }
        }
    }

    // The values (HashSets) correspond to the RHS of a partition. Let's
    // group the LHS of the partition as well, from smallest to largest,
    // separately for symmetric and asymmetric partitions. The vector
    // contains (LHS, RHS, symmetric) groupings for each partition.
    let mut grouped: Vec<(HashSet<NodeID>, HashSet<NodeID>, bool)> = Vec::new();
    for (id, peers, symm) in symmetric
        .into_iter()
        .map(|(i, p)| (i, p, true))
        .chain(asymmetric.into_iter().map(|(i, p)| (i, p, false)))
        .sorted_by_key(|(id, _, symm)| (*id, !symm))
    {
        // Look for an existing LHS group with the same RHS, and insert
        // this node into it. Otherwise, create a new LHS group.
        match grouped.iter_mut().find(|(_, rhs, s)| peers == *rhs && symm == *s) {
            Some((lhs, _, _)) => _ = lhs.insert(id),
            None => grouped.push((HashSet::from([id]), peers, symm)),
        }
    }

    // Display the groups.
    for (lhs, rhs, symm) in grouped {
        let lhs = lhs.iter().sorted().map(|id| format!("n{id}")).join(" ");
        let sep = if symm { '⇹' } else { '⇥' };
        let rhs = rhs.iter().sorted().map(|id| format!("n{id}")).join(" ");
        writeln!(output, "{lhs} {sep} {rhs}").unwrap();
    }

    output
}

/// Formats an entry as `index@term command`, with `None` for noop entries.
/// Panics if the command can't be decoded as a KVCommand.
fn format_entry(entry: &Entry) -> String {
    let command = match entry.command.as_ref() {
        Some(raw) => KVCommand::decode(raw).expect("invalid command").to_string(),
        None => "None".to_string(),
    };
    format!("{index}@{term} {command}", index = entry.index, term = entry.term)
}

/// Formats a message.
///
/// Panics on an AppendResponse where match_index and reject_index are
/// either both zero or both set.
fn format_message(msg: &Message) -> String {
    match msg {
        Message::Campaign { last_index, last_term } => {
            format!("Campaign last={last_index}@{last_term}")
        }
        Message::CampaignResponse { vote } => {
            format!("CampaignResponse vote={vote}")
        }
        Message::Heartbeat { last_index, commit_index, read_seq } => {
            format!(
                "Heartbeat last_index={last_index} commit_index={commit_index} read_seq={read_seq}"
            )
        }
        Message::HeartbeatResponse { match_index, read_seq } => {
            format!("HeartbeatResponse match_index={match_index} read_seq={read_seq}")
        }
        Message::Append { base_index, base_term, entries } => {
            let ent = entries.iter().map(|e| format!("{}@{}", e.index, e.term)).join(" ");
            format!("Append base={base_index}@{base_term} [{ent}]")
        }
        Message::AppendResponse { match_index, reject_index } => {
            match (match_index, reject_index) {
                (0, 0) => panic!("match_index and reject_index both 0"),
                (match_index, 0) => format!("AppendResponse match_index={match_index}"),
                (0, reject_index) => format!("AppendResponse reject_index={reject_index}"),
                (_, _) => panic!("match_index and reject_index both set"),
            }
        }
        Message::Read { seq } => {
            format!("Read seq={seq}")
        }
        Message::ReadResponse { seq } => {
            format!("ReadResponse seq={seq}")
        }
        Message::ClientRequest { id, request } => {
            // Request IDs are hex-encoded with leading zero bytes trimmed.
            format!(
                "ClientRequest id=0x{} {}",
                hex::encode(id).trim_start_matches("00"),
                match request {
                    Request::Read(v) => format!("read 0x{}", hex::encode(v)),
                    Request::Write(v) => format!("write 0x{}", hex::encode(v)),
                    Request::Status => "status".to_string(),
                }
            )
        }
        Message::ClientResponse { id, response } => {
            format!(
                "ClientResponse id=0x{} {}",
                hex::encode(id).trim_start_matches("00"),
                match response {
                    Ok(Response::Read(v)) => format!("read 0x{}", hex::encode(v)),
                    Ok(Response::Write(v)) => format!("write 0x{}", hex::encode(v)),
                    Ok(Response::Status(v)) => format!("status {v:?}"),
                    Err(error) => format!("Error::{error:#?}"),
                }
            )
        }
    }
}

/// Formats a node identifier, as `n{id}@{term}`.
fn format_node(node: &Node) -> String {
    format!("n{id}@{term}", id = node.id(), term = node.term())
}

/// Formats a node identifier with role. Followers include their leader,
/// if any.
fn format_node_role(node: &Node) -> String {
    let role = match node {
        Node::Candidate(_) => "candidate".to_string(),
        Node::Follower(node) => {
            let leader = node.role.leader.map(|id| format!("n{id}")).unwrap_or_default();
            format!("follower({leader})")
        }
        Node::Leader(_) => "leader".to_string(),
    };
    format!("{node} {role}", node = Self::format_node(node))
}

/// Formats a request. Panics if a read/write payload isn't a KVCommand.
fn format_request(request: &Request) -> String {
    match request {
        Request::Read(c) | Request::Write(c) => KVCommand::decode(c).unwrap().to_string(),
        Request::Status => "status".to_string(),
    }
}

/// Formats a response. Panics if a read/write payload isn't a KVResponse.
fn format_response(response: &crate::error::Result<Response>) -> String {
    match response {
        Ok(Response::Read(r) | Response::Write(r)) => {
            KVResponse::decode(r).unwrap().to_string()
        }
        Ok(Response::Status(status)) => format!("{status:#?}"),
        Err(error) => format!("Error::{error:?} ({error})"),
    }
}

/// Strike-through formats the given string using a Unicode combining stroke.
fn format_strikethrough(s: &str) -> String {
    s.chars().flat_map(|c| [c, '\u{0336}']).collect()
}
} // impl TestRunner
} // enclosing test module

================================================ FILE: src/raft/state.rs ================================================
use super::{Entry, Index};
use crate::error::Result;

/// A Raft-managed state machine.
Raft itself does not care what the state /// machine is, nor what the commands and results do -- it will simply apply /// arbitrary binary commands sequentially from the Raft log, returning an /// arbitrary binary result to the client. /// /// Since commands are applied identically across all nodes, they must be /// deterministic and yield the same state and result across all nodes too. /// Otherwise, the nodes will diverge, such that different nodes will produce /// different results. /// /// Write commands (`Request::Write`) are replicated and applied on all nodes /// via `State::apply`. The state machine must keep track of the last applied /// index and return it via `State::get_applied_index`. Read commands /// (`Request::Read`) are only executed on a single node via `State::read` and /// must not make any state changes. pub trait State: Send { /// Returns the last applied log index from the state machine. /// /// This must correspond to the current state of the state machine, since it /// determines which command to apply next. In particular, a node crash may /// result in partial command application or data loss, which must be /// handled appropriately. fn get_applied_index(&self) -> Index; /// Applies a log entry to the state machine, returning a client result. /// Errors are considered applied and propagated back to the client. /// /// This is executed on all nodes, so the result must be deterministic: it /// must yield the same state and result on all nodes, even if the command /// is reapplied following a node crash. /// /// Any non-deterministic apply error (e.g. an IO error) must panic and /// crash the node -- if it instead returns an error to the client, the /// command is considered applied and node states will diverge. The state /// machine is responsible for panicing when appropriate. /// /// The entry may contain a noop command, which is committed by Raft during /// leader changes. 
This still needs to be applied to the state machine to /// properly update the applied index, and should return an empty result. fn apply(&mut self, entry: Entry) -> Result>; /// Executes a read command in the state machine, returning a client result. /// Errors are also propagated back to the client. /// /// This is only executed on a single node, so it must not result in any /// state changes (i.e. it must not write). fn read(&self, command: Vec) -> Result>; } /// Test helper state machines. #[cfg(test)] pub mod test { use std::collections::BTreeMap; use std::fmt::Display; use crossbeam::channel::Sender; use itertools::Itertools as _; use serde::{Deserialize, Serialize}; use super::*; use crate::encoding::{self, Value as _}; /// Wraps a state machine and emits applied entries to the provided channel. pub struct Emit { inner: Box, tx: Sender, } impl Emit { pub fn new(inner: Box, tx: Sender) -> Box { Box::new(Self { inner, tx }) } } impl State for Emit { fn get_applied_index(&self) -> Index { self.inner.get_applied_index() } fn apply(&mut self, entry: Entry) -> Result> { let response = self.inner.apply(entry.clone())?; self.tx.send(entry)?; Ok(response) } fn read(&self, command: Vec) -> Result> { self.inner.read(command) } } /// A simple string key/value store. Takes KVCommands. pub struct KV { applied_index: Index, data: BTreeMap, } impl KV { pub fn new() -> Box { Box::new(Self { applied_index: 0, data: BTreeMap::new() }) } } impl State for KV { fn get_applied_index(&self) -> Index { self.applied_index } fn apply(&mut self, entry: Entry) -> Result> { let command = entry.command.as_deref().map(KVCommand::decode).transpose()?; let response = match command { Some(KVCommand::Put { key, value }) => { self.data.insert(key, value); KVResponse::Put(entry.index).encode() } Some(c @ (KVCommand::Get { .. 
} | KVCommand::Scan)) => { panic!("{c} submitted as write command") } None => Vec::new(), }; self.applied_index = entry.index; Ok(response) } fn read(&self, command: Vec) -> Result> { match KVCommand::decode(&command)? { KVCommand::Get { key } => { Ok(KVResponse::Get(self.data.get(&key).cloned()).encode()) } KVCommand::Scan => Ok(KVResponse::Scan(self.data.clone()).encode()), c @ KVCommand::Put { .. } => panic!("{c} submitted as read command"), } } } /// A KV command. Returns the corresponding KVResponse. #[derive(Serialize, Deserialize)] pub enum KVCommand { /// Fetches the value of the given key. Get { key: String }, /// Stores the given key/value pair, returning the applied index. Put { key: String, value: String }, /// Returns all key/value pairs. Scan, } impl encoding::Value for KVCommand {} impl Display for KVCommand { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Self::Get { key } => write!(f, "get {key}"), Self::Put { key, value } => write!(f, "put {key}={value}"), Self::Scan => write!(f, "scan"), } } } /// A KVCommand response. #[derive(Serialize, Deserialize)] pub enum KVResponse { /// Get returns the key's value, or None if it does not exist. Get(Option), /// Put returns the applied index of the command. Put(Index), /// Scan returns the key/value pairs. Scan(BTreeMap), } impl encoding::Value for KVResponse {} impl Display for KVResponse { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Self::Get(Some(value)) => write!(f, "{value}"), Self::Get(None) => write!(f, "None"), Self::Put(applied_index) => write!(f, "{applied_index}"), Self::Scan(kvs) => { write!(f, "{}", kvs.iter().map(|(k, v)| format!("{k}={v}")).join(",")) } } } } /// A state machine which does nothing. All commands are ignored. 
pub struct Noop {
    /// Index of the last entry passed to apply(), or 0 if none.
    applied_index: Index,
}

impl Noop {
    pub fn new() -> Box<Self> {
        Box::new(Self { applied_index: 0 })
    }
}

impl State for Noop {
    fn get_applied_index(&self) -> Index {
        self.applied_index
    }

    /// Records the entry's index as applied and returns an empty result,
    /// without looking at the command.
    fn apply(&mut self, entry: Entry) -> Result<Vec<u8>> {
        self.applied_index = entry.index;
        Ok(Vec::new())
    }

    /// Ignores the command and returns an empty result.
    fn read(&self, _: Vec<u8>) -> Result<Vec<u8>> {
        Ok(Vec::new())
    }
}
} // mod test

================================================ FILE: src/raft/testscripts/log/append ================================================ # Appending an entry with term 0 fails. !append foo --- Panic: can't append entry in term 0 # Appending to an empty log works. The term doesn't have to be 1. The entry is # written to the engine and flushed to durable storage. set_term 2 append foo [ops] --- append → 1@2 "foo" engine set raft:Entry(1) → 1@2 "foo" ["\x00\x00\x00\x00\x00\x00\x00\x00\x01" → "\x01\x02\x01\x03foo"] engine flush # Appending a noop entry (no command) also works. append [ops] --- append → 2@2 None engine set raft:Entry(2) → 2@2 None ["\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x02\x02\x00"] engine flush # Check that the last index/term is updated (commit index isn't), and that # the engine contains the expected data, both in logical and raw form. status scan dump --- term=2 last=2@2 commit=0@0 vote=None 1@2 "foo" 2@2 None raft:Entry(1) → 1@2 "foo" ["\x00\x00\x00\x00\x00\x00\x00\x00\x01" → "\x01\x02\x01\x03foo"] raft:Entry(2) → 2@2 None ["\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x02\x02\x00"] raft:TermVote → term=2 vote=None ["\x01" → "\x02\x00"] # Skipping a term then appending is allowed. set_term 3 append command set_term 5 append --- append → 3@3 "command" append → 4@5 None # Dump the final status and data.
status scan dump --- term=5 last=4@5 commit=0@0 vote=None 1@2 "foo" 2@2 None 3@3 "command" 4@5 None raft:Entry(1) → 1@2 "foo" ["\x00\x00\x00\x00\x00\x00\x00\x00\x01" → "\x01\x02\x01\x03foo"] raft:Entry(2) → 2@2 None ["\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x02\x02\x00"] raft:Entry(3) → 3@3 "command" ["\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x03\x03\x01\x07command"] raft:Entry(4) → 4@5 None ["\x00\x00\x00\x00\x00\x00\x00\x00\x04" → "\x04\x05\x00"] raft:TermVote → term=5 vote=None ["\x01" → "\x05\x00"] ================================================ FILE: src/raft/testscripts/log/commit ================================================ # Committing fails on an empty engine. !commit 1 --- Panic: commit index 1 does not exist # Add some entries. set_term 2 splice 1@1= 2@1=foo 3@2=bar --- splice → 3@2 "bar" # Committing entry 0 fails. !commit 0 --- Panic: commit index 0 does not exist # Committing entry 1 works, and updates the commit index. # # Show the engine operations too, and notice that the commit index isn't flushed # to durable storage (it can be recovered from the durable quorum logs). commit 1 [ops] status --- commit → 1@1 None engine set raft:CommitIndex → 1@1 ["\x02" → "\x01\x01"] term=2 last=3@2 commit=1@1 vote=None # Dump the raw engine contents. dump --- raft:Entry(1) → 1@1 None ["\x00\x00\x00\x00\x00\x00\x00\x00\x01" → "\x01\x01\x00"] raft:Entry(2) → 2@1 "foo" ["\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x02\x01\x01\x03foo"] raft:Entry(3) → 3@2 "bar" ["\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x03\x02\x01\x03bar"] raft:TermVote → term=2 vote=None ["\x01" → "\x02\x00"] raft:CommitIndex → 1@1 ["\x02" → "\x01\x01"] # Commits are idempotent, which doesn't incur an engine set. commit 1 [ops] status --- commit → 1@1 None term=2 last=3@2 commit=1@1 vote=None # Commits can skip an entry. commit 3 status --- commit → 3@2 "bar" term=2 last=3@2 commit=3@2 vote=None # Commit regressions error. 
!commit 2 status --- Panic: commit index regression 3 → 2 term=2 last=3@2 commit=3@2 vote=None # Committing nonexistent indexes errors. !commit 4 status --- Panic: commit index 4 does not exist term=2 last=3@2 commit=3@2 vote=None # Dump the raw values. dump --- raft:Entry(1) → 1@1 None ["\x00\x00\x00\x00\x00\x00\x00\x00\x01" → "\x01\x01\x00"] raft:Entry(2) → 2@1 "foo" ["\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x02\x01\x01\x03foo"] raft:Entry(3) → 3@2 "bar" ["\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x03\x02\x01\x03bar"] raft:TermVote → term=2 vote=None ["\x01" → "\x02\x00"] raft:CommitIndex → 3@2 ["\x02" → "\x03\x02"] ================================================ FILE: src/raft/testscripts/log/get ================================================ # get returns None on an empty engine. get 1 --- None # Append a few entries. set_term 1 append append foo set_term 2 append bar --- append → 1@1 None append → 2@1 "foo" append → 3@2 "bar" # get returns noop entries and regular entries. get 1 2 --- 1@1 None 2@1 "foo" # get returns None for missing entries, and for index 0. get 4 0 --- None None ================================================ FILE: src/raft/testscripts/log/has ================================================ # has returns false on an empty engine. has 1@1 --- false # Append a few entries. set_term 1 append append foo set_term 2 append bar --- append → 1@1 None append → 2@1 "foo" append → 3@2 "bar" # has returns true both for noop entries and regular entries. has 1@1 2@1 --- true true # has returns false for missing entries, including index 0. has 4@2 0@0 --- false false # has returns false for term mismatches. has 1@2 3@1 0@1 --- false false false ================================================ FILE: src/raft/testscripts/log/init ================================================ # Tests that the log correctly initializes cached state when opened.
set_term 1 --- ok append foo set_term 2 7 append bar commit 1 --- append → 1@1 "foo" append → 2@2 "bar" commit → 1@1 "foo" status --- term=2 last=2@2 commit=1@1 vote=7 reload --- ok status --- term=2 last=2@2 commit=1@1 vote=7 scan --- 1@1 "foo" 2@2 "bar" ================================================ FILE: src/raft/testscripts/log/scan ================================================ # scan works on an empty engine, even when given indexes. scan scan 3..7 --- ok # Append a few entries. set_term 1 append append foo set_term 2 append bar --- append → 1@1 None append → 2@1 "foo" append → 3@2 "bar" # Full scan. scan --- 1@1 None 2@1 "foo" 3@2 "bar" # Start bound. scan 2.. --- 2@1 "foo" 3@2 "bar" scan 4.. --- ok scan 0.. --- 1@1 None 2@1 "foo" 3@2 "bar" # End bound. scan "..2" --- 1@1 None scan "..=2" --- 1@1 None 2@1 "foo" scan "..7" --- 1@1 None 2@1 "foo" 3@2 "bar" scan "..1" --- ok scan "..0" --- ok # Both bounds. scan 1..2 --- 1@1 None scan "1..=2" --- 1@1 None 2@1 "foo" scan 0..7 --- 1@1 None 2@1 "foo" 3@2 "bar" scan 1..1 --- ok # Bounds panics. !scan 1..0 --- Panic: range start is greater than range end in BTreeMap !scan 7..3 --- Panic: range start is greater than range end in BTreeMap ================================================ FILE: src/raft/testscripts/log/scan_apply ================================================ # scan_apply works on an empty engine, even when given an applied index. scan_apply 0 scan_apply 3 --- ok # Append a few entries. set_term 1 append append foo set_term 2 append bar --- append → 1@1 None append → 2@1 "foo" append → 3@2 "bar" # Nothing is committed, so scan_applied yields nothing. scan_apply 0 --- ok # Commit the first two entries and apply them. commit 2 scan_apply 0 --- commit → 2@1 "foo" 1@1 None 2@1 "foo" # Passing the commit index yields nothing. scan_apply 2 --- ok # Passing an applied_index after the commit index is ok, and yields nothing. scan_apply 3 scan_apply 10 --- ok # Committing and applying the last entry works. 
commit 3 scan_apply 2 --- commit → 3@2 "bar" 3@2 "bar" # Scanning from a lower commit index again works. scan_apply 1 --- 2@1 "foo" 3@2 "bar" scan_apply 0 --- 1@1 None 2@1 "foo" 3@2 "bar" ================================================ FILE: src/raft/testscripts/log/splice ================================================ # Splicing at index 0 should fail. !splice 0@1=foo --- Panic: spliced entry has index or term 0 # Splicing without a term should fail. !splice 1@1=foo --- Panic: splice term 1 beyond current 0 # Splicing at index 2 should fail (creates gap). set_term 1 !splice 2@1=foo --- Panic: first index 2 must touch existing log # Splicing entries at start should work, both with and without commands, and # starting at a term after 1. They should be written to the engine and flushed # to durable storage. It should also update the state. set_term 2 splice 1@2= 2@2=command [ops] status scan --- splice → 2@2 "command" engine set raft:Entry(1) → 1@2 None ["\x00\x00\x00\x00\x00\x00\x00\x00\x01" → "\x01\x02\x00"] engine set raft:Entry(2) → 2@2 "command" ["\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x02\x02\x01\x07command"] engine flush term=2 last=2@2 commit=0@0 vote=None 1@2 None 2@2 "command" # Splicing an empty list should work and be a noop. splice [ops] status scan --- splice → 2@2 "command" term=2 last=2@2 commit=0@0 vote=None 1@2 None 2@2 "command" # Splicing multiple duplicate entries should fail. !splice 3@2= 3@2= --- Panic: spliced entries are not contiguous # Splicing entries with a gap should fail. !splice 3@2= 5@2= --- Panic: spliced entries are not contiguous # Splicing entries with a term regression should fail. !splice 3@2= 4@1= --- Panic: spliced entries have term regression # Splicing entries with a gap from the base should fail. !splice 4@2= --- Panic: first index 4 must touch existing log # Splicing with a term regression from the base should fail. 
!splice 3@1= --- Panic: splice term regression 2 → 1 # Splicing with a term beyond the current term should fail. !splice 3@3= !splice 3@4= --- Panic: splice term 3 beyond current 2 Panic: splice term 4 beyond current 2 # Fully overlapping entries is a noop. splice 1@2= 2@2=command [ops] scan --- splice → 2@2 "command" 1@2 None 2@2 "command" # An overlapping prefix is a noop. splice 1@2= [ops] scan --- splice → 2@2 "command" 1@2 None 2@2 "command" # An overlapping suffix is a noop. splice 2@2=command [ops] scan --- splice → 2@2 "command" 1@2 None 2@2 "command" # Changing a command with the same term/index should fail. !splice 2@2=foo scan --- Panic: command mismatch at Entry { index: 2, term: 2, command: Some([99, 111, 109, 109, 97, 110, 100]) } 1@2 None 2@2 "command" # Appending a new entry in the same term should work, as should # appending one in a new term. splice 3@2=bar set_term 3 splice 4@3= scan --- splice → 3@2 "bar" splice → 4@3 None 1@2 None 2@2 "command" 3@2 "bar" 4@3 None # Splicing with suffix overlap should work, and only write the new entries. splice 3@2=bar 4@3= 5@3=foo 6@3=bar [ops] scan --- splice → 6@3 "bar" engine set raft:Entry(5) → 5@3 "foo" ["\x00\x00\x00\x00\x00\x00\x00\x00\x05" → "\x05\x03\x01\x03foo"] engine set raft:Entry(6) → 6@3 "bar" ["\x00\x00\x00\x00\x00\x00\x00\x00\x06" → "\x06\x03\x01\x03bar"] engine flush 1@2 None 2@2 "command" 3@2 "bar" 4@3 None 5@3 "foo" 6@3 "bar" # Splicing at an existing index with a new term should replace the tail. set_term 4 splice 4@4= [ops] status scan --- splice → 4@4 None engine set raft:Entry(4) → 4@4 None ["\x00\x00\x00\x00\x00\x00\x00\x00\x04" → "\x04\x04\x00"] engine delete raft:Entry(5) ["\x00\x00\x00\x00\x00\x00\x00\x00\x05"] engine delete raft:Entry(6) ["\x00\x00\x00\x00\x00\x00\x00\x00\x06"] engine flush term=4 last=4@4 commit=0@0 vote=None 1@2 None 2@2 "command" 3@2 "bar" 4@4 None # This also holds at the start of the log. 
set_term 5 splice 1@5= 2@5=foo 3@5=bar [ops] status scan --- splice → 3@5 "bar" engine set raft:Entry(1) → 1@5 None ["\x00\x00\x00\x00\x00\x00\x00\x00\x01" → "\x01\x05\x00"] engine set raft:Entry(2) → 2@5 "foo" ["\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x02\x05\x01\x03foo"] engine set raft:Entry(3) → 3@5 "bar" ["\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x03\x05\x01\x03bar"] engine delete raft:Entry(4) ["\x00\x00\x00\x00\x00\x00\x00\x00\x04"] engine flush term=5 last=3@5 commit=0@0 vote=None 1@5 None 2@5 "foo" 3@5 "bar" # Splicing across the commit index should work, as long as the entries match. commit 2 splice 1@5= 2@5=foo 3@5=bar 4@5= status scan --- commit → 2@5 "foo" splice → 4@5 None term=5 last=4@5 commit=2@5 vote=None 1@5 None 2@5 "foo" 3@5 "bar" 4@5 None # Splicing across the commit index can replace a tail after the commit index. set_term 9 splice 3@6= 4@6=bar status scan --- splice → 4@6 "bar" term=9 last=4@6 commit=2@5 vote=None 1@5 None 2@5 "foo" 3@6 None 4@6 "bar" # But replacing a tail at or before the commit index should fail. !splice 2@7= !splice 1@7= --- Panic: spliced entries below commit index Panic: spliced entries below commit index # Dump the raw data. dump --- raft:Entry(1) → 1@5 None ["\x00\x00\x00\x00\x00\x00\x00\x00\x01" → "\x01\x05\x00"] raft:Entry(2) → 2@5 "foo" ["\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x02\x05\x01\x03foo"] raft:Entry(3) → 3@6 None ["\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x03\x06\x00"] raft:Entry(4) → 4@6 "bar" ["\x00\x00\x00\x00\x00\x00\x00\x00\x04" → "\x04\x06\x01\x03bar"] raft:TermVote → term=9 vote=None ["\x01" → "\t\x00"] raft:CommitIndex → 2@5 ["\x02" → "\x02\x05"] ================================================ FILE: src/raft/testscripts/log/status ================================================ # Status on empty engine works. status engine=true --- term=0 last=0@0 commit=0@0 vote=None engine=Status { name: "bitcask", keys: 0, size: 0, disk_size: 0, live_disk_size: 0, } # Write some data. 
set_term 1 append append foo set_term 2 1 append bar commit 2 --- append → 1@1 None append → 2@1 "foo" append → 3@2 "bar" commit → 2@1 "foo" # Status gives correct info. status engine=true --- term=2 last=3@2 commit=2@1 vote=1 engine=Status { name: "bitcask", keys: 5, size: 51, disk_size: 102, live_disk_size: 91, } ================================================ FILE: src/raft/testscripts/log/term ================================================ # get_term works on empty engine. get_term --- term=0 vote=None # Storing a 0 term errors. !set_term 0 --- Panic: can't set term 0 # set_term stores a term and empty vote, writing it to the engine # and flushing it to durable storage. set_term 3 [ops] get_term --- engine set raft:TermVote → term=3 vote=None ["\x01" → "\x03\x00"] engine flush term=3 vote=None # set_term stores a term and vote. set_term 3 7 [ops] get_term --- engine set raft:TermVote → term=3 vote=7 ["\x01" → "\x03\x01\x07"] engine flush term=3 vote=7 # set_term is idempotent, which doesn't incur an engine write. set_term 3 7 [ops] get_term --- term=3 vote=7 # Moving the term into the far future is allowed. set_term 7 get_term --- term=7 vote=None # Starting a new term with a vote is allowed. set_term 9 1 get_term --- term=9 vote=1 # Regressing the term errors. !set_term 8 --- Panic: term regression 9 → 8 # Clearing the vote errors. !set_term 9 --- Panic: can't change vote # Changing the vote errors. !set_term 9 2 --- Panic: can't change vote # The above errors should not have changed the term/vote. get_term dump --- term=9 vote=1 raft:TermVote → term=9 vote=1 ["\x01" → "\t\x01\x01"] ================================================ FILE: src/raft/testscripts/node/append ================================================ # Can append single entries in steady state. 
cluster nodes=3 leader=1 --- n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2} n2@1 follower(n1) last=1@1 commit=1@1 applied=1 n3@1 follower(n1) last=1@1 commit=1@1 applied=1 # Propose a single write. put 1 foo=bar --- c1@1 → n1 ClientRequest id=0x01 write 0x0103666f6f03626172 n1@1 append 2@1 put foo=bar n1@1 → n2 Append base=1@1 [2@1] n1@1 → n3 Append base=1@1 [2@1] status --- n1@1 leader last=2@1 commit=1@1 applied=1 progress={2:1→3 3:1→3} n2@1 follower(n1) last=1@1 commit=1@1 applied=1 n3@1 follower(n1) last=1@1 commit=1@1 applied=1 # Append it to both followers. deliver --- n2@1 append 2@1 put foo=bar n2@1 → n1 AppendResponse match_index=2 n3@1 append 2@1 put foo=bar n3@1 → n1 AppendResponse match_index=2 # The leader commits and applies the write. stabilize --- n1@1 commit 2@1 n1@1 apply 2@1 put foo=bar n1@1 → c1 ClientResponse id=0x01 write 0x0102 c1@1 put foo=bar ⇒ 2 status --- n1@1 leader last=2@1 commit=2@1 applied=2 progress={2:2→3 3:2→3} n2@1 follower(n1) last=2@1 commit=1@1 applied=1 n3@1 follower(n1) last=2@1 commit=1@1 applied=1 ================================================ FILE: src/raft/testscripts/node/append_base_missing ================================================ # Appends with a base beyond the node's last log entry should result in a # rejection at the index following the last entry, and the leader appending # the tail of the log. cluster nodes=3 leader=1 --- n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2} n2@1 follower(n1) last=1@1 commit=1@1 applied=1 n3@1 follower(n1) last=1@1 commit=1@1 applied=1 # Partition n3 so that it does not receive writes. partition 3 --- n3 ⇹ n1 n2 # Replicate a couple of writes. (put 1 a=1) (put 1 b=2) (put 1 c=3) (stabilize heartbeat=true) status --- n1@1 leader last=4@1 commit=4@1 applied=4 progress={2:4→5 3:1→5} n2@1 follower(n1) last=4@1 commit=4@1 applied=4 n3@1 follower(n1) last=1@1 commit=1@1 applied=1 # Heal the partition, and propose another write. 
heal put 1 c=3 --- n1 n2 n3 fully connected c1@1 → n1 ClientRequest id=0x04 write 0x0101630133 n1@1 append 5@1 put c=3 n1@1 → n2 Append base=4@1 [5@1] n1@1 → n3 Append base=4@1 [5@1] # The 4@1 base is beyond n3's last index 1@1, so the append is rejected. # However, the follower returns reject_index=2 immediately after its # last index, rather than the original base index 4. deliver 3 --- n3@1 → n1 AppendResponse reject_index=2 # Because index 1 is already matched with the leader, it doesn't have to probe # and simply sends the entire tail, which is accepted. deliver 1 status 1 --- n1@1 → n3 Append base=1@1 [2@1 3@1 4@1 5@1] n1@1 leader last=5@1 commit=4@1 applied=4 progress={2:4→6 3:1→6} deliver 3 --- n3@1 append 2@1 put a=1 n3@1 append 3@1 put b=2 n3@1 append 4@1 put c=3 n3@1 append 5@1 put c=3 n3@1 → n1 AppendResponse match_index=5 # When n1 receives the ack, it commits and applies the write. deliver 1 --- n1@1 commit 5@1 n1@1 apply 5@1 put c=3 n1@1 → c1 ClientResponse id=0x04 write 0x0105 c1@1 put c=3 ⇒ 5 # The progress is also updated. status --- n1@1 leader last=5@1 commit=5@1 applied=5 progress={2:4→6 3:5→6} n2@1 follower(n1) last=4@1 commit=4@1 applied=4 n3@1 follower(n1) last=5@1 commit=1@1 applied=1 ================================================ FILE: src/raft/testscripts/node/append_base_missing_all ================================================ # Appends to a node with an empty log should result in a rejection of index 1, # allowing the leader to send the entire log. cluster nodes=3 --- n1@0 follower() last=0@0 commit=0@0 applied=0 n2@0 follower() last=0@0 commit=0@0 applied=0 n3@0 follower() last=0@0 commit=0@0 applied=0 # Partition n3 so that it does not receive writes. partition 3 --- n3 ⇹ n1 n2 # Elect n1 as leader. (campaign 1) (stabilize) status --- n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:0→2} n2@1 follower(n1) last=1@1 commit=0@0 applied=0 n3@0 follower() last=0@0 commit=0@0 applied=0 # Replicate a couple of writes. 
(put 1 a=1) (put 1 b=2) (put 1 c=3) (stabilize heartbeat=true) status --- n1@1 leader last=4@1 commit=4@1 applied=4 progress={2:4→5 3:0→5} n2@1 follower(n1) last=4@1 commit=4@1 applied=4 n3@0 follower() last=0@0 commit=0@0 applied=0 # Heal the partition, and propose another write. heal put 1 c=3 --- n1 n2 n3 fully connected c1@1 → n1 ClientRequest id=0x04 write 0x0101630133 n1@1 append 5@1 put c=3 n1@1 → n2 Append base=4@1 [5@1] n1@1 → n3 Append base=4@1 [5@1] # n3 has no entries, so it rejects with reject_index=1. deliver 3 --- n3@0 follower() ⇨ n3@1 follower(n1) n3@1 → n1 AppendResponse reject_index=1 # This allows n1 to send the entire log, without having to probe. deliver 1 status 1 --- n1@1 → n3 Append base=0@0 [1@1 2@1 3@1 4@1 5@1] n1@1 leader last=5@1 commit=4@1 applied=4 progress={2:4→6 3:0→6} deliver 3 --- n3@1 append 1@1 None n3@1 append 2@1 put a=1 n3@1 append 3@1 put b=2 n3@1 append 4@1 put c=3 n3@1 append 5@1 put c=3 n3@1 → n1 AppendResponse match_index=5 # When n1 receives the ack, it commits and applies the write. deliver 1 --- n1@1 commit 5@1 n1@1 apply 5@1 put c=3 n1@1 → c1 ClientResponse id=0x04 write 0x0105 c1@1 put c=3 ⇒ 5 # The progress is also updated. status --- n1@1 leader last=5@1 commit=5@1 applied=5 progress={2:4→6 3:5→6} n2@1 follower(n1) last=4@1 commit=4@1 applied=4 n3@1 follower(n1) last=5@1 commit=0@0 applied=0 ================================================ FILE: src/raft/testscripts/node/append_commit_quorum ================================================ # Append results in a leader-side commit once a quorum is reached for the # relevant entries. 
cluster nodes=6 leader=1 --- n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2 4:1→2 5:1→2 6:1→2} n2@1 follower(n1) last=1@1 commit=1@1 applied=1 n3@1 follower(n1) last=1@1 commit=1@1 applied=1 n4@1 follower(n1) last=1@1 commit=1@1 applied=1 n5@1 follower(n1) last=1@1 commit=1@1 applied=1 n6@1 follower(n1) last=1@1 commit=1@1 applied=1 # Incrementally disconnect all nodes except one and then propose a write, to # generate an increasing quorum index. # Replicating 2 to n2 does not commit. partition 3 4 5 6 --- n1 n2 ⇹ n3 n4 n5 n6 put 1 a=1 stabilize --- c1@1 → n1 ClientRequest id=0x01 write 0x0101610131 n1@1 append 2@1 put a=1 n1@1 → n2 Append base=1@1 [2@1] n1@1 ⇥ n3 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶1̶@̶1̶ ̶[̶2̶@̶1̶]̶ n1@1 ⇥ n4 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶1̶@̶1̶ ̶[̶2̶@̶1̶]̶ n1@1 ⇥ n5 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶1̶@̶1̶ ̶[̶2̶@̶1̶]̶ n1@1 ⇥ n6 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶1̶@̶1̶ ̶[̶2̶@̶1̶]̶ n2@1 append 2@1 put a=1 n2@1 → n1 AppendResponse match_index=2 status --- n1@1 leader last=2@1 commit=1@1 applied=1 progress={2:2→3 3:1→3 4:1→3 5:1→3 6:1→3} n2@1 follower(n1) last=2@1 commit=1@1 applied=1 n3@1 follower(n1) last=1@1 commit=1@1 applied=1 n4@1 follower(n1) last=1@1 commit=1@1 applied=1 n5@1 follower(n1) last=1@1 commit=1@1 applied=1 n6@1 follower(n1) last=1@1 commit=1@1 applied=1 # Replicating 2-3 to n3 does not commit. 
heal partition 2 4 5 6 --- n1 n2 n3 n4 n5 n6 fully connected n1 n3 ⇹ n2 n4 n5 n6 put 1 b=2 stabilize --- c1@1 → n1 ClientRequest id=0x02 write 0x0101620132 n1@1 append 3@1 put b=2 n1@1 ⇥ n2 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶2̶@̶1̶ ̶[̶3̶@̶1̶]̶ n1@1 → n3 Append base=2@1 [3@1] n1@1 ⇥ n4 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶2̶@̶1̶ ̶[̶3̶@̶1̶]̶ n1@1 ⇥ n5 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶2̶@̶1̶ ̶[̶3̶@̶1̶]̶ n1@1 ⇥ n6 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶2̶@̶1̶ ̶[̶3̶@̶1̶]̶ n3@1 → n1 AppendResponse reject_index=2 n1@1 → n3 Append base=1@1 [2@1 3@1] n3@1 append 2@1 put a=1 n3@1 append 3@1 put b=2 n3@1 → n1 AppendResponse match_index=3 status --- n1@1 leader last=3@1 commit=1@1 applied=1 progress={2:2→4 3:3→4 4:1→4 5:1→4 6:1→4} n2@1 follower(n1) last=2@1 commit=1@1 applied=1 n3@1 follower(n1) last=3@1 commit=1@1 applied=1 n4@1 follower(n1) last=1@1 commit=1@1 applied=1 n5@1 follower(n1) last=1@1 commit=1@1 applied=1 n6@1 follower(n1) last=1@1 commit=1@1 applied=1 # Replicating 2-4 to n4 commits 2. heal partition 2 3 5 6 --- n1 n2 n3 n4 n5 n6 fully connected n1 n4 ⇹ n2 n3 n5 n6 put 1 c=3 stabilize --- c1@1 → n1 ClientRequest id=0x03 write 0x0101630133 n1@1 append 4@1 put c=3 n1@1 ⇥ n2 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶3̶@̶1̶ ̶[̶4̶@̶1̶]̶ n1@1 ⇥ n3 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶3̶@̶1̶ ̶[̶4̶@̶1̶]̶ n1@1 → n4 Append base=3@1 [4@1] n1@1 ⇥ n5 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶3̶@̶1̶ ̶[̶4̶@̶1̶]̶ n1@1 ⇥ n6 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶3̶@̶1̶ ̶[̶4̶@̶1̶]̶ n4@1 → n1 AppendResponse reject_index=2 n1@1 → n4 Append base=1@1 [2@1 3@1 4@1] n4@1 append 2@1 put a=1 n4@1 append 3@1 put b=2 n4@1 append 4@1 put c=3 n4@1 → n1 AppendResponse match_index=4 n1@1 commit 2@1 n1@1 apply 2@1 put a=1 n1@1 → c1 ClientResponse id=0x01 write 0x0102 c1@1 put a=1 ⇒ 2 status --- n1@1 leader last=4@1 commit=2@1 applied=2 progress={2:2→5 3:3→5 4:4→5 5:1→5 6:1→5} n2@1 follower(n1) last=2@1 commit=1@1 applied=1 n3@1 follower(n1) last=3@1 commit=1@1 applied=1 n4@1 follower(n1) last=4@1 commit=1@1 applied=1 n5@1 follower(n1) last=1@1 commit=1@1 applied=1 n6@1 follower(n1) last=1@1 commit=1@1 
applied=1 # Replicating 2-5 to n5 commits 3. heal partition 2 3 4 6 --- n1 n2 n3 n4 n5 n6 fully connected n1 n5 ⇹ n2 n3 n4 n6 put 1 d=4 stabilize --- c1@1 → n1 ClientRequest id=0x04 write 0x0101640134 n1@1 append 5@1 put d=4 n1@1 ⇥ n2 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶4̶@̶1̶ ̶[̶5̶@̶1̶]̶ n1@1 ⇥ n3 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶4̶@̶1̶ ̶[̶5̶@̶1̶]̶ n1@1 ⇥ n4 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶4̶@̶1̶ ̶[̶5̶@̶1̶]̶ n1@1 → n5 Append base=4@1 [5@1] n1@1 ⇥ n6 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶4̶@̶1̶ ̶[̶5̶@̶1̶]̶ n5@1 → n1 AppendResponse reject_index=2 n1@1 → n5 Append base=1@1 [2@1 3@1 4@1 5@1] n5@1 append 2@1 put a=1 n5@1 append 3@1 put b=2 n5@1 append 4@1 put c=3 n5@1 append 5@1 put d=4 n5@1 → n1 AppendResponse match_index=5 n1@1 commit 3@1 n1@1 apply 3@1 put b=2 n1@1 → c1 ClientResponse id=0x02 write 0x0103 c1@1 put b=2 ⇒ 3 status --- n1@1 leader last=5@1 commit=3@1 applied=3 progress={2:2→6 3:3→6 4:4→6 5:5→6 6:1→6} n2@1 follower(n1) last=2@1 commit=1@1 applied=1 n3@1 follower(n1) last=3@1 commit=1@1 applied=1 n4@1 follower(n1) last=4@1 commit=1@1 applied=1 n5@1 follower(n1) last=5@1 commit=1@1 applied=1 n6@1 follower(n1) last=1@1 commit=1@1 applied=1 # Replicating 2-6 to n6 commits 4. 
heal partition 2 3 4 5 --- n1 n2 n3 n4 n5 n6 fully connected n1 n6 ⇹ n2 n3 n4 n5 put 1 e=5 stabilize --- c1@1 → n1 ClientRequest id=0x05 write 0x0101650135 n1@1 append 6@1 put e=5 n1@1 ⇥ n2 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶5̶@̶1̶ ̶[̶6̶@̶1̶]̶ n1@1 ⇥ n3 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶5̶@̶1̶ ̶[̶6̶@̶1̶]̶ n1@1 ⇥ n4 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶5̶@̶1̶ ̶[̶6̶@̶1̶]̶ n1@1 ⇥ n5 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶5̶@̶1̶ ̶[̶6̶@̶1̶]̶ n1@1 → n6 Append base=5@1 [6@1] n6@1 → n1 AppendResponse reject_index=2 n1@1 → n6 Append base=1@1 [2@1 3@1 4@1 5@1 6@1] n6@1 append 2@1 put a=1 n6@1 append 3@1 put b=2 n6@1 append 4@1 put c=3 n6@1 append 5@1 put d=4 n6@1 append 6@1 put e=5 n6@1 → n1 AppendResponse match_index=6 n1@1 commit 4@1 n1@1 apply 4@1 put c=3 n1@1 → c1 ClientResponse id=0x03 write 0x0104 c1@1 put c=3 ⇒ 4 status --- n1@1 leader last=6@1 commit=4@1 applied=4 progress={2:2→7 3:3→7 4:4→7 5:5→7 6:6→7} n2@1 follower(n1) last=2@1 commit=1@1 applied=1 n3@1 follower(n1) last=3@1 commit=1@1 applied=1 n4@1 follower(n1) last=4@1 commit=1@1 applied=1 n5@1 follower(n1) last=5@1 commit=1@1 applied=1 n6@1 follower(n1) last=6@1 commit=1@1 applied=1 # Healing the partition and proposing another write replicates and commits all # entries. 
heal --- n1 n2 n3 n4 n5 n6 fully connected put 1 f=6 stabilize --- c1@1 → n1 ClientRequest id=0x06 write 0x0101660136 n1@1 append 7@1 put f=6 n1@1 → n2 Append base=6@1 [7@1] n1@1 → n3 Append base=6@1 [7@1] n1@1 → n4 Append base=6@1 [7@1] n1@1 → n5 Append base=6@1 [7@1] n1@1 → n6 Append base=6@1 [7@1] n2@1 → n1 AppendResponse reject_index=3 n3@1 → n1 AppendResponse reject_index=4 n4@1 → n1 AppendResponse reject_index=5 n5@1 → n1 AppendResponse reject_index=6 n6@1 append 7@1 put f=6 n6@1 → n1 AppendResponse match_index=7 n1@1 → n2 Append base=2@1 [3@1 4@1 5@1 6@1 7@1] n1@1 → n3 Append base=3@1 [4@1 5@1 6@1 7@1] n1@1 → n4 Append base=4@1 [5@1 6@1 7@1] n1@1 → n5 Append base=5@1 [6@1 7@1] n2@1 append 3@1 put b=2 n2@1 append 4@1 put c=3 n2@1 append 5@1 put d=4 n2@1 append 6@1 put e=5 n2@1 append 7@1 put f=6 n2@1 → n1 AppendResponse match_index=7 n3@1 append 4@1 put c=3 n3@1 append 5@1 put d=4 n3@1 append 6@1 put e=5 n3@1 append 7@1 put f=6 n3@1 → n1 AppendResponse match_index=7 n4@1 append 5@1 put d=4 n4@1 append 6@1 put e=5 n4@1 append 7@1 put f=6 n4@1 → n1 AppendResponse match_index=7 n5@1 append 6@1 put e=5 n5@1 append 7@1 put f=6 n5@1 → n1 AppendResponse match_index=7 n1@1 commit 5@1 n1@1 apply 5@1 put d=4 n1@1 → c1 ClientResponse id=0x04 write 0x0105 c1@1 put d=4 ⇒ 5 n1@1 commit 7@1 n1@1 apply 6@1 put e=5 n1@1 apply 7@1 put f=6 n1@1 → c1 ClientResponse id=0x05 write 0x0106 c1@1 put e=5 ⇒ 6 n1@1 → c1 ClientResponse id=0x06 write 0x0107 c1@1 put f=6 ⇒ 7 status --- n1@1 leader last=7@1 commit=7@1 applied=7 progress={2:7→8 3:7→8 4:7→8 5:7→8 6:7→8} n2@1 follower(n1) last=7@1 commit=1@1 applied=1 n3@1 follower(n1) last=7@1 commit=1@1 applied=1 n4@1 follower(n1) last=7@1 commit=1@1 applied=1 n5@1 follower(n1) last=7@1 commit=1@1 applied=1 n6@1 follower(n1) last=7@1 commit=1@1 applied=1 ================================================ FILE: src/raft/testscripts/node/append_initial ================================================ # An initial append at base 0 can have a 
single or multiple entries. cluster nodes=3 --- n1@0 follower() last=0@0 commit=0@0 applied=0 n2@0 follower() last=0@0 commit=0@0 applied=0 n3@0 follower() last=0@0 commit=0@0 applied=0 # Partition n3 so that it has an empty log. partition 3 --- n3 ⇹ n1 n2 # n1 campaigns. campaign 1 deliver --- n1@0 follower() ⇨ n1@1 candidate n1@1 → n2 Campaign last=0@0 n1@1 ⇥ n3 C̶a̶m̶p̶a̶i̶g̶n̶ ̶l̶a̶s̶t̶=̶0̶@̶0̶ n2@0 follower() ⇨ n2@1 follower() n2@1 → n1 CampaignResponse vote=true # When n1 wins, it successfully appends an entry at base 0 to n2. stabilize --- n1@1 candidate ⇨ n1@1 leader n1@1 append 1@1 None n1@1 → n2 Append base=0@0 [1@1] n1@1 ⇥ n3 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶0̶@̶0̶ ̶[̶1̶@̶1̶]̶ n1@1 → n2 Heartbeat last_index=1 commit_index=0 read_seq=0 n1@1 ⇥ n3 H̶e̶a̶r̶t̶b̶e̶a̶t̶ ̶l̶a̶s̶t̶_̶i̶n̶d̶e̶x̶=̶1̶ ̶c̶o̶m̶m̶i̶t̶_̶i̶n̶d̶e̶x̶=̶0̶ ̶r̶e̶a̶d̶_̶s̶e̶q̶=̶0̶ n2@1 follower() ⇨ n2@1 follower(n1) n2@1 append 1@1 None n2@1 → n1 AppendResponse match_index=1 n2@1 → n1 HeartbeatResponse match_index=1 read_seq=0 n1@1 commit 1@1 n1@1 apply 1@1 None status --- n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:0→2} n2@1 follower(n1) last=1@1 commit=0@0 applied=0 n3@0 follower() last=0@0 commit=0@0 applied=0 # Heal the partition. heal --- n1 n2 n3 fully connected # Propose a write. This appends entry 2 to n2 at base 1, but is rejected by n3 # which doesn't have entry 1. put 1 foo=bar deliver --- c1@1 → n1 ClientRequest id=0x01 write 0x0103666f6f03626172 n1@1 append 2@1 put foo=bar n1@1 → n2 Append base=1@1 [2@1] n1@1 → n3 Append base=1@1 [2@1] n2@1 append 2@1 put foo=bar n2@1 → n1 AppendResponse match_index=2 n3@0 follower() ⇨ n3@1 follower(n1) n3@1 → n1 AppendResponse reject_index=1 # Since n3 rejected base 1, n1 sends an append with all messages, which # is accepted.
stabilize --- n1@1 commit 2@1 n1@1 apply 2@1 put foo=bar n1@1 → c1 ClientResponse id=0x01 write 0x0102 c1@1 put foo=bar ⇒ 2 n1@1 → n3 Append base=0@0 [1@1 2@1] n3@1 append 1@1 None n3@1 append 2@1 put foo=bar n3@1 → n1 AppendResponse match_index=2 log --- n1@1 term=1 last=2@1 commit=2@1 vote=Some(1) n1@1 entry 1@1 None n1@1 entry 2@1 put foo=bar n2@1 term=1 last=2@1 commit=0@0 vote=Some(1) n2@1 entry 1@1 None n2@1 entry 2@1 put foo=bar n3@1 term=1 last=2@1 commit=0@0 vote=None n3@1 entry 1@1 None n3@1 entry 2@1 put foo=bar ================================================ FILE: src/raft/testscripts/node/append_max_entries ================================================ # Large appends are limited to MAX_APPEND_ENTRIES, and each successful append # triggers the next append batch. cluster nodes=3 leader=1 max_append_entries=2 --- n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2} n2@1 follower(n1) last=1@1 commit=1@1 applied=1 n3@1 follower(n1) last=1@1 commit=1@1 applied=1 # Partition n3. partition 3 --- n3 ⇹ n1 n2 # Make a bunch of writes. (put 1 a=1) (put 1 a=2) (put 1 a=3) (put 1 a=4) (put 1 a=5) (put 1 a=6) (put 1 a=7) (stabilize heartbeat=true) status --- n1@1 leader last=8@1 commit=8@1 applied=8 progress={2:8→9 3:1→9} n2@1 follower(n1) last=8@1 commit=8@1 applied=8 n3@1 follower(n1) last=1@1 commit=1@1 applied=1 # Heal the partition. heal --- n1 n2 n3 fully connected # The next heartbeat triggers a probe. heartbeat 1 deliver deliver deliver --- n1@1 → n2 Heartbeat last_index=8 commit_index=8 read_seq=0 n1@1 → n3 Heartbeat last_index=8 commit_index=8 read_seq=0 n2@1 → n1 HeartbeatResponse match_index=8 read_seq=0 n3@1 → n1 HeartbeatResponse match_index=0 read_seq=0 n1@1 → n3 Append base=7@1 [] n3@1 → n1 AppendResponse reject_index=2 # When the leader receives the probe response, it begins appending in batches of # max_append_entries until the follower is caught up. 
stabilize --- n1@1 → n3 Append base=1@1 [2@1 3@1] n3@1 append 2@1 put a=1 n3@1 append 3@1 put a=2 n3@1 → n1 AppendResponse match_index=3 n1@1 → n3 Append base=3@1 [4@1 5@1] n3@1 append 4@1 put a=3 n3@1 append 5@1 put a=4 n3@1 → n1 AppendResponse match_index=5 n1@1 → n3 Append base=5@1 [6@1 7@1] n3@1 append 6@1 put a=5 n3@1 append 7@1 put a=6 n3@1 → n1 AppendResponse match_index=7 n1@1 → n3 Append base=7@1 [8@1] n3@1 append 8@1 put a=7 n3@1 → n1 AppendResponse match_index=8 ================================================ FILE: src/raft/testscripts/node/append_pipeline ================================================ # Multiple appends are pipelined before acks are received, without # retransmitting the unacked entries. cluster nodes=3 leader=1 --- n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2} n2@1 follower(n1) last=1@1 commit=1@1 applied=1 n3@1 follower(n1) last=1@1 commit=1@1 applied=1 # Propose a single write. The progress next index increases to 3. put 1 a=1 --- c1@1 → n1 ClientRequest id=0x01 write 0x0101610131 n1@1 append 2@1 put a=1 n1@1 → n2 Append base=1@1 [2@1] n1@1 → n3 Append base=1@1 [2@1] status --- n1@1 leader last=2@1 commit=1@1 applied=1 progress={2:1→3 3:1→3} n2@1 follower(n1) last=1@1 commit=1@1 applied=1 n3@1 follower(n1) last=1@1 commit=1@1 applied=1 # Propose two more writes. Appends are sent without past duplicates. put 1 b=2 put 1 c=3 --- c1@1 → n1 ClientRequest id=0x02 write 0x0101620132 n1@1 append 3@1 put b=2 n1@1 → n2 Append base=2@1 [3@1] n1@1 → n3 Append base=2@1 [3@1] c1@1 → n1 ClientRequest id=0x03 write 0x0101630133 n1@1 append 4@1 put c=3 n1@1 → n2 Append base=3@1 [4@1] n1@1 → n3 Append base=3@1 [4@1] status --- n1@1 leader last=4@1 commit=1@1 applied=1 progress={2:1→5 3:1→5} n2@1 follower(n1) last=1@1 commit=1@1 applied=1 n3@1 follower(n1) last=1@1 commit=1@1 applied=1 # The appends are received and acked sequentially. 
deliver --- n2@1 append 2@1 put a=1 n2@1 → n1 AppendResponse match_index=2 n2@1 append 3@1 put b=2 n2@1 → n1 AppendResponse match_index=3 n2@1 append 4@1 put c=3 n2@1 → n1 AppendResponse match_index=4 n3@1 append 2@1 put a=1 n3@1 → n1 AppendResponse match_index=2 n3@1 append 3@1 put b=2 n3@1 → n1 AppendResponse match_index=3 n3@1 append 4@1 put c=3 n3@1 → n1 AppendResponse match_index=4 # The leader receives the acks and commits the writes one by one, # without retransmitting the in-flight (to it) entries. deliver --- n1@1 commit 2@1 n1@1 apply 2@1 put a=1 n1@1 → c1 ClientResponse id=0x01 write 0x0102 c1@1 put a=1 ⇒ 2 n1@1 commit 3@1 n1@1 apply 3@1 put b=2 n1@1 → c1 ClientResponse id=0x02 write 0x0103 c1@1 put b=2 ⇒ 3 n1@1 commit 4@1 n1@1 apply 4@1 put c=3 n1@1 → c1 ClientResponse id=0x03 write 0x0104 c1@1 put c=3 ⇒ 4 # All nodes are now caught up on logs (but not commit/apply, which needs a # heartbeat). status --- n1@1 leader last=4@1 commit=4@1 applied=4 progress={2:4→5 3:4→5} n2@1 follower(n1) last=4@1 commit=1@1 applied=1 n3@1 follower(n1) last=4@1 commit=1@1 applied=1 ================================================ FILE: src/raft/testscripts/node/append_probe_divergent_first ================================================ # Appends to a previous leader and follower with a divergent tail all # the way back to the first entry work. cluster nodes=5 leader=1 --- n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2 4:1→2 5:1→2} n2@1 follower(n1) last=1@1 commit=1@1 applied=1 n3@1 follower(n1) last=1@1 commit=1@1 applied=1 n4@1 follower(n1) last=1@1 commit=1@1 applied=1 n5@1 follower(n1) last=1@1 commit=1@1 applied=1 # Partition n1-n2 partition 1 2 --- n1 n2 ⇹ n3 n4 n5 # Elect new leaders in the majority partition and replicate a few writes. # Multiple leaders ensure the log has multiple terms.
(campaign 3) (stabilize) (put 3 a=1) (stabilize heartbeat=true) status --- n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2 4:1→2 5:1→2} n2@1 follower(n1) last=1@1 commit=1@1 applied=1 n3@2 leader last=3@2 commit=3@2 applied=3 progress={1:0→4 2:0→4 4:3→4 5:3→4} n4@2 follower(n3) last=3@2 commit=3@2 applied=3 n5@2 follower(n3) last=3@2 commit=3@2 applied=3 (campaign 4) (stabilize) (put 4 b=2) (stabilize heartbeat=true) status --- n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2 4:1→2 5:1→2} n2@1 follower(n1) last=1@1 commit=1@1 applied=1 n3@3 follower(n4) last=5@3 commit=5@3 applied=5 n4@3 leader last=5@3 commit=5@3 applied=5 progress={1:0→6 2:0→6 3:5→6 5:5→6} n5@3 follower(n4) last=5@3 commit=5@3 applied=5 (campaign 5) (stabilize) (put 5 c=3) (stabilize heartbeat=true) status --- n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2 4:1→2 5:1→2} n2@1 follower(n1) last=1@1 commit=1@1 applied=1 n3@4 follower(n5) last=7@4 commit=7@4 applied=7 n4@4 follower(n5) last=7@4 commit=7@4 applied=7 n5@4 leader last=7@4 commit=7@4 applied=7 progress={1:0→8 2:0→8 3:7→8 4:7→8} # Propose writes in the minority partition as well. (put 1 a=2) (put 1 a=3) (put 1 a=4) (put 1 a=5) (put 1 a=6) (put 1 a=7) (stabilize) status --- n1@1 leader last=7@1 commit=1@1 applied=1 progress={2:7→8 3:1→8 4:1→8 5:1→8} n2@1 follower(n1) last=7@1 commit=1@1 applied=1 n3@4 follower(n5) last=7@4 commit=7@4 applied=7 n4@4 follower(n5) last=7@4 commit=7@4 applied=7 n5@4 leader last=7@4 commit=7@4 applied=7 progress={1:0→8 2:0→8 3:7→8 4:7→8} log 1 5 --- n1@1 term=1 last=7@1 commit=1@1 vote=Some(1) n1@1 entry 1@1 None n1@1 entry 2@1 put a=2 n1@1 entry 3@1 put a=3 n1@1 entry 4@1 put a=4 n1@1 entry 5@1 put a=5 n1@1 entry 6@1 put a=6 n1@1 entry 7@1 put a=7 n5@4 term=4 last=7@4 commit=7@4 vote=Some(5) n5@4 entry 1@1 None n5@4 entry 2@2 None n5@4 entry 3@2 put a=1 n5@4 entry 4@3 None n5@4 entry 5@3 put b=2 n5@4 entry 6@4 None n5@4 entry 7@4 put c=3 # Heal the partition. 
heal --- n1 n2 n3 n4 n5 fully connected # Propose another write on the majority leader. put 5 d=4 --- c5@4 → n5 ClientRequest id=0x0a write 0x0101640134 n5@4 append 8@4 put d=4 n5@4 → n1 Append base=7@4 [8@4] n5@4 → n2 Append base=7@4 [8@4] n5@4 → n3 Append base=7@4 [8@4] n5@4 → n4 Append base=7@4 [8@4] # Delivering the appends to n1 and n2 should reject them. It also cancels the # in-flight write requests on n1. deliver 1 2 --- n1@1 leader ⇨ n1@4 follower(n5) n1@1 → c1 ClientResponse id=0x04 Error::Abort c1@1 put a=2 ⇒ Error::Abort (operation aborted) n1@1 → c1 ClientResponse id=0x05 Error::Abort c1@1 put a=3 ⇒ Error::Abort (operation aborted) n1@1 → c1 ClientResponse id=0x06 Error::Abort c1@1 put a=4 ⇒ Error::Abort (operation aborted) n1@1 → c1 ClientResponse id=0x07 Error::Abort c1@1 put a=5 ⇒ Error::Abort (operation aborted) n1@1 → c1 ClientResponse id=0x08 Error::Abort c1@1 put a=6 ⇒ Error::Abort (operation aborted) n1@1 → c1 ClientResponse id=0x09 Error::Abort c1@1 put a=7 ⇒ Error::Abort (operation aborted) n1@4 → n5 AppendResponse reject_index=7 n2@1 follower(n1) ⇨ n2@4 follower(n5) n2@4 → n5 AppendResponse reject_index=7 # n5 will probe the previous base, which is again rejected. This repeats until # a common base is found at 1@1. 
deliver 5 status 5 deliver 1 2 --- n5@4 → n1 Append base=6@4 [] n5@4 → n2 Append base=6@4 [] n5@4 leader last=8@4 commit=7@4 applied=7 progress={1:0→7 2:0→7 3:7→9 4:7→9} n1@4 → n5 AppendResponse reject_index=6 n2@4 → n5 AppendResponse reject_index=6 deliver 5 deliver 1 2 status 5 --- n5@4 → n1 Append base=5@3 [] n5@4 → n2 Append base=5@3 [] n1@4 → n5 AppendResponse reject_index=5 n2@4 → n5 AppendResponse reject_index=5 n5@4 leader last=8@4 commit=7@4 applied=7 progress={1:0→6 2:0→6 3:7→9 4:7→9} deliver 5 status 5 deliver 1 2 --- n5@4 → n1 Append base=4@3 [] n5@4 → n2 Append base=4@3 [] n5@4 leader last=8@4 commit=7@4 applied=7 progress={1:0→5 2:0→5 3:7→9 4:7→9} n1@4 → n5 AppendResponse reject_index=4 n2@4 → n5 AppendResponse reject_index=4 deliver 5 status 5 deliver 1 2 --- n5@4 → n1 Append base=3@2 [] n5@4 → n2 Append base=3@2 [] n5@4 leader last=8@4 commit=7@4 applied=7 progress={1:0→4 2:0→4 3:7→9 4:7→9} n1@4 → n5 AppendResponse reject_index=3 n2@4 → n5 AppendResponse reject_index=3 deliver 5 status 5 deliver 1 2 --- n5@4 → n1 Append base=2@2 [] n5@4 → n2 Append base=2@2 [] n5@4 leader last=8@4 commit=7@4 applied=7 progress={1:0→3 2:0→3 3:7→9 4:7→9} n1@4 → n5 AppendResponse reject_index=2 n2@4 → n5 AppendResponse reject_index=2 deliver 5 status 5 deliver 1 2 --- n5@4 → n1 Append base=1@1 [] n5@4 → n2 Append base=1@1 [] n5@4 leader last=8@4 commit=7@4 applied=7 progress={1:0→2 2:0→2 3:7→9 4:7→9} n1@4 → n5 AppendResponse match_index=1 n2@4 → n5 AppendResponse match_index=1 # n5 can now replicate the tail to n1 and n2, allowing n5 to commit it. 
deliver 5 status 5 deliver 1 2 --- n5@4 → n1 Append base=1@1 [2@2 3@2 4@3 5@3 6@4 7@4 8@4] n5@4 → n2 Append base=1@1 [2@2 3@2 4@3 5@3 6@4 7@4 8@4] n5@4 leader last=8@4 commit=7@4 applied=7 progress={1:1→9 2:1→9 3:7→9 4:7→9} n1@4 append 2@2 None n1@4 append 3@2 put a=1 n1@4 append 4@3 None n1@4 append 5@3 put b=2 n1@4 append 6@4 None n1@4 append 7@4 put c=3 n1@4 append 8@4 put d=4 n1@4 → n5 AppendResponse match_index=8 n2@4 append 2@2 None n2@4 append 3@2 put a=1 n2@4 append 4@3 None n2@4 append 5@3 put b=2 n2@4 append 6@4 None n2@4 append 7@4 put c=3 n2@4 append 8@4 put d=4 n2@4 → n5 AppendResponse match_index=8 deliver 5 --- n5@4 commit 8@4 n5@4 apply 8@4 put d=4 n5@4 → c5 ClientResponse id=0x0a write 0x0108 c5@4 put d=4 ⇒ 8 status --- n1@4 follower(n5) last=8@4 commit=1@1 applied=1 n2@4 follower(n5) last=8@4 commit=1@1 applied=1 n3@4 follower(n5) last=7@4 commit=7@4 applied=7 n4@4 follower(n5) last=7@4 commit=7@4 applied=7 n5@4 leader last=8@4 commit=8@4 applied=8 progress={1:8→9 2:8→9 3:7→9 4:7→9} # Stabilize the cluster. (stabilize heartbeat=true) status --- n1@4 follower(n5) last=8@4 commit=8@4 applied=8 n2@4 follower(n5) last=8@4 commit=8@4 applied=8 n3@4 follower(n5) last=8@4 commit=8@4 applied=8 n4@4 follower(n5) last=8@4 commit=8@4 applied=8 n5@4 leader last=8@4 commit=8@4 applied=8 progress={1:8→9 2:8→9 3:8→9 4:8→9} ================================================ FILE: src/raft/testscripts/node/append_probe_divergent_long ================================================ # Appends to a previous leader and follower with a long divergent tail require # the leader to repeatedly probe until it finds a common base.
cluster nodes=5 leader=1 --- n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2 4:1→2 5:1→2} n2@1 follower(n1) last=1@1 commit=1@1 applied=1 n3@1 follower(n1) last=1@1 commit=1@1 applied=1 n4@1 follower(n1) last=1@1 commit=1@1 applied=1 n5@1 follower(n1) last=1@1 commit=1@1 applied=1 # Make a couple of writes to ensure a common log prefix. (put 1 a=1) (put 1 b=2) (stabilize) status --- n1@1 leader last=3@1 commit=3@1 applied=3 progress={2:3→4 3:3→4 4:3→4 5:3→4} n2@1 follower(n1) last=3@1 commit=1@1 applied=1 n3@1 follower(n1) last=3@1 commit=1@1 applied=1 n4@1 follower(n1) last=3@1 commit=1@1 applied=1 n5@1 follower(n1) last=3@1 commit=1@1 applied=1 # Partition n1-n2 partition 1 2 --- n1 n2 ⇹ n3 n4 n5 # Elect new leaders in the majority partition and replicate a few writes. # Multiple leaders ensures the log has multiple terms. (campaign 3) (stabilize) (put 3 c=3) (stabilize heartbeat=true) status --- n1@1 leader last=3@1 commit=3@1 applied=3 progress={2:3→4 3:3→4 4:3→4 5:3→4} n2@1 follower(n1) last=3@1 commit=1@1 applied=1 n3@2 leader last=5@2 commit=5@2 applied=5 progress={1:0→6 2:0→6 4:5→6 5:5→6} n4@2 follower(n3) last=5@2 commit=5@2 applied=5 n5@2 follower(n3) last=5@2 commit=5@2 applied=5 (campaign 4) (stabilize) (put 4 d=4) (stabilize heartbeat=true) status --- n1@1 leader last=3@1 commit=3@1 applied=3 progress={2:3→4 3:3→4 4:3→4 5:3→4} n2@1 follower(n1) last=3@1 commit=1@1 applied=1 n3@3 follower(n4) last=7@3 commit=7@3 applied=7 n4@3 leader last=7@3 commit=7@3 applied=7 progress={1:0→8 2:0→8 3:7→8 5:7→8} n5@3 follower(n4) last=7@3 commit=7@3 applied=7 (campaign 5) (stabilize) (put 5 e=5) (stabilize heartbeat=true) status --- n1@1 leader last=3@1 commit=3@1 applied=3 progress={2:3→4 3:3→4 4:3→4 5:3→4} n2@1 follower(n1) last=3@1 commit=1@1 applied=1 n3@4 follower(n5) last=9@4 commit=9@4 applied=9 n4@4 follower(n5) last=9@4 commit=9@4 applied=9 n5@4 leader last=9@4 commit=9@4 applied=9 progress={1:0→10 2:0→10 3:9→10 4:9→10} # Propose writes in the 
minority partition as well, to build up a log # longer than the majority log. (put 1 a=2) (put 1 a=3) (put 1 a=4) (put 1 a=5) (put 1 a=6) (put 1 a=7) (put 1 a=8) (put 1 a=9) (put 1 a=10) (stabilize) status --- n1@1 leader last=12@1 commit=3@1 applied=3 progress={2:12→13 3:3→13 4:3→13 5:3→13} n2@1 follower(n1) last=12@1 commit=1@1 applied=1 n3@4 follower(n5) last=9@4 commit=9@4 applied=9 n4@4 follower(n5) last=9@4 commit=9@4 applied=9 n5@4 leader last=9@4 commit=9@4 applied=9 progress={1:0→10 2:0→10 3:9→10 4:9→10} log 1 5 --- n1@1 term=1 last=12@1 commit=3@1 vote=Some(1) n1@1 entry 1@1 None n1@1 entry 2@1 put a=1 n1@1 entry 3@1 put b=2 n1@1 entry 4@1 put a=2 n1@1 entry 5@1 put a=3 n1@1 entry 6@1 put a=4 n1@1 entry 7@1 put a=5 n1@1 entry 8@1 put a=6 n1@1 entry 9@1 put a=7 n1@1 entry 10@1 put a=8 n1@1 entry 11@1 put a=9 n1@1 entry 12@1 put a=10 n5@4 term=4 last=9@4 commit=9@4 vote=Some(5) n5@4 entry 1@1 None n5@4 entry 2@1 put a=1 n5@4 entry 3@1 put b=2 n5@4 entry 4@2 None n5@4 entry 5@2 put c=3 n5@4 entry 6@3 None n5@4 entry 7@3 put d=4 n5@4 entry 8@4 None n5@4 entry 9@4 put e=5 # Heal the partition. heal --- n1 n2 n3 n4 n5 fully connected # Propose another write on the majority leader. put 5 f=6 --- c5@4 → n5 ClientRequest id=0x0f write 0x0101660136 n5@4 append 10@4 put f=6 n5@4 → n1 Append base=9@4 [10@4] n5@4 → n2 Append base=9@4 [10@4] n5@4 → n3 Append base=9@4 [10@4] n5@4 → n4 Append base=9@4 [10@4] # Delivering the appends to n1 and n2 should reject them. It also cancels the # in-flight write requests on n1. 
deliver 1 2 --- n1@1 leader ⇨ n1@4 follower(n5) n1@1 → c1 ClientResponse id=0x06 Error::Abort c1@1 put a=2 ⇒ Error::Abort (operation aborted) n1@1 → c1 ClientResponse id=0x07 Error::Abort c1@1 put a=3 ⇒ Error::Abort (operation aborted) n1@1 → c1 ClientResponse id=0x08 Error::Abort c1@1 put a=4 ⇒ Error::Abort (operation aborted) n1@1 → c1 ClientResponse id=0x09 Error::Abort c1@1 put a=5 ⇒ Error::Abort (operation aborted) n1@1 → c1 ClientResponse id=0x0a Error::Abort c1@1 put a=6 ⇒ Error::Abort (operation aborted) n1@1 → c1 ClientResponse id=0x0b Error::Abort c1@1 put a=7 ⇒ Error::Abort (operation aborted) n1@1 → c1 ClientResponse id=0x0c Error::Abort c1@1 put a=8 ⇒ Error::Abort (operation aborted) n1@1 → c1 ClientResponse id=0x0d Error::Abort c1@1 put a=9 ⇒ Error::Abort (operation aborted) n1@1 → c1 ClientResponse id=0x0e Error::Abort c1@1 put a=10 ⇒ Error::Abort (operation aborted) n1@4 → n5 AppendResponse reject_index=9 n2@1 follower(n1) ⇨ n2@4 follower(n5) n2@4 → n5 AppendResponse reject_index=9 # n5 will probe the previous base, which is again rejected. This repeats until # a common base is found at 3@1. 
deliver 5 status 5 deliver 1 2 --- n5@4 → n1 Append base=8@4 [] n5@4 → n2 Append base=8@4 [] n5@4 leader last=10@4 commit=9@4 applied=9 progress={1:0→9 2:0→9 3:9→11 4:9→11} n1@4 → n5 AppendResponse reject_index=8 n2@4 → n5 AppendResponse reject_index=8 deliver 5 deliver 1 2 status 5 --- n5@4 → n1 Append base=7@3 [] n5@4 → n2 Append base=7@3 [] n1@4 → n5 AppendResponse reject_index=7 n2@4 → n5 AppendResponse reject_index=7 n5@4 leader last=10@4 commit=9@4 applied=9 progress={1:0→8 2:0→8 3:9→11 4:9→11} deliver 5 status 5 deliver 1 2 --- n5@4 → n1 Append base=6@3 [] n5@4 → n2 Append base=6@3 [] n5@4 leader last=10@4 commit=9@4 applied=9 progress={1:0→7 2:0→7 3:9→11 4:9→11} n1@4 → n5 AppendResponse reject_index=6 n2@4 → n5 AppendResponse reject_index=6 deliver 5 status 5 deliver 1 2 --- n5@4 → n1 Append base=5@2 [] n5@4 → n2 Append base=5@2 [] n5@4 leader last=10@4 commit=9@4 applied=9 progress={1:0→6 2:0→6 3:9→11 4:9→11} n1@4 → n5 AppendResponse reject_index=5 n2@4 → n5 AppendResponse reject_index=5 deliver 5 status 5 deliver 1 2 --- n5@4 → n1 Append base=4@2 [] n5@4 → n2 Append base=4@2 [] n5@4 leader last=10@4 commit=9@4 applied=9 progress={1:0→5 2:0→5 3:9→11 4:9→11} n1@4 → n5 AppendResponse reject_index=4 n2@4 → n5 AppendResponse reject_index=4 deliver 5 status 5 deliver 1 2 --- n5@4 → n1 Append base=3@1 [] n5@4 → n2 Append base=3@1 [] n5@4 leader last=10@4 commit=9@4 applied=9 progress={1:0→4 2:0→4 3:9→11 4:9→11} n1@4 → n5 AppendResponse match_index=3 n2@4 → n5 AppendResponse match_index=3 # n5 can now replicate the tail to n1 and n2, allowing n5 to commit it. 
deliver 5 status 5 deliver 1 2 --- n5@4 → n1 Append base=3@1 [4@2 5@2 6@3 7@3 8@4 9@4 10@4] n5@4 → n2 Append base=3@1 [4@2 5@2 6@3 7@3 8@4 9@4 10@4] n5@4 leader last=10@4 commit=9@4 applied=9 progress={1:3→11 2:3→11 3:9→11 4:9→11} n1@4 append 4@2 None n1@4 append 5@2 put c=3 n1@4 append 6@3 None n1@4 append 7@3 put d=4 n1@4 append 8@4 None n1@4 append 9@4 put e=5 n1@4 append 10@4 put f=6 n1@4 → n5 AppendResponse match_index=10 n2@4 append 4@2 None n2@4 append 5@2 put c=3 n2@4 append 6@3 None n2@4 append 7@3 put d=4 n2@4 append 8@4 None n2@4 append 9@4 put e=5 n2@4 append 10@4 put f=6 n2@4 → n5 AppendResponse match_index=10 deliver 5 --- n5@4 commit 10@4 n5@4 apply 10@4 put f=6 n5@4 → c5 ClientResponse id=0x0f write 0x010a c5@4 put f=6 ⇒ 10 status --- n1@4 follower(n5) last=10@4 commit=3@1 applied=3 n2@4 follower(n5) last=10@4 commit=1@1 applied=1 n3@4 follower(n5) last=9@4 commit=9@4 applied=9 n4@4 follower(n5) last=9@4 commit=9@4 applied=9 n5@4 leader last=10@4 commit=10@4 applied=10 progress={1:10→11 2:10→11 3:9→11 4:9→11} # Stabilize the cluster. (stabilize heartbeat=true) status --- n1@4 follower(n5) last=10@4 commit=10@4 applied=10 n2@4 follower(n5) last=10@4 commit=10@4 applied=10 n3@4 follower(n5) last=10@4 commit=10@4 applied=10 n4@4 follower(n5) last=10@4 commit=10@4 applied=10 n5@4 leader last=10@4 commit=10@4 applied=10 progress={1:10→11 2:10→11 3:10→11 4:10→11} ================================================ FILE: src/raft/testscripts/node/append_probe_divergent_short ================================================ # Appends to a previous leader and follower with a shorter divergent tail skip # the missing entries before probing.
cluster nodes=5 leader=1 --- n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2 4:1→2 5:1→2} n2@1 follower(n1) last=1@1 commit=1@1 applied=1 n3@1 follower(n1) last=1@1 commit=1@1 applied=1 n4@1 follower(n1) last=1@1 commit=1@1 applied=1 n5@1 follower(n1) last=1@1 commit=1@1 applied=1 # Make a couple of writes to ensure a common log prefix. (put 1 a=1) (put 1 b=2) (stabilize) status --- n1@1 leader last=3@1 commit=3@1 applied=3 progress={2:3→4 3:3→4 4:3→4 5:3→4} n2@1 follower(n1) last=3@1 commit=1@1 applied=1 n3@1 follower(n1) last=3@1 commit=1@1 applied=1 n4@1 follower(n1) last=3@1 commit=1@1 applied=1 n5@1 follower(n1) last=3@1 commit=1@1 applied=1 # Partition n1-n2 partition 1 2 --- n1 n2 ⇹ n3 n4 n5 # Elect new leaders in the majority partition and replicate a few writes. # Multiple leaders ensures the log has multiple terms. (campaign 3) (stabilize) (put 3 c=3) (stabilize heartbeat=true) status --- n1@1 leader last=3@1 commit=3@1 applied=3 progress={2:3→4 3:3→4 4:3→4 5:3→4} n2@1 follower(n1) last=3@1 commit=1@1 applied=1 n3@2 leader last=5@2 commit=5@2 applied=5 progress={1:0→6 2:0→6 4:5→6 5:5→6} n4@2 follower(n3) last=5@2 commit=5@2 applied=5 n5@2 follower(n3) last=5@2 commit=5@2 applied=5 (campaign 4) (stabilize) (put 4 d=4) (stabilize heartbeat=true) status --- n1@1 leader last=3@1 commit=3@1 applied=3 progress={2:3→4 3:3→4 4:3→4 5:3→4} n2@1 follower(n1) last=3@1 commit=1@1 applied=1 n3@3 follower(n4) last=7@3 commit=7@3 applied=7 n4@3 leader last=7@3 commit=7@3 applied=7 progress={1:0→8 2:0→8 3:7→8 5:7→8} n5@3 follower(n4) last=7@3 commit=7@3 applied=7 (campaign 5) (stabilize) (put 5 e=5) (stabilize heartbeat=true) status --- n1@1 leader last=3@1 commit=3@1 applied=3 progress={2:3→4 3:3→4 4:3→4 5:3→4} n2@1 follower(n1) last=3@1 commit=1@1 applied=1 n3@4 follower(n5) last=9@4 commit=9@4 applied=9 n4@4 follower(n5) last=9@4 commit=9@4 applied=9 n5@4 leader last=9@4 commit=9@4 applied=9 progress={1:0→10 2:0→10 3:9→10 4:9→10} # Propose a single 
write in the minority partition. The divergent minority log # is much shorter than the majority log. (put 1 a=2) (stabilize) status --- n1@1 leader last=4@1 commit=3@1 applied=3 progress={2:4→5 3:3→5 4:3→5 5:3→5} n2@1 follower(n1) last=4@1 commit=1@1 applied=1 n3@4 follower(n5) last=9@4 commit=9@4 applied=9 n4@4 follower(n5) last=9@4 commit=9@4 applied=9 n5@4 leader last=9@4 commit=9@4 applied=9 progress={1:0→10 2:0→10 3:9→10 4:9→10} log 1 5 --- n1@1 term=1 last=4@1 commit=3@1 vote=Some(1) n1@1 entry 1@1 None n1@1 entry 2@1 put a=1 n1@1 entry 3@1 put b=2 n1@1 entry 4@1 put a=2 n5@4 term=4 last=9@4 commit=9@4 vote=Some(5) n5@4 entry 1@1 None n5@4 entry 2@1 put a=1 n5@4 entry 3@1 put b=2 n5@4 entry 4@2 None n5@4 entry 5@2 put c=3 n5@4 entry 6@3 None n5@4 entry 7@3 put d=4 n5@4 entry 8@4 None n5@4 entry 9@4 put e=5 # Heal the partition. heal --- n1 n2 n3 n4 n5 fully connected # Propose another write on the majority leader. put 5 f=6 --- c5@4 → n5 ClientRequest id=0x07 write 0x0101660136 n5@4 append 10@4 put f=6 n5@4 → n1 Append base=9@4 [10@4] n5@4 → n2 Append base=9@4 [10@4] n5@4 → n3 Append base=9@4 [10@4] n5@4 → n4 Append base=9@4 [10@4] # Delivering the appends to n1 and n2 should reject them, but with a # reject_index=5 after their last index instead of the original base 9. It also # cancels the in-flight write requests on n1. deliver 1 2 --- n1@1 leader ⇨ n1@4 follower(n5) n1@1 → c1 ClientResponse id=0x06 Error::Abort c1@1 put a=2 ⇒ Error::Abort (operation aborted) n1@4 → n5 AppendResponse reject_index=5 n2@1 follower(n1) ⇨ n2@4 follower(n5) n2@4 → n5 AppendResponse reject_index=5 # n5 will probe the previous base, which is again rejected. This repeats until # a common base is found at 3@1. 
deliver 5 status 5 deliver 1 2 --- n5@4 → n1 Append base=4@2 [] n5@4 → n2 Append base=4@2 [] n5@4 leader last=10@4 commit=9@4 applied=9 progress={1:0→5 2:0→5 3:9→11 4:9→11} n1@4 → n5 AppendResponse reject_index=4 n2@4 → n5 AppendResponse reject_index=4 deliver 5 deliver 1 2 status 5 --- n5@4 → n1 Append base=3@1 [] n5@4 → n2 Append base=3@1 [] n1@4 → n5 AppendResponse match_index=3 n2@4 → n5 AppendResponse match_index=3 n5@4 leader last=10@4 commit=9@4 applied=9 progress={1:0→4 2:0→4 3:9→11 4:9→11} # n5 can now replicate the tail to n1 and n2, allowing n5 to commit it. deliver 5 deliver 1 2 --- n5@4 → n1 Append base=3@1 [4@2 5@2 6@3 7@3 8@4 9@4 10@4] n5@4 → n2 Append base=3@1 [4@2 5@2 6@3 7@3 8@4 9@4 10@4] n1@4 append 4@2 None n1@4 append 5@2 put c=3 n1@4 append 6@3 None n1@4 append 7@3 put d=4 n1@4 append 8@4 None n1@4 append 9@4 put e=5 n1@4 append 10@4 put f=6 n1@4 → n5 AppendResponse match_index=10 n2@4 append 4@2 None n2@4 append 5@2 put c=3 n2@4 append 6@3 None n2@4 append 7@3 put d=4 n2@4 append 8@4 None n2@4 append 9@4 put e=5 n2@4 append 10@4 put f=6 n2@4 → n5 AppendResponse match_index=10 deliver 5 --- n5@4 commit 10@4 n5@4 apply 10@4 put f=6 n5@4 → c5 ClientResponse id=0x07 write 0x010a c5@4 put f=6 ⇒ 10 status --- n1@4 follower(n5) last=10@4 commit=3@1 applied=3 n2@4 follower(n5) last=10@4 commit=1@1 applied=1 n3@4 follower(n5) last=9@4 commit=9@4 applied=9 n4@4 follower(n5) last=9@4 commit=9@4 applied=9 n5@4 leader last=10@4 commit=10@4 applied=10 progress={1:10→11 2:10→11 3:9→11 4:9→11} # Stabilize the cluster. 
(stabilize heartbeat=true) status --- n1@4 follower(n5) last=10@4 commit=10@4 applied=10 n2@4 follower(n5) last=10@4 commit=10@4 applied=10 n3@4 follower(n5) last=10@4 commit=10@4 applied=10 n4@4 follower(n5) last=10@4 commit=10@4 applied=10 n5@4 leader last=10@4 commit=10@4 applied=10 progress={1:10→11 2:10→11 3:10→11 4:10→11} ================================================ FILE: src/raft/testscripts/node/append_probe_divergent_single ================================================ # An append replaces a conflict at the tail for a single term. cluster nodes=5 leader=1 --- n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2 4:1→2 5:1→2} n2@1 follower(n1) last=1@1 commit=1@1 applied=1 n3@1 follower(n1) last=1@1 commit=1@1 applied=1 n4@1 follower(n1) last=1@1 commit=1@1 applied=1 n5@1 follower(n1) last=1@1 commit=1@1 applied=1 # Partition n3-n5. partition 3 4 5 --- n1 n2 ⇹ n3 n4 n5 # Propose and replicate a write in the minority partition. put 1 a=1 stabilize --- c1@1 → n1 ClientRequest id=0x01 write 0x0101610131 n1@1 append 2@1 put a=1 n1@1 → n2 Append base=1@1 [2@1] n1@1 ⇥ n3 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶1̶@̶1̶ ̶[̶2̶@̶1̶]̶ n1@1 ⇥ n4 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶1̶@̶1̶ ̶[̶2̶@̶1̶]̶ n1@1 ⇥ n5 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶1̶@̶1̶ ̶[̶2̶@̶1̶]̶ n2@1 append 2@1 put a=1 n2@1 → n1 AppendResponse match_index=2 log 1 2 --- n1@1 term=1 last=2@1 commit=1@1 vote=Some(1) n1@1 entry 1@1 None n1@1 entry 2@1 put a=1 n2@1 term=1 last=2@1 commit=1@1 vote=Some(1) n2@1 entry 1@1 None n2@1 entry 2@1 put a=1 # Elect n5 as a new majority partition leader. It appends an empty entry. (campaign 5) (stabilize heartbeat=true) status --- n1@1 leader last=2@1 commit=1@1 applied=1 progress={2:2→3 3:1→3 4:1→3 5:1→3} n2@1 follower(n1) last=2@1 commit=1@1 applied=1 n3@2 follower(n5) last=2@2 commit=2@2 applied=2 n4@2 follower(n5) last=2@2 commit=2@2 applied=2 n5@2 leader last=2@2 commit=2@2 applied=2 progress={1:0→3 2:0→3 3:2→3 4:2→3} # Heal the partition and propose a new write. 
heal put 5 b=2 --- n1 n2 n3 n4 n5 fully connected c5@2 → n5 ClientRequest id=0x02 write 0x0101620132 n5@2 append 3@2 put b=2 n5@2 → n1 Append base=2@2 [3@2] n5@2 → n2 Append base=2@2 [3@2] n5@2 → n3 Append base=2@2 [3@2] n5@2 → n4 Append base=2@2 [3@2] # Delivering the append messages to n1,n2 will make them follow n5 and # reject the appends due to a log mismatch. deliver 1 2 --- n1@1 leader ⇨ n1@2 follower(n5) n1@1 → c1 ClientResponse id=0x01 Error::Abort c1@1 put a=1 ⇒ Error::Abort (operation aborted) n1@2 → n5 AppendResponse reject_index=2 n2@1 follower(n1) ⇨ n2@2 follower(n5) n2@2 → n5 AppendResponse reject_index=2 # n5 probes index 1, which succeeds. n1 and n2 still have the old logs. deliver 5 deliver 1 2 --- n5@2 → n1 Append base=1@1 [] n5@2 → n2 Append base=1@1 [] n1@2 → n5 AppendResponse match_index=1 n2@2 → n5 AppendResponse match_index=1 log 1 2 --- n1@2 term=2 last=2@1 commit=1@1 vote=None n1@2 entry 1@1 None n1@2 entry 2@1 put a=1 n2@2 term=2 last=2@1 commit=1@1 vote=None n2@2 entry 1@1 None n2@2 entry 2@1 put a=1 # n5 now replicates the tail of its log, which replaces the old logs. deliver 5 deliver 1 2 --- n5@2 → n1 Append base=1@1 [2@2 3@2] n5@2 → n2 Append base=1@1 [2@2 3@2] n1@2 append 2@2 None n1@2 append 3@2 put b=2 n1@2 → n5 AppendResponse match_index=3 n2@2 append 2@2 None n2@2 append 3@2 put b=2 n2@2 → n5 AppendResponse match_index=3 log 1 2 --- n1@2 term=2 last=3@2 commit=1@1 vote=None n1@2 entry 1@1 None n1@2 entry 2@2 None n1@2 entry 3@2 put b=2 n2@2 term=2 last=3@2 commit=1@1 vote=None n2@2 entry 1@1 None n2@2 entry 2@2 None n2@2 entry 3@2 put b=2 # Stabilize the cluster.
(stabilize heartbeat=true) status --- n1@2 follower(n5) last=3@2 commit=3@2 applied=3 n2@2 follower(n5) last=3@2 commit=3@2 applied=3 n3@2 follower(n5) last=3@2 commit=3@2 applied=3 n4@2 follower(n5) last=3@2 commit=3@2 applied=3 n5@2 leader last=3@2 commit=3@2 applied=3 progress={1:3→4 2:3→4 3:3→4 4:3→4} ================================================ FILE: src/raft/testscripts/node/append_response_beyond_last_index_panics ================================================ # A successful AppendResponse with last index beyond leader's last log # should panic. cluster nodes=3 leader=1 --- n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2} n2@1 follower(n1) last=1@1 commit=1@1 applied=1 n3@1 follower(n1) last=1@1 commit=1@1 applied=1 # Propose a write. put 1 foo=bar --- c1@1 → n1 ClientRequest id=0x01 write 0x0103666f6f03626172 n1@1 append 2@1 put foo=bar n1@1 → n2 Append base=1@1 [2@1] n1@1 → n3 Append base=1@1 [2@1] # An AppendResponse beyond leader's last log should panic. !step 1 '{"from":2, "to":1, "term":1, "message":{"AppendResponse":{"match_index":3,"reject_index":0}}}' --- Panic: future match index ================================================ FILE: src/raft/testscripts/node/append_response_stale_reject ================================================ # An AppendResponse rejection with a reject_index below the match index # should be ignored. cluster nodes=3 leader=1 --- n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2} n2@1 follower(n1) last=1@1 commit=1@1 applied=1 n3@1 follower(n1) last=1@1 commit=1@1 applied=1 # Replicate a write. (put 1 a=1) (stabilize heartbeat=true) status --- n1@1 leader last=2@1 commit=2@1 applied=2 progress={2:2→3 3:2→3} n2@1 follower(n1) last=2@1 commit=2@1 applied=2 n3@1 follower(n1) last=2@1 commit=2@1 applied=2 # Propose a few writes.
(put 1 b=2) (put 1 c=3) status --- n1@1 leader last=4@1 commit=2@1 applied=2 progress={2:2→5 3:2→5} n2@1 follower(n1) last=2@1 commit=2@1 applied=2 n3@1 follower(n1) last=2@1 commit=2@1 applied=2 # A reject_index below the follower's progress match index is ignored. step 1 '{"from":2,"to":1,"term":1,"message":{"AppendResponse":{"match_index":0,"reject_index":2}}}' status --- n1@1 leader last=4@1 commit=2@1 applied=2 progress={2:2→5 3:2→5} n2@1 follower(n1) last=2@1 commit=2@1 applied=2 n3@1 follower(n1) last=2@1 commit=2@1 applied=2 step 1 '{"from":2,"to":1,"term":1,"message":{"AppendResponse":{"match_index":0,"reject_index":1}}}' status --- n1@1 leader last=4@1 commit=2@1 applied=2 progress={2:2→5 3:2→5} n2@1 follower(n1) last=2@1 commit=2@1 applied=2 n3@1 follower(n1) last=2@1 commit=2@1 applied=2 # The writes are still replicated without any probes. stabilize --- n2@1 append 3@1 put b=2 n2@1 → n1 AppendResponse match_index=3 n2@1 append 4@1 put c=3 n2@1 → n1 AppendResponse match_index=4 n3@1 append 3@1 put b=2 n3@1 → n1 AppendResponse match_index=3 n3@1 append 4@1 put c=3 n3@1 → n1 AppendResponse match_index=4 n1@1 commit 3@1 n1@1 apply 3@1 put b=2 n1@1 → c1 ClientResponse id=0x02 write 0x0103 c1@1 put b=2 ⇒ 3 n1@1 commit 4@1 n1@1 apply 4@1 put c=3 n1@1 → c1 ClientResponse id=0x03 write 0x0104 c1@1 put c=3 ⇒ 4 ================================================ FILE: src/raft/testscripts/node/election ================================================ # A node campaigns and wins leadership once the election timeout passes. Uses # ticks directly to also test tick handling. cluster nodes=3 heartbeat_interval=1 election_timeout=2 --- n1@0 follower() last=0@0 commit=0@0 applied=0 n2@0 follower() last=0@0 commit=0@0 applied=0 n3@0 follower() last=0@0 commit=0@0 applied=0 # Tick all nodes. Then tick n1 again to make it campaign. tick --- ok tick 1 --- n1@0 follower() ⇨ n1@1 candidate n1@1 → n2 Campaign last=0@0 n1@1 → n3 Campaign last=0@0 # n2,n3 grant n1 their votes. 
deliver --- n2@0 follower() ⇨ n2@1 follower() n2@1 → n1 CampaignResponse vote=true n3@0 follower() ⇨ n3@1 follower() n3@1 → n1 CampaignResponse vote=true # n1 wins the election and becomes leader. deliver --- n1@1 candidate ⇨ n1@1 leader n1@1 append 1@1 None n1@1 → n2 Append base=0@0 [1@1] n1@1 → n3 Append base=0@0 [1@1] n1@1 → n2 Heartbeat last_index=1 commit_index=0 read_seq=0 n1@1 → n3 Heartbeat last_index=1 commit_index=0 read_seq=0 # All nodes become n1 followers. stabilize --- n2@1 follower() ⇨ n2@1 follower(n1) n2@1 append 1@1 None n2@1 → n1 AppendResponse match_index=1 n2@1 → n1 HeartbeatResponse match_index=1 read_seq=0 n3@1 follower() ⇨ n3@1 follower(n1) n3@1 append 1@1 None n3@1 → n1 AppendResponse match_index=1 n3@1 → n1 HeartbeatResponse match_index=1 read_seq=0 n1@1 commit 1@1 n1@1 apply 1@1 None # n1's heartbeats are accepted by followers, who commit and apply the entry. tick 1 --- n1@1 → n2 Heartbeat last_index=1 commit_index=1 read_seq=0 n1@1 → n3 Heartbeat last_index=1 commit_index=1 read_seq=0 stabilize --- n2@1 commit 1@1 n2@1 apply 1@1 None n2@1 → n1 HeartbeatResponse match_index=1 read_seq=0 n3@1 commit 1@1 n3@1 apply 1@1 None n3@1 → n1 HeartbeatResponse match_index=1 read_seq=0 status --- n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2} n2@1 follower(n1) last=1@1 commit=1@1 applied=1 n3@1 follower(n1) last=1@1 commit=1@1 applied=1 ================================================ FILE: src/raft/testscripts/node/election_candidate_behind_leader ================================================ # A candidate that lags behind the leader can still win the election # as long as it isn't behind the quorum. 
cluster nodes=5 leader=1 --- n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2 4:1→2 5:1→2} n2@1 follower(n1) last=1@1 commit=1@1 applied=1 n3@1 follower(n1) last=1@1 commit=1@1 applied=1 n4@1 follower(n1) last=1@1 commit=1@1 applied=1 n5@1 follower(n1) last=1@1 commit=1@1 applied=1 # Partition n1+n2 away from the cluster. partition 1 2 --- n1 n2 ⇹ n3 n4 n5 # Replicate a write on n1+n2. The write can't be committed, because n1 doesn't # have quorum. (put 1 foo=bar) (stabilize) status --- n1@1 leader last=2@1 commit=1@1 applied=1 progress={2:2→3 3:1→3 4:1→3 5:1→3} n2@1 follower(n1) last=2@1 commit=1@1 applied=1 n3@1 follower(n1) last=1@1 commit=1@1 applied=1 n4@1 follower(n1) last=1@1 commit=1@1 applied=1 n5@1 follower(n1) last=1@1 commit=1@1 applied=1 # List the logs on n1 n2 n3 to show the replicated but uncommitted entry. log 1 2 3 --- n1@1 term=1 last=2@1 commit=1@1 vote=Some(1) n1@1 entry 1@1 None n1@1 entry 2@1 put foo=bar n2@1 term=1 last=2@1 commit=1@1 vote=Some(1) n2@1 entry 1@1 None n2@1 entry 2@1 put foo=bar n3@1 term=1 last=1@1 commit=1@1 vote=Some(1) n3@1 entry 1@1 None # Heal the partition. heal --- n1 n2 n3 n4 n5 fully connected # Make n5 campaign. n3+n4 grant their votes, n1+n2 reject it. n1 aborts the # in-flight write request because the term changes. campaign 5 deliver --- n5@1 follower(n1) ⇨ n5@2 candidate n5@2 → n1 Campaign last=1@1 n5@2 → n2 Campaign last=1@1 n5@2 → n3 Campaign last=1@1 n5@2 → n4 Campaign last=1@1 n1@1 leader ⇨ n1@2 follower() n1@1 → c1 ClientResponse id=0x01 Error::Abort c1@1 put foo=bar ⇒ Error::Abort (operation aborted) n1@2 → n5 CampaignResponse vote=false n2@1 follower(n1) ⇨ n2@2 follower() n2@2 → n5 CampaignResponse vote=false n3@1 follower(n1) ⇨ n3@2 follower() n3@2 → n5 CampaignResponse vote=true n4@1 follower(n1) ⇨ n4@2 follower() n4@2 → n5 CampaignResponse vote=true # n5 wins the election and becomes leader.
stabilize heartbeat=true --- n5@2 candidate ⇨ n5@2 leader n5@2 append 2@2 None n5@2 → n1 Append base=1@1 [2@2] n5@2 → n2 Append base=1@1 [2@2] n5@2 → n3 Append base=1@1 [2@2] n5@2 → n4 Append base=1@1 [2@2] n5@2 → n1 Heartbeat last_index=2 commit_index=1 read_seq=0 n5@2 → n2 Heartbeat last_index=2 commit_index=1 read_seq=0 n5@2 → n3 Heartbeat last_index=2 commit_index=1 read_seq=0 n5@2 → n4 Heartbeat last_index=2 commit_index=1 read_seq=0 n1@2 follower() ⇨ n1@2 follower(n5) n1@2 append 2@2 None n1@2 → n5 AppendResponse match_index=2 n1@2 → n5 HeartbeatResponse match_index=2 read_seq=0 n2@2 follower() ⇨ n2@2 follower(n5) n2@2 append 2@2 None n2@2 → n5 AppendResponse match_index=2 n2@2 → n5 HeartbeatResponse match_index=2 read_seq=0 n3@2 follower() ⇨ n3@2 follower(n5) n3@2 append 2@2 None n3@2 → n5 AppendResponse match_index=2 n3@2 → n5 HeartbeatResponse match_index=2 read_seq=0 n4@2 follower() ⇨ n4@2 follower(n5) n4@2 append 2@2 None n4@2 → n5 AppendResponse match_index=2 n4@2 → n5 HeartbeatResponse match_index=2 read_seq=0 n5@2 commit 2@2 n5@2 apply 2@2 None n5@2 → n1 Heartbeat last_index=2 commit_index=2 read_seq=0 n5@2 → n2 Heartbeat last_index=2 commit_index=2 read_seq=0 n5@2 → n3 Heartbeat last_index=2 commit_index=2 read_seq=0 n5@2 → n4 Heartbeat last_index=2 commit_index=2 read_seq=0 n1@2 commit 2@2 n1@2 apply 2@2 None n1@2 → n5 HeartbeatResponse match_index=2 read_seq=0 n2@2 commit 2@2 n2@2 apply 2@2 None n2@2 → n5 HeartbeatResponse match_index=2 read_seq=0 n3@2 commit 2@2 n3@2 apply 2@2 None n3@2 → n5 HeartbeatResponse match_index=2 read_seq=0 n4@2 commit 2@2 n4@2 apply 2@2 None n4@2 → n5 HeartbeatResponse match_index=2 read_seq=0 # n1+n2's in-flight write at log position 2 has been replaced by the # empty log entry appended by n5 when it became leader. 
log 1 2 --- n1@2 term=2 last=2@2 commit=2@2 vote=None n1@2 entry 1@1 None n1@2 entry 2@2 None n2@2 term=2 last=2@2 commit=2@2 vote=None n2@2 entry 1@1 None n2@2 entry 2@2 None status --- n1@2 follower(n5) last=2@2 commit=2@2 applied=2 n2@2 follower(n5) last=2@2 commit=2@2 applied=2 n3@2 follower(n5) last=2@2 commit=2@2 applied=2 n4@2 follower(n5) last=2@2 commit=2@2 applied=2 n5@2 leader last=2@2 commit=2@2 applied=2 progress={1:2→3 2:2→3 3:2→3 4:2→3} ================================================ FILE: src/raft/testscripts/node/election_candidate_behind_quorum ================================================ # A candidate that lags behind the quorum can't win an election. cluster nodes=5 leader=1 --- n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2 4:1→2 5:1→2} n2@1 follower(n1) last=1@1 commit=1@1 applied=1 n3@1 follower(n1) last=1@1 commit=1@1 applied=1 n4@1 follower(n1) last=1@1 commit=1@1 applied=1 n5@1 follower(n1) last=1@1 commit=1@1 applied=1 # Partition n4+n5 away from the cluster. partition 4 5 --- n4 n5 ⇹ n1 n2 n3 # Replicate a write on n1. n4+n5 now lag behind the quorum. Don't yet propagate # the commit index to n2+n3, to make sure it won't grant the vote just because # n5 is caught up to their local view of the commit index. (put 1 foo=bar) (stabilize) status --- n1@1 leader last=2@1 commit=2@1 applied=2 progress={2:2→3 3:2→3 4:1→3 5:1→3} n2@1 follower(n1) last=2@1 commit=1@1 applied=1 n3@1 follower(n1) last=2@1 commit=1@1 applied=1 n4@1 follower(n1) last=1@1 commit=1@1 applied=1 n5@1 follower(n1) last=1@1 commit=1@1 applied=1 # Heal the partition. heal --- n1 n2 n3 n4 n5 fully connected # Make n5 campaign. n4 grants its vote, but the others reject it because it is # behind the quorum. However, the term bump will convert the other nodes to # leaderless followers. 
heal campaign 5 stabilize --- n1 n2 n3 n4 n5 fully connected n5@1 follower(n1) ⇨ n5@2 candidate n5@2 → n1 Campaign last=1@1 n5@2 → n2 Campaign last=1@1 n5@2 → n3 Campaign last=1@1 n5@2 → n4 Campaign last=1@1 n1@1 leader ⇨ n1@2 follower() n1@2 → n5 CampaignResponse vote=false n2@1 follower(n1) ⇨ n2@2 follower() n2@2 → n5 CampaignResponse vote=false n3@1 follower(n1) ⇨ n3@2 follower() n3@2 → n5 CampaignResponse vote=false n4@1 follower(n1) ⇨ n4@2 follower() n4@2 → n5 CampaignResponse vote=true status --- n1@2 follower() last=2@1 commit=2@1 applied=2 n2@2 follower() last=2@1 commit=1@1 applied=1 n3@2 follower() last=2@1 commit=1@1 applied=1 n4@2 follower() last=1@1 commit=1@1 applied=1 n5@2 candidate last=1@1 commit=1@1 applied=1 # n2 can campaign and win the election. (campaign 2) (stabilize heartbeat=true) status --- n1@3 follower(n2) last=3@3 commit=3@3 applied=3 n2@3 leader last=3@3 commit=3@3 applied=3 progress={1:3→4 3:3→4 4:3→4 5:3→4} n3@3 follower(n2) last=3@3 commit=3@3 applied=3 n4@3 follower(n2) last=3@3 commit=3@3 applied=3 n5@3 follower(n2) last=3@3 commit=3@3 applied=3 ================================================ FILE: src/raft/testscripts/node/election_contested ================================================ # A leader can be elected even when there are multiple candidates. cluster nodes=5 election_timeout=2 --- n1@0 follower() last=0@0 commit=0@0 applied=0 n2@0 follower() last=0@0 commit=0@0 applied=0 n3@0 follower() last=0@0 commit=0@0 applied=0 n4@0 follower() last=0@0 commit=0@0 applied=0 n5@0 follower() last=0@0 commit=0@0 applied=0 # n1 and n5 campaign. tick tick 1 5 --- n1@0 follower() ⇨ n1@1 candidate n1@1 → n2 Campaign last=0@0 n1@1 → n3 Campaign last=0@0 n1@1 → n4 Campaign last=0@0 n1@1 → n5 Campaign last=0@0 n5@0 follower() ⇨ n5@1 candidate n5@1 → n1 Campaign last=0@0 n5@1 → n2 Campaign last=0@0 n5@1 → n3 Campaign last=0@0 n5@1 → n4 Campaign last=0@0 # n1 and n5 ignore each other, since they're both campaigning. 
deliver 1 5 --- n1@1 → n5 CampaignResponse vote=false n5@1 → n1 CampaignResponse vote=false # n1 reaches n2,n3 first, but n5 reaches n4 first. deliver 2 3 deliver 4 from=5 deliver 4 --- n2@0 follower() ⇨ n2@1 follower() n2@1 → n1 CampaignResponse vote=true n2@1 → n5 CampaignResponse vote=false n3@0 follower() ⇨ n3@1 follower() n3@1 → n1 CampaignResponse vote=true n3@1 → n5 CampaignResponse vote=false n4@0 follower() ⇨ n4@1 follower() n4@1 → n5 CampaignResponse vote=true n4@1 → n1 CampaignResponse vote=false # n1 and n5 receive their votes. n1 has quorum and becomes leader. deliver --- n1@1 candidate ⇨ n1@1 leader n1@1 append 1@1 None n1@1 → n2 Append base=0@0 [1@1] n1@1 → n3 Append base=0@0 [1@1] n1@1 → n4 Append base=0@0 [1@1] n1@1 → n5 Append base=0@0 [1@1] n1@1 → n2 Heartbeat last_index=1 commit_index=0 read_seq=0 n1@1 → n3 Heartbeat last_index=1 commit_index=0 read_seq=0 n1@1 → n4 Heartbeat last_index=1 commit_index=0 read_seq=0 n1@1 → n5 Heartbeat last_index=1 commit_index=0 read_seq=0 # All nodes accept n1 as leader in term 1 and become followers. 
stabilize --- n2@1 follower() ⇨ n2@1 follower(n1) n2@1 append 1@1 None n2@1 → n1 AppendResponse match_index=1 n2@1 → n1 HeartbeatResponse match_index=1 read_seq=0 n3@1 follower() ⇨ n3@1 follower(n1) n3@1 append 1@1 None n3@1 → n1 AppendResponse match_index=1 n3@1 → n1 HeartbeatResponse match_index=1 read_seq=0 n4@1 follower() ⇨ n4@1 follower(n1) n4@1 append 1@1 None n4@1 → n1 AppendResponse match_index=1 n4@1 → n1 HeartbeatResponse match_index=1 read_seq=0 n5@1 candidate ⇨ n5@1 follower(n1) n5@1 append 1@1 None n5@1 → n1 AppendResponse match_index=1 n5@1 → n1 HeartbeatResponse match_index=1 read_seq=0 n1@1 commit 1@1 n1@1 apply 1@1 None status --- n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2 4:1→2 5:1→2} n2@1 follower(n1) last=1@1 commit=0@0 applied=0 n3@1 follower(n1) last=1@1 commit=0@0 applied=0 n4@1 follower(n1) last=1@1 commit=0@0 applied=0 n5@1 follower(n1) last=1@1 commit=0@0 applied=0 ================================================ FILE: src/raft/testscripts/node/election_tie ================================================ # No leader can be elected with an election tie. cluster nodes=3 election_timeout=2 --- n1@0 follower() last=0@0 commit=0@0 applied=0 n2@0 follower() last=0@0 commit=0@0 applied=0 n3@0 follower() last=0@0 commit=0@0 applied=0 # Tick all nodes twice to make them all campaign. tick tick --- n1@0 follower() ⇨ n1@1 candidate n1@1 → n2 Campaign last=0@0 n1@1 → n3 Campaign last=0@0 n2@0 follower() ⇨ n2@1 candidate n2@1 → n1 Campaign last=0@0 n2@1 → n3 Campaign last=0@0 n3@0 follower() ⇨ n3@1 candidate n3@1 → n1 Campaign last=0@0 n3@1 → n2 Campaign last=0@0 # Stabilizing the cluster will not result in a leader. 
stabilize --- n1@1 → n2 CampaignResponse vote=false n1@1 → n3 CampaignResponse vote=false n2@1 → n1 CampaignResponse vote=false n2@1 → n3 CampaignResponse vote=false n3@1 → n1 CampaignResponse vote=false n3@1 → n2 CampaignResponse vote=false status --- n1@1 candidate last=0@0 commit=0@0 applied=0 n2@1 candidate last=0@0 commit=0@0 applied=0 n3@1 candidate last=0@0 commit=0@0 applied=0 # A node can call another election in a new term and win. tick 2 tick 2 --- n2@1 candidate ⇨ n2@2 candidate n2@2 → n1 Campaign last=0@0 n2@2 → n3 Campaign last=0@0 deliver --- n1@1 candidate ⇨ n1@2 follower() n1@2 → n2 CampaignResponse vote=true n3@1 candidate ⇨ n3@2 follower() n3@2 → n2 CampaignResponse vote=true deliver --- n2@2 candidate ⇨ n2@2 leader n2@2 append 1@2 None n2@2 → n1 Append base=0@0 [1@2] n2@2 → n3 Append base=0@0 [1@2] n2@2 → n1 Heartbeat last_index=1 commit_index=0 read_seq=0 n2@2 → n3 Heartbeat last_index=1 commit_index=0 read_seq=0 stabilize --- n1@2 follower() ⇨ n1@2 follower(n2) n1@2 append 1@2 None n1@2 → n2 AppendResponse match_index=1 n1@2 → n2 HeartbeatResponse match_index=1 read_seq=0 n3@2 follower() ⇨ n3@2 follower(n2) n3@2 append 1@2 None n3@2 → n2 AppendResponse match_index=1 n3@2 → n2 HeartbeatResponse match_index=1 read_seq=0 n2@2 commit 1@2 n2@2 apply 1@2 None status --- n1@2 follower(n2) last=1@2 commit=0@0 applied=0 n2@2 leader last=1@2 commit=1@2 applied=1 progress={1:1→2 3:1→2} n3@2 follower(n2) last=1@2 commit=0@0 applied=0 ================================================ FILE: src/raft/testscripts/node/election_tie_even ================================================ # No leader can be elected with an election tie between an even number of nodes. cluster nodes=4 election_timeout=2 --- n1@0 follower() last=0@0 commit=0@0 applied=0 n2@0 follower() last=0@0 commit=0@0 applied=0 n3@0 follower() last=0@0 commit=0@0 applied=0 n4@0 follower() last=0@0 commit=0@0 applied=0 # n1 and n4 campaign. 
tick tick 1 4 --- n1@0 follower() ⇨ n1@1 candidate n1@1 → n2 Campaign last=0@0 n1@1 → n3 Campaign last=0@0 n1@1 → n4 Campaign last=0@0 n4@0 follower() ⇨ n4@1 candidate n4@1 → n1 Campaign last=0@0 n4@1 → n2 Campaign last=0@0 n4@1 → n3 Campaign last=0@0 # n2 votes for n1, n3 votes for n4. deliver 2 deliver 3 from=4 deliver 3 --- n2@0 follower() ⇨ n2@1 follower() n2@1 → n1 CampaignResponse vote=true n2@1 → n4 CampaignResponse vote=false n3@0 follower() ⇨ n3@1 follower() n3@1 → n4 CampaignResponse vote=true n3@1 → n1 CampaignResponse vote=false # Stabilizing the cluster will not result in a leader. stabilize --- n1@1 → n4 CampaignResponse vote=false n4@1 → n1 CampaignResponse vote=false status --- n1@1 candidate last=0@0 commit=0@0 applied=0 n2@1 follower() last=0@0 commit=0@0 applied=0 n3@1 follower() last=0@0 commit=0@0 applied=0 n4@1 candidate last=0@0 commit=0@0 applied=0 # A node can call another election in a new term and win. tick 3 tick 3 --- n3@1 follower() ⇨ n3@2 candidate n3@2 → n1 Campaign last=0@0 n3@2 → n2 Campaign last=0@0 n3@2 → n4 Campaign last=0@0 deliver --- n1@1 candidate ⇨ n1@2 follower() n1@2 → n3 CampaignResponse vote=true n2@1 follower() ⇨ n2@2 follower() n2@2 → n3 CampaignResponse vote=true n4@1 candidate ⇨ n4@2 follower() n4@2 → n3 CampaignResponse vote=true deliver --- n3@2 candidate ⇨ n3@2 leader n3@2 append 1@2 None n3@2 → n1 Append base=0@0 [1@2] n3@2 → n2 Append base=0@0 [1@2] n3@2 → n4 Append base=0@0 [1@2] n3@2 → n1 Heartbeat last_index=1 commit_index=0 read_seq=0 n3@2 → n2 Heartbeat last_index=1 commit_index=0 read_seq=0 n3@2 → n4 Heartbeat last_index=1 commit_index=0 read_seq=0 stabilize --- n1@2 follower() ⇨ n1@2 follower(n3) n1@2 append 1@2 None n1@2 → n3 AppendResponse match_index=1 n1@2 → n3 HeartbeatResponse match_index=1 read_seq=0 n2@2 follower() ⇨ n2@2 follower(n3) n2@2 append 1@2 None n2@2 → n3 AppendResponse match_index=1 n2@2 → n3 HeartbeatResponse match_index=1 read_seq=0 n4@2 follower() ⇨ n4@2 follower(n3) n4@2 append 1@2 
None n4@2 → n3 AppendResponse match_index=1 n4@2 → n3 HeartbeatResponse match_index=1 read_seq=0 n3@2 commit 1@2 n3@2 apply 1@2 None status --- n1@2 follower(n3) last=1@2 commit=0@0 applied=0 n2@2 follower(n3) last=1@2 commit=0@0 applied=0 n3@2 leader last=1@2 commit=1@2 applied=1 progress={1:1→2 2:1→2 4:1→2} n4@2 follower(n3) last=1@2 commit=0@0 applied=0 ================================================ FILE: src/raft/testscripts/node/heartbeat_commits_follower ================================================ # A heartbeat will commit and apply an entry on a follower. cluster nodes=3 leader=1 --- n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2} n2@1 follower(n1) last=1@1 commit=1@1 applied=1 n3@1 follower(n1) last=1@1 commit=1@1 applied=1 # Write on the leader, which replicates then commits and applies locally. put 1 foo=bar stabilize --- c1@1 → n1 ClientRequest id=0x01 write 0x0103666f6f03626172 n1@1 append 2@1 put foo=bar n1@1 → n2 Append base=1@1 [2@1] n1@1 → n3 Append base=1@1 [2@1] n2@1 append 2@1 put foo=bar n2@1 → n1 AppendResponse match_index=2 n3@1 append 2@1 put foo=bar n3@1 → n1 AppendResponse match_index=2 n1@1 commit 2@1 n1@1 apply 2@1 put foo=bar n1@1 → c1 ClientResponse id=0x01 write 0x0102 c1@1 put foo=bar ⇒ 2 # The write has been replicated, but not yet committed and applied on followers. status --- n1@1 leader last=2@1 commit=2@1 applied=2 progress={2:2→3 3:2→3} n2@1 follower(n1) last=2@1 commit=1@1 applied=1 n3@1 follower(n1) last=2@1 commit=1@1 applied=1 # A heartbeat commits and applies on followers. 
heartbeat 1 stabilize --- n1@1 → n2 Heartbeat last_index=2 commit_index=2 read_seq=0 n1@1 → n3 Heartbeat last_index=2 commit_index=2 read_seq=0 n2@1 commit 2@1 n2@1 apply 2@1 put foo=bar n2@1 → n1 HeartbeatResponse match_index=2 read_seq=0 n3@1 commit 2@1 n3@1 apply 2@1 put foo=bar n3@1 → n1 HeartbeatResponse match_index=2 read_seq=0 status --- n1@1 leader last=2@1 commit=2@1 applied=2 progress={2:2→3 3:2→3} n2@1 follower(n1) last=2@1 commit=2@1 applied=2 n3@1 follower(n1) last=2@1 commit=2@1 applied=2 ================================================ FILE: src/raft/testscripts/node/heartbeat_converts_candidate ================================================ # A heartbeat from a leader should convert a candidate in the same term to a # follower. cluster nodes=3 --- n1@0 follower() last=0@0 commit=0@0 applied=0 n2@0 follower() last=0@0 commit=0@0 applied=0 n3@0 follower() last=0@0 commit=0@0 applied=0 # Partition n3 away from the cluster. partition 3 --- n3 ⇹ n1 n2 # Both n1 and n3 campaign. n2 votes for n1. campaign 1 3 deliver --- n1@0 follower() ⇨ n1@1 candidate n1@1 → n2 Campaign last=0@0 n1@1 ⇥ n3 C̶a̶m̶p̶a̶i̶g̶n̶ ̶l̶a̶s̶t̶=̶0̶@̶0̶ n3@0 follower() ⇨ n3@1 candidate n3@1 ⇥ n1 C̶a̶m̶p̶a̶i̶g̶n̶ ̶l̶a̶s̶t̶=̶0̶@̶0̶ n3@1 ⇥ n2 C̶a̶m̶p̶a̶i̶g̶n̶ ̶l̶a̶s̶t̶=̶0̶@̶0̶ n2@0 follower() ⇨ n2@1 follower() n2@1 → n1 CampaignResponse vote=true # n1 assumes leadership and heartbeats, committing entry 1. (stabilize heartbeat=true) status --- n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:0→2} n2@1 follower(n1) last=1@1 commit=1@1 applied=1 n3@1 candidate last=0@0 commit=0@0 applied=0 # Heal the partition. heal --- n1 n2 n3 fully connected # The next heartbeat from n1 converts n3 to a follower in term 1. 
heartbeat 1 stabilize --- n1@1 → n2 Heartbeat last_index=1 commit_index=1 read_seq=0 n1@1 → n3 Heartbeat last_index=1 commit_index=1 read_seq=0 n2@1 → n1 HeartbeatResponse match_index=1 read_seq=0 n3@1 candidate ⇨ n3@1 follower(n1) n3@1 → n1 HeartbeatResponse match_index=0 read_seq=0 n1@1 → n3 Append base=0@0 [1@1] n3@1 append 1@1 None n3@1 → n1 AppendResponse match_index=1 status --- n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2} n2@1 follower(n1) last=1@1 commit=1@1 applied=1 n3@1 follower(n1) last=1@1 commit=0@0 applied=0 ================================================ FILE: src/raft/testscripts/node/heartbeat_converts_follower ================================================ # A heartbeat from a leader should convert a follower of a different leader in a # past term to a follower. cluster nodes=3 leader=1 --- n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2} n2@1 follower(n1) last=1@1 commit=1@1 applied=1 n3@1 follower(n1) last=1@1 commit=1@1 applied=1 # Partition n2 away from the cluster. partition 2 --- n2 ⇹ n1 n3 # Elect n3 as a new leader. (campaign 3) (stabilize heartbeat=true) status --- n1@2 follower(n3) last=2@2 commit=2@2 applied=2 n2@1 follower(n1) last=1@1 commit=1@1 applied=1 n3@2 leader last=2@2 commit=2@2 applied=2 progress={1:2→3 2:0→3} # Heal the partition. heal --- n1 n2 n3 fully connected # The next heartbeat from n3 converts n2 to a follower in term 2. 
heartbeat 3 stabilize heartbeat=true --- n3@2 → n1 Heartbeat last_index=2 commit_index=2 read_seq=0 n3@2 → n2 Heartbeat last_index=2 commit_index=2 read_seq=0 n1@2 → n3 HeartbeatResponse match_index=2 read_seq=0 n2@1 follower(n1) ⇨ n2@2 follower(n3) n2@2 → n3 HeartbeatResponse match_index=0 read_seq=0 n3@2 → n2 Append base=1@1 [] n2@2 → n3 AppendResponse match_index=1 n3@2 → n2 Append base=1@1 [2@2] n2@2 append 2@2 None n2@2 → n3 AppendResponse match_index=2 n3@2 → n1 Heartbeat last_index=2 commit_index=2 read_seq=0 n3@2 → n2 Heartbeat last_index=2 commit_index=2 read_seq=0 n1@2 → n3 HeartbeatResponse match_index=2 read_seq=0 n2@2 commit 2@2 n2@2 apply 2@2 None n2@2 → n3 HeartbeatResponse match_index=2 read_seq=0 status --- n1@2 follower(n3) last=2@2 commit=2@2 applied=2 n2@2 follower(n3) last=2@2 commit=2@2 applied=2 n3@2 leader last=2@2 commit=2@2 applied=2 progress={1:2→3 2:2→3} ================================================ FILE: src/raft/testscripts/node/heartbeat_converts_follower_leaderless ================================================ # A heartbeat from a leader should convert a leaderless follower. cluster nodes=3 --- n1@0 follower() last=0@0 commit=0@0 applied=0 n2@0 follower() last=0@0 commit=0@0 applied=0 n3@0 follower() last=0@0 commit=0@0 applied=0 # Partition n3 away from the cluster. partition 3 --- n3 ⇹ n1 n2 # Elect n1 as a new leader. (campaign 1) (stabilize heartbeat=true) status --- n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:0→2} n2@1 follower(n1) last=1@1 commit=1@1 applied=1 n3@0 follower() last=0@0 commit=0@0 applied=0 # Heal the partition. heal --- n1 n2 n3 fully connected # The next heartbeat from n1 converts n3 to a follower in term 1. 
heartbeat 1 stabilize --- n1@1 → n2 Heartbeat last_index=1 commit_index=1 read_seq=0 n1@1 → n3 Heartbeat last_index=1 commit_index=1 read_seq=0 n2@1 → n1 HeartbeatResponse match_index=1 read_seq=0 n3@0 follower() ⇨ n3@1 follower(n1) n3@1 → n1 HeartbeatResponse match_index=0 read_seq=0 n1@1 → n3 Append base=0@0 [1@1] n3@1 append 1@1 None n3@1 → n1 AppendResponse match_index=1 status --- n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2} n2@1 follower(n1) last=1@1 commit=1@1 applied=1 n3@1 follower(n1) last=1@1 commit=0@0 applied=0 ================================================ FILE: src/raft/testscripts/node/heartbeat_converts_leader ================================================ # A heartbeat from a leader should convert a leader in a past term to a # follower. cluster nodes=3 leader=3 --- n1@1 follower(n3) last=1@1 commit=1@1 applied=1 n2@1 follower(n3) last=1@1 commit=1@1 applied=1 n3@1 leader last=1@1 commit=1@1 applied=1 progress={1:1→2 2:1→2} # Partition n3 away from the cluster. partition 3 --- n3 ⇹ n1 n2 # Elect n1 as a new leader. (campaign 1) (stabilize heartbeat=true) status --- n1@2 leader last=2@2 commit=2@2 applied=2 progress={2:2→3 3:0→3} n2@2 follower(n1) last=2@2 commit=2@2 applied=2 n3@1 leader last=1@1 commit=1@1 applied=1 progress={1:1→2 2:1→2} # Heal the partition. heal --- n1 n2 n3 fully connected # The next heartbeat from n1 converts n3 to a follower in term 2. 
heartbeat 1 stabilize heartbeat=true --- n1@2 → n2 Heartbeat last_index=2 commit_index=2 read_seq=0 n1@2 → n3 Heartbeat last_index=2 commit_index=2 read_seq=0 n2@2 → n1 HeartbeatResponse match_index=2 read_seq=0 n3@1 leader ⇨ n3@2 follower(n1) n3@2 → n1 HeartbeatResponse match_index=0 read_seq=0 n1@2 → n3 Append base=1@1 [] n3@2 → n1 AppendResponse match_index=1 n1@2 → n3 Append base=1@1 [2@2] n3@2 append 2@2 None n3@2 → n1 AppendResponse match_index=2 n1@2 → n2 Heartbeat last_index=2 commit_index=2 read_seq=0 n1@2 → n3 Heartbeat last_index=2 commit_index=2 read_seq=0 n2@2 → n1 HeartbeatResponse match_index=2 read_seq=0 n3@2 commit 2@2 n3@2 apply 2@2 None n3@2 → n1 HeartbeatResponse match_index=2 read_seq=0 status --- n1@2 leader last=2@2 commit=2@2 applied=2 progress={2:2→3 3:2→3} n2@2 follower(n1) last=2@2 commit=2@2 applied=2 n3@2 follower(n1) last=2@2 commit=2@2 applied=2 ================================================ FILE: src/raft/testscripts/node/heartbeat_lost_append_duplicate ================================================ # Duplicate heartbeats and responses with a lost append will # trigger duplicate resends, but it will eventually resolve. cluster nodes=3 leader=1 --- n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2} n2@1 follower(n1) last=1@1 commit=1@1 applied=1 n3@1 follower(n1) last=1@1 commit=1@1 applied=1 # Partition the leader, submit a write whose appends are dropped, # then heal the partition again. partition 1 --- n1 ⇹ n2 n3 put 1 foo=bar --- c1@1 → n1 ClientRequest id=0x01 write 0x0103666f6f03626172 n1@1 append 2@1 put foo=bar n1@1 ⇥ n2 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶1̶@̶1̶ ̶[̶2̶@̶1̶]̶ n1@1 ⇥ n3 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶1̶@̶1̶ ̶[̶2̶@̶1̶]̶ heal --- n1 n2 n3 fully connected # The next heartbeat will result in match_index=0 since the followers # don't have the last_index. 3 heartbeats are made. 
heartbeat 1 heartbeat 1 heartbeat 1 deliver --- n1@1 → n2 Heartbeat last_index=2 commit_index=1 read_seq=0 n1@1 → n3 Heartbeat last_index=2 commit_index=1 read_seq=0 n1@1 → n2 Heartbeat last_index=2 commit_index=1 read_seq=0 n1@1 → n3 Heartbeat last_index=2 commit_index=1 read_seq=0 n1@1 → n2 Heartbeat last_index=2 commit_index=1 read_seq=0 n1@1 → n3 Heartbeat last_index=2 commit_index=1 read_seq=0 n2@1 → n1 HeartbeatResponse match_index=0 read_seq=0 n2@1 → n1 HeartbeatResponse match_index=0 read_seq=0 n2@1 → n1 HeartbeatResponse match_index=0 read_seq=0 n3@1 → n1 HeartbeatResponse match_index=0 read_seq=0 n3@1 → n1 HeartbeatResponse match_index=0 read_seq=0 n3@1 → n1 HeartbeatResponse match_index=0 read_seq=0 # The leader has previously matched the followers at index 1. status 1 --- n1@1 leader last=2@1 commit=1@1 applied=1 progress={2:1→3 3:1→3} # When it receives the heartbeat responses, it sends duplicates of the missing # entries. deliver --- n1@1 → n2 Append base=1@1 [2@1] n1@1 → n2 Append base=1@1 [2@1] n1@1 → n2 Append base=1@1 [2@1] n1@1 → n3 Append base=1@1 [2@1] n1@1 → n3 Append base=1@1 [2@1] n1@1 → n3 Append base=1@1 [2@1] # The followers accept the duplicate appends and the leader commits and applies. stabilize --- n2@1 append 2@1 put foo=bar n2@1 → n1 AppendResponse match_index=2 n2@1 → n1 AppendResponse match_index=2 n2@1 → n1 AppendResponse match_index=2 n3@1 append 2@1 put foo=bar n3@1 → n1 AppendResponse match_index=2 n3@1 → n1 AppendResponse match_index=2 n3@1 → n1 AppendResponse match_index=2 n1@1 commit 2@1 n1@1 apply 2@1 put foo=bar n1@1 → c1 ClientResponse id=0x01 write 0x0102 c1@1 put foo=bar ⇒ 2 ================================================ FILE: src/raft/testscripts/node/heartbeat_lost_append_multiple ================================================ # A heartbeat response triggers a probe and resend of lost appends. 
cluster nodes=3 leader=1 --- n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2} n2@1 follower(n1) last=1@1 commit=1@1 applied=1 n3@1 follower(n1) last=1@1 commit=1@1 applied=1 # Partition the leader, submit three writes whose appends are dropped, then heal # the partition again. partition 1 --- n1 ⇹ n2 n3 put 1 a=1 put 1 b=2 put 1 c=3 --- c1@1 → n1 ClientRequest id=0x01 write 0x0101610131 n1@1 append 2@1 put a=1 n1@1 ⇥ n2 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶1̶@̶1̶ ̶[̶2̶@̶1̶]̶ n1@1 ⇥ n3 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶1̶@̶1̶ ̶[̶2̶@̶1̶]̶ c1@1 → n1 ClientRequest id=0x02 write 0x0101620132 n1@1 append 3@1 put b=2 n1@1 ⇥ n2 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶2̶@̶1̶ ̶[̶3̶@̶1̶]̶ n1@1 ⇥ n3 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶2̶@̶1̶ ̶[̶3̶@̶1̶]̶ c1@1 → n1 ClientRequest id=0x03 write 0x0101630133 n1@1 append 4@1 put c=3 n1@1 ⇥ n2 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶3̶@̶1̶ ̶[̶4̶@̶1̶]̶ n1@1 ⇥ n3 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶3̶@̶1̶ ̶[̶4̶@̶1̶]̶ heal status --- n1 n2 n3 fully connected n1@1 leader last=4@1 commit=1@1 applied=1 progress={2:1→5 3:1→5} n2@1 follower(n1) last=1@1 commit=1@1 applied=1 n3@1 follower(n1) last=1@1 commit=1@1 applied=1 # The next heartbeat will result in match_index=0 since the followers # don't have the last_index. heartbeat 1 deliver --- n1@1 → n2 Heartbeat last_index=4 commit_index=1 read_seq=0 n1@1 → n3 Heartbeat last_index=4 commit_index=1 read_seq=0 n2@1 → n1 HeartbeatResponse match_index=0 read_seq=0 n3@1 → n1 HeartbeatResponse match_index=0 read_seq=0 # The leader has previously matched the followers at index 1. status 1 --- n1@1 leader last=4@1 commit=1@1 applied=1 progress={2:1→5 3:1→5} # When it receives the heartbeat response, it probes the previous index 3. deliver --- n1@1 → n2 Append base=3@1 [] n1@1 → n3 Append base=3@1 [] # The followers don't have index 3. They don't have index 2 either, but they # do have 1, so they respond with a reject_index=2. 
deliver --- n2@1 → n1 AppendResponse reject_index=2 n3@1 → n1 AppendResponse reject_index=2 # The leader has already matched index 1, so it doesn't have to probe for it, # and can simply send the tail of the log. deliver --- n1@1 → n2 Append base=1@1 [2@1 3@1 4@1] n1@1 → n3 Append base=1@1 [2@1 3@1 4@1] # The followers accept the append and the leader commits and applies. stabilize --- n2@1 append 2@1 put a=1 n2@1 append 3@1 put b=2 n2@1 append 4@1 put c=3 n2@1 → n1 AppendResponse match_index=4 n3@1 append 2@1 put a=1 n3@1 append 3@1 put b=2 n3@1 append 4@1 put c=3 n3@1 → n1 AppendResponse match_index=4 n1@1 commit 4@1 n1@1 apply 2@1 put a=1 n1@1 apply 3@1 put b=2 n1@1 apply 4@1 put c=3 n1@1 → c1 ClientResponse id=0x01 write 0x0102 c1@1 put a=1 ⇒ 2 n1@1 → c1 ClientResponse id=0x02 write 0x0103 c1@1 put b=2 ⇒ 3 n1@1 → c1 ClientResponse id=0x03 write 0x0104 c1@1 put c=3 ⇒ 4 ================================================ FILE: src/raft/testscripts/node/heartbeat_lost_append_single ================================================ # A heartbeat response triggers a resend of a lost append. cluster nodes=3 leader=1 --- n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2} n2@1 follower(n1) last=1@1 commit=1@1 applied=1 n3@1 follower(n1) last=1@1 commit=1@1 applied=1 # Partition the leader, submit a write whose appends are dropped, # then heal the partition again. partition 1 --- n1 ⇹ n2 n3 put 1 foo=bar --- c1@1 → n1 ClientRequest id=0x01 write 0x0103666f6f03626172 n1@1 append 2@1 put foo=bar n1@1 ⇥ n2 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶1̶@̶1̶ ̶[̶2̶@̶1̶]̶ n1@1 ⇥ n3 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶1̶@̶1̶ ̶[̶2̶@̶1̶]̶ heal --- n1 n2 n3 fully connected # The next heartbeat will result in match_index=0 since the followers # don't have the last_index. 
heartbeat 1 deliver --- n1@1 → n2 Heartbeat last_index=2 commit_index=1 read_seq=0 n1@1 → n3 Heartbeat last_index=2 commit_index=1 read_seq=0 n2@1 → n1 HeartbeatResponse match_index=0 read_seq=0 n3@1 → n1 HeartbeatResponse match_index=0 read_seq=0 # The leader has previously matched the followers at index 1. status 1 --- n1@1 leader last=2@1 commit=1@1 applied=1 progress={2:1→3 3:1→3} # When it receives the heartbeat response, instead of probing index 1 and then # sending the actual entries, it simply sends the entries. deliver --- n1@1 → n2 Append base=1@1 [2@1] n1@1 → n3 Append base=1@1 [2@1] # The followers accept the append and the leader commits and applies. stabilize --- n2@1 append 2@1 put foo=bar n2@1 → n1 AppendResponse match_index=2 n3@1 append 2@1 put foo=bar n3@1 → n1 AppendResponse match_index=2 n1@1 commit 2@1 n1@1 apply 2@1 put foo=bar n1@1 → c1 ClientResponse id=0x01 write 0x0102 c1@1 put foo=bar ⇒ 2 ================================================ FILE: src/raft/testscripts/node/heartbeat_lost_read ================================================ # Heartbeats will recover from a lost read message. cluster nodes=5 leader=1 --- n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2 4:1→2 5:1→2} n2@1 follower(n1) last=1@1 commit=1@1 applied=1 n3@1 follower(n1) last=1@1 commit=1@1 applied=1 n4@1 follower(n1) last=1@1 commit=1@1 applied=1 n5@1 follower(n1) last=1@1 commit=1@1 applied=1 # Write a key and replicate it. (put 1 foo=bar) (stabilize heartbeat=true) --- ok # Partition the leader, and submit a read. partition 1 --- n1 ⇹ n2 n3 n4 n5 get 1 foo --- c1@1 → n1 ClientRequest id=0x02 read 0x0003666f6f n1@1 ⇥ n2 R̶e̶a̶d̶ ̶s̶e̶q̶=̶1̶ n1@1 ⇥ n3 R̶e̶a̶d̶ ̶s̶e̶q̶=̶1̶ n1@1 ⇥ n4 R̶e̶a̶d̶ ̶s̶e̶q̶=̶1̶ n1@1 ⇥ n5 R̶e̶a̶d̶ ̶s̶e̶q̶=̶1̶ heal --- n1 n2 n3 n4 n5 fully connected # The next heartbeat will detect the failed read, and serve it when # it has a quorum. 
heartbeat 1 deliver --- n1@1 → n2 Heartbeat last_index=2 commit_index=2 read_seq=1 n1@1 → n3 Heartbeat last_index=2 commit_index=2 read_seq=1 n1@1 → n4 Heartbeat last_index=2 commit_index=2 read_seq=1 n1@1 → n5 Heartbeat last_index=2 commit_index=2 read_seq=1 n2@1 → n1 HeartbeatResponse match_index=2 read_seq=1 n3@1 → n1 HeartbeatResponse match_index=2 read_seq=1 n4@1 → n1 HeartbeatResponse match_index=2 read_seq=1 n5@1 → n1 HeartbeatResponse match_index=2 read_seq=1 # The first response does not provide quorum. deliver 1 from=2 --- ok # The second does, and the read is served. deliver 1 from=3 --- n1@1 → c1 ClientResponse id=0x02 read 0x000103626172 c1@1 get foo ⇒ bar ================================================ FILE: src/raft/testscripts/node/heartbeat_match_commits ================================================ # A heartbeat response can advance a follower match index and commit+apply. cluster nodes=3 leader=1 --- n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2} n2@1 follower(n1) last=1@1 commit=1@1 applied=1 n3@1 follower(n1) last=1@1 commit=1@1 applied=1 # Submit a write to the leader. put 1 foo=bar --- c1@1 → n1 ClientRequest id=0x01 write 0x0103666f6f03626172 n1@1 append 2@1 put foo=bar n1@1 → n2 Append base=1@1 [2@1] n1@1 → n3 Append base=1@1 [2@1] # Partition n1 away from the followers as they send the append acks, then heal # the partition. partition 1 --- n1 ⇹ n2 n3 stabilize --- n2@1 append 2@1 put foo=bar n2@1 ⇥ n1 A̶p̶p̶e̶n̶d̶R̶e̶s̶p̶o̶n̶s̶e̶ ̶m̶a̶t̶c̶h̶_̶i̶n̶d̶e̶x̶=̶2̶ n3@1 append 2@1 put foo=bar n3@1 ⇥ n1 A̶p̶p̶e̶n̶d̶R̶e̶s̶p̶o̶n̶s̶e̶ ̶m̶a̶t̶c̶h̶_̶i̶n̶d̶e̶x̶=̶2̶ heal --- n1 n2 n3 fully connected # The write has been replicated, but not yet committed and applied. status --- n1@1 leader last=2@1 commit=1@1 applied=1 progress={2:1→3 3:1→3} n2@1 follower(n1) last=2@1 commit=1@1 applied=1 n3@1 follower(n1) last=2@1 commit=1@1 applied=1 # The leader heartbeats. The followers confirm they are caught up. 
heartbeat 1 deliver --- n1@1 → n2 Heartbeat last_index=2 commit_index=1 read_seq=0 n1@1 → n3 Heartbeat last_index=2 commit_index=1 read_seq=0 n2@1 → n1 HeartbeatResponse match_index=2 read_seq=0 n3@1 → n1 HeartbeatResponse match_index=2 read_seq=0 # When the leader receives the first heartbeat response, it commits and applies # the write. deliver 1 from=2 --- n1@1 commit 2@1 n1@1 apply 2@1 put foo=bar n1@1 → c1 ClientResponse id=0x01 write 0x0102 c1@1 put foo=bar ⇒ 2 status 1 --- n1@1 leader last=2@1 commit=2@1 applied=2 progress={2:2→3 3:1→3} # Delivery of the second heartbeat response advances the match index, but # there is nothing more to do. deliver status --- n1@1 leader last=2@1 commit=2@1 applied=2 progress={2:2→3 3:2→3} n2@1 follower(n1) last=2@1 commit=1@1 applied=1 n3@1 follower(n1) last=2@1 commit=1@1 applied=1 ================================================ FILE: src/raft/testscripts/node/heartbeat_multiple_leaders_panic ================================================ # A heartbeat will panic if there are multiple leaders in a term. cluster nodes=3 leader=1 --- n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2} n2@1 follower(n1) last=1@1 commit=1@1 applied=1 n3@1 follower(n1) last=1@1 commit=1@1 applied=1 # Leader panics if it sees another leader in the same term. !step 1 '{"from":2, "to":1, "term":1, "message": {"Heartbeat":{"last_index":1,"commit_index":0, "commit_term":0, "read_seq":0}}}' --- Panic: saw other leader 2 in term 1 # Follower panics too. !step 2 '{"from":3, "to":2, "term":1, "message": {"Heartbeat":{"last_index":1,"commit_index":0, "commit_term":0, "read_seq":0}}}' --- Panic: assertion `left == right` failed: multiple leaders in term left: 3 right: 1 ================================================ FILE: src/raft/testscripts/node/heartbeat_old_commit_index ================================================ # A heartbeat with an old commit index is ignored by a follower. 
cluster nodes=3 leader=1 --- n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2} n2@1 follower(n1) last=1@1 commit=1@1 applied=1 n3@1 follower(n1) last=1@1 commit=1@1 applied=1 # Replicate a write. (put 1 foo=bar) (stabilize heartbeat=true) status --- n1@1 leader last=2@1 commit=2@1 applied=2 progress={2:2→3 3:2→3} n2@1 follower(n1) last=2@1 commit=2@1 applied=2 n3@1 follower(n1) last=2@1 commit=2@1 applied=2 # Step a heartbeat with an outdated commit index. step 2 '{"from":1, "to":2, "term":1, "message":{"Heartbeat":{"last_index":2,"commit_index":1,"commit_term":1,"read_seq":0}}}' stabilize --- n2@1 → n1 HeartbeatResponse match_index=2 read_seq=0 ================================================ FILE: src/raft/testscripts/node/heartbeat_old_last_index ================================================ # A heartbeat with an old last index is matched by a follower. cluster nodes=3 leader=1 --- n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2} n2@1 follower(n1) last=1@1 commit=1@1 applied=1 n3@1 follower(n1) last=1@1 commit=1@1 applied=1 # Replicate a write. (put 1 foo=bar) (stabilize heartbeat=true) status --- n1@1 leader last=2@1 commit=2@1 applied=2 progress={2:2→3 3:2→3} n2@1 follower(n1) last=2@1 commit=2@1 applied=2 n3@1 follower(n1) last=2@1 commit=2@1 applied=2 # Step a heartbeat with an outdated last index. step 2 '{"from":1, "to":2, "term":1, "message":{"Heartbeat":{"last_index":1,"commit_index":1,"commit_term":1,"read_seq":0}}}' stabilize --- n2@1 → n1 HeartbeatResponse match_index=1 read_seq=0 ================================================ FILE: src/raft/testscripts/node/heartbeat_probe_divergent ================================================ # A heartbeat while the leader is probing a follower with a long divergent tail # doesn't disrupt the probing, and won't result in a quadratically increasing # amount of probes with each heartbeat. 
cluster nodes=3 leader=1 --- n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2} n2@1 follower(n1) last=1@1 commit=1@1 applied=1 n3@1 follower(n1) last=1@1 commit=1@1 applied=1 # Make a couple of writes to ensure a common log prefix. (put 1 a=1) (put 1 b=2) (stabilize) status --- n1@1 leader last=3@1 commit=3@1 applied=3 progress={2:3→4 3:3→4} n2@1 follower(n1) last=3@1 commit=1@1 applied=1 n3@1 follower(n1) last=3@1 commit=1@1 applied=1 # Partition n1 partition 1 --- n1 ⇹ n2 n3 # Elect new leaders in the majority partition and replicate a few writes. (campaign 2) (stabilize) (put 2 c=3) (put 2 d=4) (stabilize heartbeat=true) status --- n1@1 leader last=3@1 commit=3@1 applied=3 progress={2:3→4 3:3→4} n2@2 leader last=6@2 commit=6@2 applied=6 progress={1:0→7 3:6→7} n3@2 follower(n2) last=6@2 commit=6@2 applied=6 (campaign 3) (stabilize) (put 2 e=5) (put 2 f=6) (stabilize heartbeat=true) status --- n1@1 leader last=3@1 commit=3@1 applied=3 progress={2:3→4 3:3→4} n2@3 follower(n3) last=9@3 commit=9@3 applied=9 n3@3 leader last=9@3 commit=9@3 applied=9 progress={1:0→10 2:9→10} # Propose writes in the minority partition as well, to build up a divergent log. (put 1 a=2) (put 1 a=3) (put 1 a=4) (put 1 a=5) (put 1 a=6) (put 1 a=7) (put 1 a=8) (put 1 a=9) (stabilize) status --- n1@1 leader last=11@1 commit=3@1 applied=3 progress={2:3→12 3:3→12} n2@3 follower(n3) last=9@3 commit=9@3 applied=9 n3@3 leader last=9@3 commit=9@3 applied=9 progress={1:0→10 2:9→10} # Heal the partition. heal --- n1 n2 n3 fully connected # Propose another write on the majority leader to start probing. put 3 g=7 --- c3@3 → n3 ClientRequest id=0x0f write 0x0101670137 n3@3 append 10@3 put g=7 n3@3 → n1 Append base=9@3 [10@3] n3@3 → n2 Append base=9@3 [10@3] # The append should be rejected by n1, canceling the writes. 
deliver 1 --- n1@1 leader ⇨ n1@3 follower(n3) n1@1 → c1 ClientResponse id=0x07 Error::Abort c1@1 put a=2 ⇒ Error::Abort (operation aborted) n1@1 → c1 ClientResponse id=0x08 Error::Abort c1@1 put a=3 ⇒ Error::Abort (operation aborted) n1@1 → c1 ClientResponse id=0x09 Error::Abort c1@1 put a=4 ⇒ Error::Abort (operation aborted) n1@1 → c1 ClientResponse id=0x0a Error::Abort c1@1 put a=5 ⇒ Error::Abort (operation aborted) n1@1 → c1 ClientResponse id=0x0b Error::Abort c1@1 put a=6 ⇒ Error::Abort (operation aborted) n1@1 → c1 ClientResponse id=0x0c Error::Abort c1@1 put a=7 ⇒ Error::Abort (operation aborted) n1@1 → c1 ClientResponse id=0x0d Error::Abort c1@1 put a=8 ⇒ Error::Abort (operation aborted) n1@1 → c1 ClientResponse id=0x0e Error::Abort c1@1 put a=9 ⇒ Error::Abort (operation aborted) n1@3 → n3 AppendResponse reject_index=9 # n3 begins probing, and also heartbeats. deliver 3 heartbeat 3 deliver 1 status 3 --- n3@3 → n1 Append base=8@3 [] n3@3 → n1 Heartbeat last_index=10 commit_index=9 read_seq=0 n3@3 → n2 Heartbeat last_index=10 commit_index=9 read_seq=0 n1@3 → n3 AppendResponse reject_index=8 n1@3 → n3 HeartbeatResponse match_index=0 read_seq=0 n3@3 leader last=10@3 commit=9@3 applied=9 progress={1:0→9 2:9→11} # n3 receives probe and heartbeat responses, resulting in duplicate # probes being sent at base index 7. deliver 3 status 3 --- n3@3 → n1 Append base=7@3 [] n3@3 → n1 Append base=7@3 [] n3@3 leader last=10@3 commit=9@3 applied=9 progress={1:0→8 2:9→11} deliver 1 --- n1@3 → n3 AppendResponse reject_index=7 n1@3 → n3 AppendResponse reject_index=7 # However, when receiving the duplicate probe responses, they are # deduplicated and only a single new probe is sent. deliver 3 --- n3@3 → n1 Append base=6@2 [] deliver 1 --- n1@3 → n3 AppendResponse reject_index=6 # n3 heartbeats again before sending the next probe. This results in # two probes: the heartbeat response resends the probe at base 5, while # the probe response triggers a new probe at base 4. 
heartbeat 3 deliver 3 --- n3@3 → n1 Heartbeat last_index=10 commit_index=9 read_seq=0 n3@3 → n2 Heartbeat last_index=10 commit_index=9 read_seq=0 n3@3 → n1 Append base=5@2 [] deliver 1 --- n1@3 → n3 HeartbeatResponse match_index=0 read_seq=0 n1@3 → n3 AppendResponse reject_index=5 deliver 3 --- n3@3 → n1 Append base=5@2 [] n3@3 → n1 Append base=4@2 [] deliver 1 --- n1@3 → n3 AppendResponse reject_index=5 n1@3 → n3 AppendResponse reject_index=4 # The probe response at reject_index=5 is ignored, since we've already probed # it. Only a single new probe is sent at base 3. deliver 3 --- n3@3 → n1 Append base=3@1 [] # When delivered, we finally get a match, and the follower gets caught up. deliver 1 --- n1@3 → n3 AppendResponse match_index=3 deliver 3 --- n3@3 → n1 Append base=3@1 [4@2 5@2 6@2 7@3 8@3 9@3 10@3] deliver 1 --- n1@3 append 4@2 None n1@3 append 5@2 put c=3 n1@3 append 6@2 put d=4 n1@3 append 7@3 None n1@3 append 8@3 put e=5 n1@3 append 9@3 put f=6 n1@3 append 10@3 put g=7 n1@3 → n3 AppendResponse match_index=10 deliver 3 --- n3@3 commit 10@3 n3@3 apply 10@3 put g=7 n3@3 → c3 ClientResponse id=0x0f write 0x010a c3@3 put g=7 ⇒ 10 ================================================ FILE: src/raft/testscripts/node/old_campaign_rejected ================================================ # Old campaign messages (in the same term) are ignored by leaders and followers # once a leader is elected. cluster nodes=3 --- n1@0 follower() last=0@0 commit=0@0 applied=0 n2@0 follower() last=0@0 commit=0@0 applied=0 n3@0 follower() last=0@0 commit=0@0 applied=0 # n1 and n2 campaign. campaign 1 2 --- n1@0 follower() ⇨ n1@1 candidate n1@1 → n2 Campaign last=0@0 n1@1 → n3 Campaign last=0@0 n2@0 follower() ⇨ n2@1 candidate n2@1 → n1 Campaign last=0@0 n2@1 → n3 Campaign last=0@0 # n3 receives n1's Campaign message and grants its vote. deliver 3 from=1 --- n3@0 follower() ⇨ n3@1 follower() n3@1 → n1 CampaignResponse vote=true # n1 becomes leader.
deliver 1 from=3 --- n1@1 candidate ⇨ n1@1 leader n1@1 append 1@1 None n1@1 → n2 Append base=0@0 [1@1] n1@1 → n3 Append base=0@0 [1@1] n1@1 → n2 Heartbeat last_index=1 commit_index=0 read_seq=0 n1@1 → n3 Heartbeat last_index=1 commit_index=0 read_seq=0 # n3 receives n1's heartbeat and becomes follower. deliver 3 from=1 --- n3@1 follower() ⇨ n3@1 follower(n1) n3@1 append 1@1 None n3@1 → n1 AppendResponse match_index=1 n3@1 → n1 HeartbeatResponse match_index=1 read_seq=0 status --- n1@1 leader last=1@1 commit=0@0 applied=0 progress={2:0→2 3:0→2} n2@1 candidate last=0@0 commit=0@0 applied=0 n3@1 follower(n1) last=1@1 commit=0@0 applied=0 # n1 and n3 receive n2's Campaign message and reject it. deliver 1 3 from=2 --- n1@1 → n2 CampaignResponse vote=false n3@1 → n2 CampaignResponse vote=false # Stabilizing the cluster results in everyone following n1. (stabilize heartbeat=true) status --- n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2} n2@1 follower(n1) last=1@1 commit=1@1 applied=1 n3@1 follower(n1) last=1@1 commit=1@1 applied=1 ================================================ FILE: src/raft/testscripts/node/old_campaign_response_ignored ================================================ # Old campaign responses (in the same term) are ignored by leaders and followers # once a leader is elected. cluster nodes=7 --- n1@0 follower() last=0@0 commit=0@0 applied=0 n2@0 follower() last=0@0 commit=0@0 applied=0 n3@0 follower() last=0@0 commit=0@0 applied=0 n4@0 follower() last=0@0 commit=0@0 applied=0 n5@0 follower() last=0@0 commit=0@0 applied=0 n6@0 follower() last=0@0 commit=0@0 applied=0 n7@0 follower() last=0@0 commit=0@0 applied=0 # n1 and n2 campaign. 
campaign 1 2 --- n1@0 follower() ⇨ n1@1 candidate n1@1 → n2 Campaign last=0@0 n1@1 → n3 Campaign last=0@0 n1@1 → n4 Campaign last=0@0 n1@1 → n5 Campaign last=0@0 n1@1 → n6 Campaign last=0@0 n1@1 → n7 Campaign last=0@0 n2@0 follower() ⇨ n2@1 candidate n2@1 → n1 Campaign last=0@0 n2@1 → n3 Campaign last=0@0 n2@1 → n4 Campaign last=0@0 n2@1 → n5 Campaign last=0@0 n2@1 → n6 Campaign last=0@0 n2@1 → n7 Campaign last=0@0 # n3-n6 vote for n1, n7 votes for n2. deliver 3 4 5 6 from=1 deliver 7 from=2 --- n3@0 follower() ⇨ n3@1 follower() n3@1 → n1 CampaignResponse vote=true n4@0 follower() ⇨ n4@1 follower() n4@1 → n1 CampaignResponse vote=true n5@0 follower() ⇨ n5@1 follower() n5@1 → n1 CampaignResponse vote=true n6@0 follower() ⇨ n6@1 follower() n6@1 → n1 CampaignResponse vote=true n7@0 follower() ⇨ n7@1 follower() n7@1 → n2 CampaignResponse vote=true # n1 receives votes from n3-n5 and assumes leadership. deliver 1 from=3 deliver 1 from=4 deliver 1 from=5 --- n1@1 candidate ⇨ n1@1 leader n1@1 append 1@1 None n1@1 → n2 Append base=0@0 [1@1] n1@1 → n3 Append base=0@0 [1@1] n1@1 → n4 Append base=0@0 [1@1] n1@1 → n5 Append base=0@0 [1@1] n1@1 → n6 Append base=0@0 [1@1] n1@1 → n7 Append base=0@0 [1@1] n1@1 → n2 Heartbeat last_index=1 commit_index=0 read_seq=0 n1@1 → n3 Heartbeat last_index=1 commit_index=0 read_seq=0 n1@1 → n4 Heartbeat last_index=1 commit_index=0 read_seq=0 n1@1 → n5 Heartbeat last_index=1 commit_index=0 read_seq=0 n1@1 → n6 Heartbeat last_index=1 commit_index=0 read_seq=0 n1@1 → n7 Heartbeat last_index=1 commit_index=0 read_seq=0 # n2 receives n1's heartbeats and becomes follower. deliver 2 from=1 --- n2@1 → n1 CampaignResponse vote=false n2@1 candidate ⇨ n2@1 follower(n1) n2@1 append 1@1 None n2@1 → n1 AppendResponse match_index=1 n2@1 → n1 HeartbeatResponse match_index=1 read_seq=0 # n1 (leader) receives n6's vote and ignores it. n2 (follower) receives n7's # vote and ignores it. They remain leader and follower. 
deliver 1 from=6 deliver 2 from=7 status --- n1@1 leader last=1@1 commit=0@0 applied=0 progress={2:0→2 3:0→2 4:0→2 5:0→2 6:0→2 7:0→2} n2@1 follower(n1) last=1@1 commit=0@0 applied=0 n3@1 follower() last=0@0 commit=0@0 applied=0 n4@1 follower() last=0@0 commit=0@0 applied=0 n5@1 follower() last=0@0 commit=0@0 applied=0 n6@1 follower() last=0@0 commit=0@0 applied=0 n7@1 follower() last=0@0 commit=0@0 applied=0 # Stabilizing the cluster results in everyone following n1. (stabilize heartbeat=true) status --- n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2 4:1→2 5:1→2 6:1→2 7:1→2} n2@1 follower(n1) last=1@1 commit=1@1 applied=1 n3@1 follower(n1) last=1@1 commit=1@1 applied=1 n4@1 follower(n1) last=1@1 commit=1@1 applied=1 n5@1 follower(n1) last=1@1 commit=1@1 applied=1 n6@1 follower(n1) last=1@1 commit=1@1 applied=1 n7@1 follower(n1) last=1@1 commit=1@1 applied=1 ================================================ FILE: src/raft/testscripts/node/old_heartbeat_ignored ================================================ # A heartbeat from an old leader should be ignored. # Make n3 leader. cluster nodes=3 leader=3 --- n1@1 follower(n3) last=1@1 commit=1@1 applied=1 n2@1 follower(n3) last=1@1 commit=1@1 applied=1 n3@1 leader last=1@1 commit=1@1 applied=1 progress={1:1→2 2:1→2} # Partition n3 away from the cluster. partition 3 --- n3 ⇹ n1 n2 # Elect n1 as a new leader. (campaign 1) (stabilize heartbeat=true) status --- n1@2 leader last=2@2 commit=2@2 applied=2 progress={2:2→3 3:0→3} n2@2 follower(n1) last=2@2 commit=2@2 applied=2 n3@1 leader last=1@1 commit=1@1 applied=1 progress={1:1→2 2:1→2} # Heal the partition. heal --- n1 n2 n3 fully connected # The next heartbeat from n3 is ignored. 
heartbeat 3 stabilize --- n3@1 → n1 Heartbeat last_index=1 commit_index=1 read_seq=0 n3@1 → n2 Heartbeat last_index=1 commit_index=1 read_seq=0 status --- n1@2 leader last=2@2 commit=2@2 applied=2 progress={2:2→3 3:0→3} n2@2 follower(n1) last=2@2 commit=2@2 applied=2 n3@1 leader last=1@1 commit=1@1 applied=1 progress={1:1→2 2:1→2} ================================================ FILE: src/raft/testscripts/node/request_candidate_abort ================================================ # Client read/write requests fail on candidates. cluster nodes=3 --- n1@0 follower() last=0@0 commit=0@0 applied=0 n2@0 follower() last=0@0 commit=0@0 applied=0 n3@0 follower() last=0@0 commit=0@0 applied=0 # n1 campaigns. campaign 1 --- n1@0 follower() ⇨ n1@1 candidate n1@1 → n2 Campaign last=0@0 n1@1 → n3 Campaign last=0@0 # A read request on n1 should be rejected. get 1 foo --- c1@1 → n1 ClientRequest id=0x01 read 0x0003666f6f n1@1 → c1 ClientResponse id=0x01 Error::Abort c1@1 get foo ⇒ Error::Abort (operation aborted) # A write request on n1 should be rejected. put 1 foo=bar --- c1@1 → n1 ClientRequest id=0x02 write 0x0103666f6f03626172 n1@1 → c1 ClientResponse id=0x02 Error::Abort c1@1 put foo=bar ⇒ Error::Abort (operation aborted) ================================================ FILE: src/raft/testscripts/node/request_follower ================================================ # Client read/write requests are proxied by followers. cluster nodes=3 leader=1 --- n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2} n2@1 follower(n1) last=1@1 commit=1@1 applied=1 n3@1 follower(n1) last=1@1 commit=1@1 applied=1 # An initial get on a follower yields None. 
get 2 foo stabilize --- c2@1 → n2 ClientRequest id=0x01 read 0x0003666f6f n2@1 → n1 ClientRequest id=0x01 read 0x0003666f6f n1@1 → n2 Read seq=1 n1@1 → n3 Read seq=1 n2@1 → n1 ReadResponse seq=1 n3@1 → n1 ReadResponse seq=1 n1@1 → n2 ClientResponse id=0x01 read 0x0000 n2@1 → c2 ClientResponse id=0x01 read 0x0000 c2@1 get foo ⇒ None # Write a value on the follower. put 2 foo=bar stabilize (stabilize heartbeat=true) --- c2@1 → n2 ClientRequest id=0x02 write 0x0103666f6f03626172 n2@1 → n1 ClientRequest id=0x02 write 0x0103666f6f03626172 n1@1 append 2@1 put foo=bar n1@1 → n2 Append base=1@1 [2@1] n1@1 → n3 Append base=1@1 [2@1] n2@1 append 2@1 put foo=bar n2@1 → n1 AppendResponse match_index=2 n3@1 append 2@1 put foo=bar n3@1 → n1 AppendResponse match_index=2 n1@1 commit 2@1 n1@1 apply 2@1 put foo=bar n1@1 → n2 ClientResponse id=0x02 write 0x0102 n2@1 → c2 ClientResponse id=0x02 write 0x0102 c2@1 put foo=bar ⇒ 2 # Read the value back on the follower. get 2 foo stabilize --- c2@1 → n2 ClientRequest id=0x03 read 0x0003666f6f n2@1 → n1 ClientRequest id=0x03 read 0x0003666f6f n1@1 → n2 Read seq=2 n1@1 → n3 Read seq=2 n2@1 → n1 ReadResponse seq=2 n3@1 → n1 ReadResponse seq=2 n1@1 → n2 ClientResponse id=0x03 read 0x000103626172 n2@1 → c2 ClientResponse id=0x03 read 0x000103626172 c2@1 get foo ⇒ bar ================================================ FILE: src/raft/testscripts/node/request_follower_campaign_abort ================================================ # A follower aborts in-flight requests when it steps down. cluster nodes=3 leader=1 --- n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2} n2@1 follower(n1) last=1@1 commit=1@1 applied=1 n3@1 follower(n1) last=1@1 commit=1@1 applied=1 # Submit a read and write on n2. 
put 2 foo=bar get 2 foo --- c2@1 → n2 ClientRequest id=0x01 write 0x0103666f6f03626172 n2@1 → n1 ClientRequest id=0x01 write 0x0103666f6f03626172 c2@1 → n2 ClientRequest id=0x02 read 0x0003666f6f n2@1 → n1 ClientRequest id=0x02 read 0x0003666f6f # n3 campaigns before n2's requests achieve quorum. campaign 3 --- n3@1 follower(n1) ⇨ n3@2 candidate n3@2 → n1 Campaign last=1@1 n3@2 → n2 Campaign last=1@1 # When n2 receives the campaign message, the requests are aborted. deliver 2 from=3 --- n2@1 follower(n1) ⇨ n2@2 follower() n2@1 → c2 ClientResponse id=0x01 Error::Abort c2@1 put foo=bar ⇒ Error::Abort (operation aborted) n2@1 → c2 ClientResponse id=0x02 Error::Abort c2@1 get foo ⇒ Error::Abort (operation aborted) n2@2 → n3 CampaignResponse vote=true ================================================ FILE: src/raft/testscripts/node/request_follower_disconnect_stall ================================================ # Client read/write requests stall if the follower is disconnected from the # leader when the request is submitted. They are not retried, nor aborted. cluster nodes=3 leader=1 --- n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2} n2@1 follower(n1) last=1@1 commit=1@1 applied=1 n3@1 follower(n1) last=1@1 commit=1@1 applied=1 # Partition n3 away from the cluster. partition 3 --- n3 ⇹ n1 n2 # Submit write and read requests to n3. They don't return a result. put 3 foo=bar get 3 foo stabilize --- c3@1 → n3 ClientRequest id=0x01 write 0x0103666f6f03626172 n3@1 ⇥ n1 C̶l̶i̶e̶n̶t̶R̶e̶q̶u̶e̶s̶t̶ ̶i̶d̶=̶0̶x̶0̶1̶ ̶w̶r̶i̶t̶e̶ ̶0̶x̶0̶1̶0̶3̶6̶6̶6̶f̶6̶f̶0̶3̶6̶2̶6̶1̶7̶2̶ c3@1 → n3 ClientRequest id=0x02 read 0x0003666f6f n3@1 ⇥ n1 C̶l̶i̶e̶n̶t̶R̶e̶q̶u̶e̶s̶t̶ ̶i̶d̶=̶0̶x̶0̶2̶ ̶r̶e̶a̶d̶ ̶0̶x̶0̶0̶0̶3̶6̶6̶6̶f̶6̶f̶ # Heal the partition and heartbeat. The requests still don't return a result. 
heal --- n1 n2 n3 fully connected stabilize heartbeat=true --- n1@1 → n2 Heartbeat last_index=1 commit_index=1 read_seq=0 n1@1 → n3 Heartbeat last_index=1 commit_index=1 read_seq=0 n2@1 → n1 HeartbeatResponse match_index=1 read_seq=0 n3@1 → n1 HeartbeatResponse match_index=1 read_seq=0 ================================================ FILE: src/raft/testscripts/node/request_follower_leaderless_abort ================================================ # Client read/write requests fail on leaderless followers. cluster nodes=3 --- n1@0 follower() last=0@0 commit=0@0 applied=0 n2@0 follower() last=0@0 commit=0@0 applied=0 n3@0 follower() last=0@0 commit=0@0 applied=0 # A read request on n1 should be rejected. get 1 foo --- c1@0 → n1 ClientRequest id=0x01 read 0x0003666f6f n1@0 → c1 ClientResponse id=0x01 Error::Abort c1@0 get foo ⇒ Error::Abort (operation aborted) # A write request on n1 should be rejected. put 1 foo=bar --- c1@0 → n1 ClientRequest id=0x02 write 0x0103666f6f03626172 n1@0 → c1 ClientResponse id=0x02 Error::Abort c1@0 put foo=bar ⇒ Error::Abort (operation aborted) ================================================ FILE: src/raft/testscripts/node/request_leader ================================================ # Client read/write requests succeed on leaders. cluster nodes=3 leader=1 --- n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2} n2@1 follower(n1) last=1@1 commit=1@1 applied=1 n3@1 follower(n1) last=1@1 commit=1@1 applied=1 # An initial get on the leader yields None. get 1 foo stabilize --- c1@1 → n1 ClientRequest id=0x01 read 0x0003666f6f n1@1 → n2 Read seq=1 n1@1 → n3 Read seq=1 n2@1 → n1 ReadResponse seq=1 n3@1 → n1 ReadResponse seq=1 n1@1 → c1 ClientResponse id=0x01 read 0x0000 c1@1 get foo ⇒ None # Write a value on the leader. 
put 1 foo=bar stabilize (stabilize heartbeat=true) --- c1@1 → n1 ClientRequest id=0x02 write 0x0103666f6f03626172 n1@1 append 2@1 put foo=bar n1@1 → n2 Append base=1@1 [2@1] n1@1 → n3 Append base=1@1 [2@1] n2@1 append 2@1 put foo=bar n2@1 → n1 AppendResponse match_index=2 n3@1 append 2@1 put foo=bar n3@1 → n1 AppendResponse match_index=2 n1@1 commit 2@1 n1@1 apply 2@1 put foo=bar n1@1 → c1 ClientResponse id=0x02 write 0x0102 c1@1 put foo=bar ⇒ 2 # Read the value back on the leader. get 1 foo stabilize --- c1@1 → n1 ClientRequest id=0x03 read 0x0003666f6f n1@1 → n2 Read seq=2 n1@1 → n3 Read seq=2 n2@1 → n1 ReadResponse seq=2 n3@1 → n1 ReadResponse seq=2 n1@1 → c1 ClientResponse id=0x03 read 0x000103626172 c1@1 get foo ⇒ bar ================================================ FILE: src/raft/testscripts/node/request_leader_campaign_abort ================================================ # A leader aborts in-flight requests when it steps down. cluster nodes=3 leader=1 --- n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2} n2@1 follower(n1) last=1@1 commit=1@1 applied=1 n3@1 follower(n1) last=1@1 commit=1@1 applied=1 # Submit a read and write on n1. put 1 foo=bar get 1 foo --- c1@1 → n1 ClientRequest id=0x01 write 0x0103666f6f03626172 n1@1 append 2@1 put foo=bar n1@1 → n2 Append base=1@1 [2@1] n1@1 → n3 Append base=1@1 [2@1] c1@1 → n1 ClientRequest id=0x02 read 0x0003666f6f n1@1 → n2 Read seq=1 n1@1 → n3 Read seq=1 # n2 campaigns before n1's requests achieve quorum. campaign 2 --- n2@1 follower(n1) ⇨ n2@2 candidate n2@2 → n1 Campaign last=1@1 n2@2 → n3 Campaign last=1@1 # When n1 receives the campaign message, the requests are aborted. 
deliver 1 from=2 --- n1@1 leader ⇨ n1@2 follower() n1@1 → c1 ClientResponse id=0x01 Error::Abort c1@1 put foo=bar ⇒ Error::Abort (operation aborted) n1@1 → c1 ClientResponse id=0x02 Error::Abort c1@1 get foo ⇒ Error::Abort (operation aborted) n1@2 → n2 CampaignResponse vote=false ================================================ FILE: src/raft/testscripts/node/request_leader_change_linearizability ================================================ # A new leader that's behind on commit/apply shouldn't serve stale reads. cluster nodes=3 leader=1 --- n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2} n2@1 follower(n1) last=1@1 commit=1@1 applied=1 n3@1 follower(n1) last=1@1 commit=1@1 applied=1 # Write an initial value, and propagate the commit index. (put 1 a=1) (stabilize heartbeat=true) status --- n1@1 leader last=2@1 commit=2@1 applied=2 progress={2:2→3 3:2→3} n2@1 follower(n1) last=2@1 commit=2@1 applied=2 n3@1 follower(n1) last=2@1 commit=2@1 applied=2 # Write another value, but don't propagate the commit index. (put 1 b=2) (stabilize) status --- n1@1 leader last=3@1 commit=3@1 applied=3 progress={2:3→4 3:3→4} n2@1 follower(n1) last=3@1 commit=2@1 applied=2 n3@1 follower(n1) last=3@1 commit=2@1 applied=2 # n2 now campaigns and wins, while being behind on commit/apply. campaign 2 deliver --- n2@1 follower(n1) ⇨ n2@2 candidate n2@2 → n1 Campaign last=3@1 n2@2 → n3 Campaign last=3@1 n1@1 leader ⇨ n1@2 follower() n1@2 → n2 CampaignResponse vote=true n3@1 follower(n1) ⇨ n3@2 follower() n3@2 → n2 CampaignResponse vote=true # The initial append doesn't make it to the followers, so its commit index # trails the previous leader. 
partition 2 deliver 2 --- n2 ⇹ n1 n3 n2@2 candidate ⇨ n2@2 leader n2@2 append 4@2 None n2@2 ⇥ n1 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶3̶@̶1̶ ̶[̶4̶@̶2̶]̶ n2@2 ⇥ n3 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶3̶@̶1̶ ̶[̶4̶@̶2̶]̶ n2@2 ⇥ n1 H̶e̶a̶r̶t̶b̶e̶a̶t̶ ̶l̶a̶s̶t̶_̶i̶n̶d̶e̶x̶=̶4̶ ̶c̶o̶m̶m̶i̶t̶_̶i̶n̶d̶e̶x̶=̶2̶ ̶r̶e̶a̶d̶_̶s̶e̶q̶=̶0̶ n2@2 ⇥ n3 H̶e̶a̶r̶t̶b̶e̶a̶t̶ ̶l̶a̶s̶t̶_̶i̶n̶d̶e̶x̶=̶4̶ ̶c̶o̶m̶m̶i̶t̶_̶i̶n̶d̶e̶x̶=̶2̶ ̶r̶e̶a̶d̶_̶s̶e̶q̶=̶0̶ heal status --- n1 n2 n3 fully connected n1@2 follower() last=3@1 commit=3@1 applied=3 n2@2 leader last=4@2 commit=2@1 applied=2 progress={1:0→5 3:0→5} n3@2 follower() last=3@1 commit=2@1 applied=2 # Reading from n2 should not result in a stale read even if followers # confirm the read sequence. get 2 b deliver deliver --- c2@2 → n2 ClientRequest id=0x03 read 0x000162 n2@2 → n1 Read seq=1 n2@2 → n3 Read seq=1 n1@2 follower() ⇨ n1@2 follower(n2) n1@2 → n2 ReadResponse seq=1 n3@2 follower() ⇨ n3@2 follower(n2) n3@2 → n2 ReadResponse seq=1 # The leader heartbeats and detects the lost appends. heartbeat 2 deliver deliver deliver --- n2@2 → n1 Heartbeat last_index=4 commit_index=2 read_seq=1 n2@2 → n3 Heartbeat last_index=4 commit_index=2 read_seq=1 n1@2 → n2 HeartbeatResponse match_index=0 read_seq=1 n3@2 → n2 HeartbeatResponse match_index=0 read_seq=1 n2@2 → n1 Append base=3@1 [] n2@2 → n3 Append base=3@1 [] n1@2 → n2 AppendResponse match_index=3 n3@2 → n2 AppendResponse match_index=3 # It resends the missing log entry. deliver deliver --- n2@2 → n1 Append base=3@1 [4@2] n2@2 → n3 Append base=3@1 [4@2] n1@2 append 4@2 None n1@2 → n2 AppendResponse match_index=4 n3@2 append 4@2 None n3@2 → n2 AppendResponse match_index=4 # Once the leader receives the acks it commits the entry. The read can now be # served, resulting in an up-to-date b=2. 
stabilize --- n2@2 commit 4@2 n2@2 apply 3@1 put b=2 n2@2 apply 4@2 None n2@2 → c2 ClientResponse id=0x03 read 0x00010132 c2@2 get b ⇒ 2 ================================================ FILE: src/raft/testscripts/node/request_leader_disconnect ================================================ # Client read/write requests succeed if the leader is disconnected from the # quorum when the request is submitted but it later reconnects. cluster nodes=3 leader=1 --- n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2} n2@1 follower(n1) last=1@1 commit=1@1 applied=1 n3@1 follower(n1) last=1@1 commit=1@1 applied=1 # Partition n1 away from the cluster. partition 1 --- n1 ⇹ n2 n3 # Submit write and read requests to n1. They don't return a result. put 1 foo=bar get 1 foo stabilize --- c1@1 → n1 ClientRequest id=0x01 write 0x0103666f6f03626172 n1@1 append 2@1 put foo=bar n1@1 ⇥ n2 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶1̶@̶1̶ ̶[̶2̶@̶1̶]̶ n1@1 ⇥ n3 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶1̶@̶1̶ ̶[̶2̶@̶1̶]̶ c1@1 → n1 ClientRequest id=0x02 read 0x0003666f6f n1@1 ⇥ n2 R̶e̶a̶d̶ ̶s̶e̶q̶=̶1̶ n1@1 ⇥ n3 R̶e̶a̶d̶ ̶s̶e̶q̶=̶1̶ # Heal the partition and heartbeat. The requests eventually return results. 
heal --- n1 n2 n3 fully connected stabilize heartbeat=true --- n1@1 → n2 Heartbeat last_index=2 commit_index=1 read_seq=1 n1@1 → n3 Heartbeat last_index=2 commit_index=1 read_seq=1 n2@1 → n1 HeartbeatResponse match_index=0 read_seq=1 n3@1 → n1 HeartbeatResponse match_index=0 read_seq=1 n1@1 → c1 ClientResponse id=0x02 read 0x0000 c1@1 get foo ⇒ None n1@1 → n2 Append base=1@1 [2@1] n1@1 → n3 Append base=1@1 [2@1] n2@1 append 2@1 put foo=bar n2@1 → n1 AppendResponse match_index=2 n3@1 append 2@1 put foo=bar n3@1 → n1 AppendResponse match_index=2 n1@1 commit 2@1 n1@1 apply 2@1 put foo=bar n1@1 → c1 ClientResponse id=0x01 write 0x0102 c1@1 put foo=bar ⇒ 2 ================================================ FILE: src/raft/testscripts/node/request_leader_read_quorum ================================================ # Client read requests are only processed once a quorum confirms the read sequence. cluster nodes=5 leader=1 --- n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2 4:1→2 5:1→2} n2@1 follower(n1) last=1@1 commit=1@1 applied=1 n3@1 follower(n1) last=1@1 commit=1@1 applied=1 n4@1 follower(n1) last=1@1 commit=1@1 applied=1 n5@1 follower(n1) last=1@1 commit=1@1 applied=1 # Write foo=bar. (put 1 foo=bar) (stabilize heartbeat=true) --- ok # Read it once. (get 1 foo) (stabilize) --- ok # Attempt to read it again. The read only returns once a quorum have # confirmed the read sequence. 
get 1 foo --- c1@1 → n1 ClientRequest id=0x03 read 0x0003666f6f n1@1 → n2 Read seq=2 n1@1 → n3 Read seq=2 n1@1 → n4 Read seq=2 n1@1 → n5 Read seq=2 deliver 2 deliver 1 --- n2@1 → n1 ReadResponse seq=2 deliver 3 deliver 1 --- n3@1 → n1 ReadResponse seq=2 n1@1 → c1 ClientResponse id=0x03 read 0x000103626172 c1@1 get foo ⇒ bar (stabilize) --- ok ================================================ FILE: src/raft/testscripts/node/request_leader_read_quorum_sequence ================================================ # Client read requests are only served once a quorum confirm the read sequence # number, including higher sequence numbers. cluster nodes=5 leader=1 --- n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2 4:1→2 5:1→2} n2@1 follower(n1) last=1@1 commit=1@1 applied=1 n3@1 follower(n1) last=1@1 commit=1@1 applied=1 n4@1 follower(n1) last=1@1 commit=1@1 applied=1 n5@1 follower(n1) last=1@1 commit=1@1 applied=1 # Write foo=bar and read it back. (put 1 foo=bar) (stabilize heartbeat=true) (get 1 foo) (stabilize) --- ok # Send a heartbeat with sequence number 1, and deliver it to all followers. heartbeat 1 deliver --- n1@1 → n2 Heartbeat last_index=2 commit_index=2 read_seq=1 n1@1 → n3 Heartbeat last_index=2 commit_index=2 read_seq=1 n1@1 → n4 Heartbeat last_index=2 commit_index=2 read_seq=1 n1@1 → n5 Heartbeat last_index=2 commit_index=2 read_seq=1 n2@1 → n1 HeartbeatResponse match_index=2 read_seq=1 n3@1 → n1 HeartbeatResponse match_index=2 read_seq=1 n4@1 → n1 HeartbeatResponse match_index=2 read_seq=1 n5@1 → n1 HeartbeatResponse match_index=2 read_seq=1 # Partition n1 away. partition 1 --- n1 ⇹ n2 n3 n4 n5 # Perform a read at sequence number 2. The read messages are lost. get 1 foo --- c1@1 → n1 ClientRequest id=0x03 read 0x0003666f6f n1@1 ⇥ n2 R̶e̶a̶d̶ ̶s̶e̶q̶=̶2̶ n1@1 ⇥ n3 R̶e̶a̶d̶ ̶s̶e̶q̶=̶2̶ n1@1 ⇥ n4 R̶e̶a̶d̶ ̶s̶e̶q̶=̶2̶ n1@1 ⇥ n5 R̶e̶a̶d̶ ̶s̶e̶q̶=̶2̶ # Deliver the heartbeat responses at sequence number 1. 
These should not satisfy # the read at sequence number 2. deliver 1 --- ok # Heal the partition and perform another read at sequence number 3. Followers # respond to the reads at sequence number 3. heal get 1 foo --- n1 n2 n3 n4 n5 fully connected c1@1 → n1 ClientRequest id=0x04 read 0x0003666f6f n1@1 → n2 Read seq=3 n1@1 → n3 Read seq=3 n1@1 → n4 Read seq=3 n1@1 → n5 Read seq=3 deliver --- n2@1 → n1 ReadResponse seq=3 n3@1 → n1 ReadResponse seq=3 n4@1 → n1 ReadResponse seq=3 n5@1 → n1 ReadResponse seq=3 # Once n1 receives two responses it has a read quorum and serves both the read # at seqnums 2 (id=0x03) and 3 (id=0x04). deliver 1 from=3 deliver 1 from=5 --- n1@1 → c1 ClientResponse id=0x03 read 0x000103626172 c1@1 get foo ⇒ bar n1@1 → c1 ClientResponse id=0x04 read 0x000103626172 c1@1 get foo ⇒ bar ================================================ FILE: src/raft/testscripts/node/request_leader_single ================================================ # Client read/write requests succeed on a lone leader. cluster nodes=1 --- n1@1 leader last=1@1 commit=1@1 applied=1 progress={} # An initial get on the leader yields None. get 1 foo stabilize --- c1@1 → n1 ClientRequest id=0x01 read 0x0003666f6f n1@1 → c1 ClientResponse id=0x01 read 0x0000 c1@1 get foo ⇒ None # Write a value on the leader. put 1 foo=bar stabilize heartbeat=true --- c1@1 → n1 ClientRequest id=0x02 write 0x0103666f6f03626172 n1@1 append 2@1 put foo=bar n1@1 commit 2@1 n1@1 apply 2@1 put foo=bar n1@1 → c1 ClientResponse id=0x02 write 0x0102 c1@1 put foo=bar ⇒ 2 # Read the value back on the leader. get 1 foo stabilize --- c1@1 → n1 ClientRequest id=0x03 read 0x0003666f6f n1@1 → c1 ClientResponse id=0x03 read 0x000103626172 c1@1 get foo ⇒ bar ================================================ FILE: src/raft/testscripts/node/request_status ================================================ # Status requests return the cluster status. 
cluster nodes=3 leader=1 --- n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2} n2@1 follower(n1) last=1@1 commit=1@1 applied=1 n3@1 follower(n1) last=1@1 commit=1@1 applied=1 # Partition away n3, so not all nodes have the same log position. partition 3 --- n3 ⇹ n1 n2 # Replicate a write, but not the commit index. (put 1 foo=bar) (stabilize) status --- n1@1 leader last=2@1 commit=2@1 applied=2 progress={2:2→3 3:1→3} n2@1 follower(n1) last=2@1 commit=1@1 applied=1 n3@1 follower(n1) last=1@1 commit=1@1 applied=1 # Run a status request on the leader. status request=true 1 stabilize --- c1@1 → n1 ClientRequest id=0x02 status n1@1 → c1 ClientResponse id=0x02 status Status { leader: 1, term: 1, match_index: {1: 2, 2: 2, 3: 1}, commit_index: 2, applied_index: 2, storage: Status { name: "bitcask", keys: 4, size: 41, disk_size: 84, live_disk_size: 73 } } c1@1 status ⇒ Status { leader: 1, term: 1, match_index: { 1: 2, 2: 2, 3: 1, }, commit_index: 2, applied_index: 2, storage: Status { name: "bitcask", keys: 4, size: 41, disk_size: 84, live_disk_size: 73, }, } # Run a status request on a follower. 
status request=true 2 stabilize --- c2@1 → n2 ClientRequest id=0x03 status n2@1 → n1 ClientRequest id=0x03 status n1@1 → n2 ClientResponse id=0x03 status Status { leader: 1, term: 1, match_index: {1: 2, 2: 2, 3: 1}, commit_index: 2, applied_index: 2, storage: Status { name: "bitcask", keys: 4, size: 41, disk_size: 84, live_disk_size: 73 } } n2@1 → c2 ClientResponse id=0x03 status Status { leader: 1, term: 1, match_index: {1: 2, 2: 2, 3: 1}, commit_index: 2, applied_index: 2, storage: Status { name: "bitcask", keys: 4, size: 41, disk_size: 84, live_disk_size: 73 } } c2@1 status ⇒ Status { leader: 1, term: 1, match_index: { 1: 2, 2: 2, 3: 1, }, commit_index: 2, applied_index: 2, storage: Status { name: "bitcask", keys: 4, size: 41, disk_size: 84, live_disk_size: 73, }, } ================================================ FILE: src/raft/testscripts/node/request_status_single ================================================ # Status requests return the cluster status on a single node. cluster nodes=1 --- n1@1 leader last=1@1 commit=1@1 applied=1 progress={} # Perform a write. (put 1 foo=bar) (stabilize) status --- n1@1 leader last=2@1 commit=2@1 applied=2 progress={} # Run a status request on the leader. status request=true 1 stabilize --- c1@1 → n1 ClientRequest id=0x02 status n1@1 → c1 ClientResponse id=0x02 status Status { leader: 1, term: 1, match_index: {1: 2}, commit_index: 2, applied_index: 2, storage: Status { name: "bitcask", keys: 4, size: 41, disk_size: 84, live_disk_size: 73 } } c1@1 status ⇒ Status { leader: 1, term: 1, match_index: { 1: 2, }, commit_index: 2, applied_index: 2, storage: Status { name: "bitcask", keys: 4, size: 41, disk_size: 84, live_disk_size: 73, }, } ================================================ FILE: src/raft/testscripts/node/restart ================================================ # Restarting a cluster that's fully caught up retains the existing state and # allows trivially electing a new leader. 
cluster nodes=3 leader=1 --- n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2} n2@1 follower(n1) last=1@1 commit=1@1 applied=1 n3@1 follower(n1) last=1@1 commit=1@1 applied=1 # Replicate a couple of writes. (put 1 a=1) (put 1 b=2) (stabilize heartbeat=true) --- ok # Dump the current status, log, and state. status --- n1@1 leader last=3@1 commit=3@1 applied=3 progress={2:3→4 3:3→4} n2@1 follower(n1) last=3@1 commit=3@1 applied=3 n3@1 follower(n1) last=3@1 commit=3@1 applied=3 log --- n1@1 term=1 last=3@1 commit=3@1 vote=Some(1) n1@1 entry 1@1 None n1@1 entry 2@1 put a=1 n1@1 entry 3@1 put b=2 n2@1 term=1 last=3@1 commit=3@1 vote=Some(1) n2@1 entry 1@1 None n2@1 entry 2@1 put a=1 n2@1 entry 3@1 put b=2 n3@1 term=1 last=3@1 commit=3@1 vote=Some(1) n3@1 entry 1@1 None n3@1 entry 2@1 put a=1 n3@1 entry 3@1 put b=2 state --- n1@1 applied=3 n1@1 state a=1 n1@1 state b=2 n2@1 applied=3 n2@1 state a=1 n2@1 state b=2 n3@1 applied=3 n3@1 state a=1 n3@1 state b=2 # Restart the nodes. They retain the same status, logs, and state. restart --- n1@1 follower() last=3@1 commit=3@1 applied=3 n2@1 follower() last=3@1 commit=3@1 applied=3 n3@1 follower() last=3@1 commit=3@1 applied=3 log --- n1@1 term=1 last=3@1 commit=3@1 vote=Some(1) n1@1 entry 1@1 None n1@1 entry 2@1 put a=1 n1@1 entry 3@1 put b=2 n2@1 term=1 last=3@1 commit=3@1 vote=Some(1) n2@1 entry 1@1 None n2@1 entry 2@1 put a=1 n2@1 entry 3@1 put b=2 n3@1 term=1 last=3@1 commit=3@1 vote=Some(1) n3@1 entry 1@1 None n3@1 entry 2@1 put a=1 n3@1 entry 3@1 put b=2 state --- n1@1 applied=3 n1@1 state a=1 n1@1 state b=2 n2@1 applied=3 n2@1 state a=1 n2@1 state b=2 n3@1 applied=3 n3@1 state a=1 n3@1 state b=2 # Elect a new leader. 
campaign 3 stabilize heartbeat=true --- n3@1 follower() ⇨ n3@2 candidate n3@2 → n1 Campaign last=3@1 n3@2 → n2 Campaign last=3@1 n1@1 follower() ⇨ n1@2 follower() n1@2 → n3 CampaignResponse vote=true n2@1 follower() ⇨ n2@2 follower() n2@2 → n3 CampaignResponse vote=true n3@2 candidate ⇨ n3@2 leader n3@2 append 4@2 None n3@2 → n1 Append base=3@1 [4@2] n3@2 → n2 Append base=3@1 [4@2] n3@2 → n1 Heartbeat last_index=4 commit_index=3 read_seq=0 n3@2 → n2 Heartbeat last_index=4 commit_index=3 read_seq=0 n1@2 follower() ⇨ n1@2 follower(n3) n1@2 append 4@2 None n1@2 → n3 AppendResponse match_index=4 n1@2 → n3 HeartbeatResponse match_index=4 read_seq=0 n2@2 follower() ⇨ n2@2 follower(n3) n2@2 append 4@2 None n2@2 → n3 AppendResponse match_index=4 n2@2 → n3 HeartbeatResponse match_index=4 read_seq=0 n3@2 commit 4@2 n3@2 → n1 Heartbeat last_index=4 commit_index=4 read_seq=0 n3@2 → n2 Heartbeat last_index=4 commit_index=4 read_seq=0 n1@2 commit 4@2 n1@2 → n3 HeartbeatResponse match_index=4 read_seq=0 n2@2 commit 4@2 n2@2 → n3 HeartbeatResponse match_index=4 read_seq=0 status --- n1@2 follower(n3) last=4@2 commit=4@2 applied=4 n2@2 follower(n3) last=4@2 commit=4@2 applied=4 n3@2 leader last=4@2 commit=4@2 applied=4 progress={1:4→5 2:4→5} ================================================ FILE: src/raft/testscripts/node/restart_apply ================================================ # Restarting a node and wiping its state machine will reapply the state. cluster nodes=3 leader=1 --- n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2} n2@1 follower(n1) last=1@1 commit=1@1 applied=1 n3@1 follower(n1) last=1@1 commit=1@1 applied=1 # Replicate a couple of writes. (put 1 a=1) (put 1 b=2) (stabilize heartbeat=true) --- ok # Restart n3 and clear its state machine. The node will apply all pending # entries when restarting. 
restart 3 applied_index=0 --- n3@1 apply 1@1 None n3@1 apply 2@1 put a=1 n3@1 apply 3@1 put b=2 n3@1 follower() last=3@1 commit=3@1 applied=3 state 3 --- n3@1 applied=3 n3@1 state a=1 n3@1 state b=2 # Restart n3 and lose the last write. It will also be reapplied. restart 3 applied_index=2 --- n3@1 apply 3@1 put b=2 n3@1 follower() last=3@1 commit=3@1 applied=3 state 3 --- n3@1 applied=3 n3@1 state a=1 n3@1 state b=2 ================================================ FILE: src/raft/testscripts/node/restart_commit_recover ================================================ # Restarting the cluster and wiping the commit indexes allows # a new leader to recover the commit index. cluster nodes=3 leader=1 --- n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2} n2@1 follower(n1) last=1@1 commit=1@1 applied=1 n3@1 follower(n1) last=1@1 commit=1@1 applied=1 # Replicate a couple of writes, but don't propagate the commit index. (put 1 a=1) (put 1 b=2) (stabilize) status --- n1@1 leader last=3@1 commit=3@1 applied=3 progress={2:3→4 3:3→4} n2@1 follower(n1) last=3@1 commit=1@1 applied=1 n3@1 follower(n1) last=3@1 commit=1@1 applied=1 # Restart all nodes and wipe the commit index. restart commit_index=0 --- n1@1 follower() last=3@1 commit=0@0 applied=3 n2@1 follower() last=3@1 commit=0@0 applied=1 n3@1 follower() last=3@1 commit=0@0 applied=1 # n3 campaigns for leadership and recovers the commit index. 
campaign 3 stabilize --- n3@1 follower() ⇨ n3@2 candidate n3@2 → n1 Campaign last=3@1 n3@2 → n2 Campaign last=3@1 n1@1 follower() ⇨ n1@2 follower() n1@2 → n3 CampaignResponse vote=true n2@1 follower() ⇨ n2@2 follower() n2@2 → n3 CampaignResponse vote=true n3@2 candidate ⇨ n3@2 leader n3@2 append 4@2 None n3@2 → n1 Append base=3@1 [4@2] n3@2 → n2 Append base=3@1 [4@2] n3@2 → n1 Heartbeat last_index=4 commit_index=0 read_seq=0 n3@2 → n2 Heartbeat last_index=4 commit_index=0 read_seq=0 n1@2 follower() ⇨ n1@2 follower(n3) n1@2 append 4@2 None n1@2 → n3 AppendResponse match_index=4 n1@2 → n3 HeartbeatResponse match_index=4 read_seq=0 n2@2 follower() ⇨ n2@2 follower(n3) n2@2 append 4@2 None n2@2 → n3 AppendResponse match_index=4 n2@2 → n3 HeartbeatResponse match_index=4 read_seq=0 n3@2 commit 4@2 status --- n1@2 follower(n3) last=4@2 commit=0@0 applied=3 n2@2 follower(n3) last=4@2 commit=0@0 applied=1 n3@2 leader last=4@2 commit=4@2 applied=4 progress={1:4→5 2:4→5} # A heartbeat propagates the commit index. heartbeat 3 stabilize --- n3@2 → n1 Heartbeat last_index=4 commit_index=4 read_seq=0 n3@2 → n2 Heartbeat last_index=4 commit_index=4 read_seq=0 n1@2 commit 4@2 n1@2 → n3 HeartbeatResponse match_index=4 read_seq=0 n2@2 commit 4@2 n2@2 → n3 HeartbeatResponse match_index=4 read_seq=0 ================================================ FILE: src/raft/testscripts/node/restart_term_vote ================================================ # The term/vote is retained across a restart. cluster nodes=3 --- n1@0 follower() last=0@0 commit=0@0 applied=0 n2@0 follower() last=0@0 commit=0@0 applied=0 n3@0 follower() last=0@0 commit=0@0 applied=0 # Start a new election on n1. campaign 1 --- n1@0 follower() ⇨ n1@1 candidate n1@1 → n2 Campaign last=0@0 n1@1 → n3 Campaign last=0@0 # n3 votes for n1, and then restarts. 
deliver 3 --- n3@0 follower() ⇨ n3@1 follower() n3@1 → n1 CampaignResponse vote=true restart 3 --- n3@1 follower() last=0@0 commit=0@0 applied=0 # n3 still has a record of the term and vote in the log. log 3 --- n3@1 term=1 last=0@0 commit=0@0 vote=Some(1) # n2 also campaigns. n3 does not grant its vote. campaign 2 --- n2@0 follower() ⇨ n2@1 candidate n2@1 → n1 Campaign last=0@0 n2@1 → n3 Campaign last=0@0 deliver 3 --- n3@1 → n2 CampaignResponse vote=false # n1 wins leadership. (stabilize) status --- n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2} n2@1 follower(n1) last=1@1 commit=0@0 applied=0 n3@1 follower(n1) last=1@1 commit=0@0 applied=0 ================================================ FILE: src/raft/testscripts/node/tick_candidate ================================================ # Ticking a candidate will eventually hold a new election in a later term. cluster nodes=3 heartbeat_interval=1 election_timeout=2 --- n1@0 follower() last=0@0 commit=0@0 applied=0 n2@0 follower() last=0@0 commit=0@0 applied=0 n3@0 follower() last=0@0 commit=0@0 applied=0 # n1 campaigns. campaign 1 --- n1@0 follower() ⇨ n1@1 candidate n1@1 → n2 Campaign last=0@0 n1@1 → n3 Campaign last=0@0 # A single tick does nothing. tick 1 --- ok # Another tick campaigns in a later term. tick 1 --- n1@1 candidate ⇨ n1@2 candidate n1@2 → n2 Campaign last=0@0 n1@2 → n3 Campaign last=0@0 status --- n1@2 candidate last=0@0 commit=0@0 applied=0 n2@0 follower() last=0@0 commit=0@0 applied=0 n3@0 follower() last=0@0 commit=0@0 applied=0 ================================================ FILE: src/raft/testscripts/node/tick_follower ================================================ # Ticking a follower will transition it to candidate if it hasn't # heard from the leader in a while. 
cluster nodes=3 leader=1 heartbeat_interval=1 election_timeout=2 --- n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2} n2@1 follower(n1) last=1@1 commit=1@1 applied=1 n3@1 follower(n1) last=1@1 commit=1@1 applied=1 # A single follower tick does nothing. tick 2 --- ok # If n1 heartbeats, the election counter is reset, and another n2 tick does nothing. heartbeat 1 stabilize --- n1@1 → n2 Heartbeat last_index=1 commit_index=1 read_seq=0 n1@1 → n3 Heartbeat last_index=1 commit_index=1 read_seq=0 n2@1 → n1 HeartbeatResponse match_index=1 read_seq=0 n3@1 → n1 HeartbeatResponse match_index=1 read_seq=0 tick 2 --- ok # Ticking n2 again exceeds the election timeout, making it campaign. tick 2 --- n2@1 follower(n1) ⇨ n2@2 candidate n2@2 → n1 Campaign last=1@1 n2@2 → n3 Campaign last=1@1 status --- n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2} n2@2 candidate last=1@1 commit=1@1 applied=1 n3@1 follower(n1) last=1@1 commit=1@1 applied=1 ================================================ FILE: src/raft/testscripts/node/tick_follower_leaderless ================================================ # Ticking a leaderless follower will eventually transition it to candidate. cluster nodes=3 heartbeat_interval=1 election_timeout=2 --- n1@0 follower() last=0@0 commit=0@0 applied=0 n2@0 follower() last=0@0 commit=0@0 applied=0 n3@0 follower() last=0@0 commit=0@0 applied=0 # A single follower tick does nothing. tick 2 --- ok # Another tick makes it campaign. tick 2 --- n2@0 follower() ⇨ n2@1 candidate n2@1 → n1 Campaign last=0@0 n2@1 → n3 Campaign last=0@0 status --- n1@0 follower() last=0@0 commit=0@0 applied=0 n2@1 candidate last=0@0 commit=0@0 applied=0 n3@0 follower() last=0@0 commit=0@0 applied=0 ================================================ FILE: src/raft/testscripts/node/tick_leader ================================================ # Ticking a leader should cause it to emit heartbeats, even when it doesn't # hear back from any followers. 
cluster nodes=3 leader=1 heartbeat_interval=1 election_timeout=2 --- n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2} n2@1 follower(n1) last=1@1 commit=1@1 applied=1 n3@1 follower(n1) last=1@1 commit=1@1 applied=1 # Ticking n1 will emit a heartbeat. tick 1 --- n1@1 → n2 Heartbeat last_index=1 commit_index=1 read_seq=0 n1@1 → n3 Heartbeat last_index=1 commit_index=1 read_seq=0 # Ticking n1 again will emit further heartbeats, even when it hasn't heard from # any followers. tick 1 tick 1 tick 1 --- n1@1 → n2 Heartbeat last_index=1 commit_index=1 read_seq=0 n1@1 → n3 Heartbeat last_index=1 commit_index=1 read_seq=0 n1@1 → n2 Heartbeat last_index=1 commit_index=1 read_seq=0 n1@1 → n3 Heartbeat last_index=1 commit_index=1 read_seq=0 n1@1 → n2 Heartbeat last_index=1 commit_index=1 read_seq=0 n1@1 → n3 Heartbeat last_index=1 commit_index=1 read_seq=0 ================================================ FILE: src/server.rs ================================================ use std::collections::HashMap; use std::io::{BufReader, BufWriter, Write as _}; use std::net::{TcpListener, TcpStream, ToSocketAddrs}; use std::time::Duration; use crossbeam::channel::{Receiver, Sender}; use log::{debug, error, info}; use serde::{Deserialize, Serialize}; use uuid::Uuid; use crate::encoding::{self, Value as _}; use crate::error::Result; use crate::raft; use crate::sql; use crate::sql::engine::{Catalog as _, Engine as _}; use crate::sql::execution::StatementResult; use crate::sql::types::{Row, Table}; use crate::storage; /// The outbound Raft peer channel capacity. This buffers messages when a Raft /// peer is slow or unavailable. Beyond this, messages will be dropped. const RAFT_PEER_CHANNEL_CAPACITY: usize = 1000; /// The retry interval when connecting to a Raft peer. const RAFT_PEER_RETRY_INTERVAL: Duration = Duration::from_secs(1); /// A toyDB server. Routes messages to/from an inner Raft node. 
/// /// * Listens for inbound SQL connections from clients via TCP and passes /// requests to the local Raft node. /// /// * Listens for inbound Raft connections from other toyDB nodes via TCP and /// passes messages to the local Raft node. /// /// * Connects to other toyDB nodes via TCP and sends outbound Raft messages /// from the local Raft node. pub struct Server { /// The inner Raft node. node: raft::Node, /// Outbound messages from the Raft node. node_rx: Receiver, /// Raft peer IDs and addresses. peers: HashMap, } impl Server { /// Creates a new toyDB server. pub fn new( id: raft::NodeID, peers: HashMap, raft_log: raft::Log, raft_state: Box, ) -> Result { let (node_tx, node_rx) = crossbeam::channel::unbounded(); let node = raft::Node::new( id, peers.keys().copied().collect(), raft_log, raft_state, node_tx, raft::Options::default(), )?; Ok(Self { node, peers, node_rx }) } /// Serves Raft and SQL requests indefinitely. Consumes the server. pub fn serve(self, raft_addr: impl ToSocketAddrs, sql_addr: impl ToSocketAddrs) -> Result<()> { let raft_listener = TcpListener::bind(raft_addr)?; let sql_listener = TcpListener::bind(sql_addr)?; info!( "Listening on {} (SQL) and {} (Raft)", sql_listener.local_addr()?, raft_listener.local_addr()? ); std::thread::scope(move |s| { let id = self.node.id(); let (raft_request_tx, raft_request_rx) = crossbeam::channel::unbounded(); let (raft_step_tx, raft_step_rx) = crossbeam::channel::unbounded(); // Serve inbound Raft connections. s.spawn(move || Self::raft_accept(raft_listener, raft_step_tx)); // Establish outbound Raft connections to peers. let mut raft_peers_tx = HashMap::new(); for (id, addr) in self.peers.into_iter() { let (raft_peer_tx, raft_peer_rx) = crossbeam::channel::bounded(RAFT_PEER_CHANNEL_CAPACITY); raft_peers_tx.insert(id, raft_peer_tx); s.spawn(move || Self::raft_send_peer(addr, raft_peer_rx)); } // Route Raft messages between the local node, peers, and clients. 
s.spawn(move || { Self::raft_route( self.node, self.node_rx, raft_step_rx, raft_peers_tx, raft_request_rx, ) }); // Serve inbound SQL connections. let sql_engine = sql::engine::Raft::new(raft_request_tx); s.spawn(move || Self::sql_accept(id, sql_listener, sql_engine)); }); Ok(()) } /// Accepts new inbound Raft connections from peers and spawns threads /// routing inbound messages to the local Raft node. fn raft_accept(listener: TcpListener, raft_step_tx: Sender) { std::thread::scope(|s| { loop { let (socket, peer) = match listener.accept() { Ok((socket, peer)) => (socket, peer), Err(err) => { error!("Raft peer accept failed: {err}"); continue; } }; let raft_step_tx = raft_step_tx.clone(); s.spawn(move || { debug!("Raft peer {peer} connected"); match Self::raft_receive_peer(socket, raft_step_tx) { Ok(()) => debug!("Raft peer {peer} disconnected"), Err(err) => error!("Raft peer {peer} error: {err}"), } }); } }); } /// Receives inbound messages from a peer via TCP, and queues them for /// stepping into the Raft node. fn raft_receive_peer(socket: TcpStream, raft_step_tx: Sender) -> Result<()> { let mut socket = BufReader::new(socket); while let Some(message) = raft::Envelope::maybe_decode_from(&mut socket)? { raft_step_tx.send(message)?; } Ok(()) } /// Sends outbound messages to a peer via TCP. Retries indefinitely if the /// connection fails. fn raft_send_peer(addr: String, raft_node_rx: Receiver) { loop { let mut socket = match TcpStream::connect(&addr) { Ok(socket) => BufWriter::new(socket), Err(err) => { error!("Failed connecting to Raft peer {addr}: {err}"); std::thread::sleep(RAFT_PEER_RETRY_INTERVAL); continue; } }; while let Ok(message) = raft_node_rx.recv() { if let Err(err) = message.encode_into(&mut socket).and_then(|_| Ok(socket.flush()?)) { error!("Failed sending to Raft peer {addr}: {err}"); break; } } debug!("Disconnected from Raft peer {addr}"); } } /// Routes Raft messages: /// /// * node_rx: outbound messages from the local Raft node. 
Routed to peers /// via TCP, or to local clients via a response channel. /// /// * request_rx: inbound requests from local SQL clients. Stepped into /// the local Raft node as ClientRequest messages. Responses are returned /// via the provided response channel. /// /// * peers_rx: inbound messages from remote Raft peers. Stepped into the /// local Raft node. /// /// * peers_tx: outbound per-peer channels sent via TCP connections. /// Messages from the local node's node_rx are sent here. /// /// Panics on any errors, since the Raft node can't recover from failed /// state transitions. fn raft_route( mut node: raft::Node, node_rx: Receiver, peers_rx: Receiver, mut peers_tx: HashMap>, request_rx: Receiver<(raft::Request, Sender>)>, ) { // Track response channels by request ID. The Raft node will emit // ClientResponse messages that we forward to the response channel. let mut response_txs = HashMap::>>::new(); let ticker = crossbeam::channel::tick(raft::TICK_INTERVAL); loop { crossbeam::select! { // Periodically tick the node. recv(ticker) -> _ => node = node.tick().expect("tick failed"), // Step messages from peers into the node. recv(peers_rx) -> result => { let msg = result.expect("peers_rx disconnected"); node = node.step(msg).expect("step failed"); }, // Send outbound messages from the node to the appropriate peer. // If we receive a client response addressed to the local node, // forward it to the waiting client via the response channel. 
recv(node_rx) -> result => { let msg = result.expect("node_rx disconnected"); if msg.to == node.id() && let raft::Message::ClientResponse{ id, response } = msg.message { if let Some(response_tx) = response_txs.remove(&id) { response_tx.send(response).expect("response_tx disconnected"); } continue } let peer_tx = peers_tx.get_mut(&msg.to).expect("unknown peer"); match peer_tx.try_send(msg) { Ok(()) => {}, Err(crossbeam::channel::TrySendError::Full(_)) => { error!("Raft peer channel full, dropping message"); }, Err(crossbeam::channel::TrySendError::Disconnected(_)) => { panic!("peer_tx disconnected"); }, }; } // Track inbound client requests and step them into the node. recv(request_rx) -> result => { let (request, response_tx) = result.expect("request_rx disconnected"); let id = Uuid::new_v4(); let msg = raft::Envelope{ from: node.id(), to: node.id(), term: node.term(), message: raft::Message::ClientRequest{id, request}, }; node = node.step(msg).expect("step failed"); response_txs.insert(id, response_tx); } } } } /// Accepts new SQL client connections and spawns session threads for them. fn sql_accept(id: raft::NodeID, listener: TcpListener, sql_engine: sql::engine::Raft) { std::thread::scope(|s| { loop { let (socket, peer) = match listener.accept() { Ok((socket, peer)) => (socket, peer), Err(err) => { error!("Client accept failed: {err}"); continue; } }; let session = sql_engine.session(); s.spawn(move || { debug!("Client {peer} connected"); match Self::sql_session(id, socket, session) { Ok(()) => debug!("Client {peer} disconnected"), Err(err) => error!("Client {peer} error: {err}"), } }); } }) } /// Processes a client SQL session, executing SQL statements against the /// Raft node. fn sql_session( id: raft::NodeID, socket: TcpStream, mut session: sql::execution::Session, ) -> Result<()> { let mut reader = BufReader::new(socket.try_clone()?); let mut writer = BufWriter::new(socket); while let Some(request) = Request::maybe_decode_from(&mut reader)? 
{ // Execute request. debug!("Received request {request:?}"); let response = match request { Request::Execute(query) => session.execute(&query).map(Response::Execute), Request::GetTable(table) => { session.with_txn(true, |txn| txn.must_get_table(&table)).map(Response::GetTable) } Request::ListTables => session .with_txn(true, |txn| { Ok(txn.list_tables()?.into_iter().map(|t| t.name).collect()) }) .map(Response::ListTables), Request::Status => session .status() .map(|s| Status { server: id, raft: s.raft, mvcc: s.mvcc }) .map(Response::Status), }; // Process response. debug!("Returning response {response:?}"); response.encode_into(&mut writer)?; writer.flush()?; } Ok(()) } } /// A SQL client request. #[derive(Debug, Serialize, Deserialize)] pub enum Request { /// Executes a SQL statement. Execute(String), /// Fetches the given table schema. GetTable(String), /// Lists all tables. ListTables, /// Returns server status. Status, } impl encoding::Value for Request {} /// A SQL server response. #[derive(Debug, Serialize, Deserialize)] pub enum Response { Execute(StatementResult), Row(Option), GetTable(Table), ListTables(Vec), Status(Status), } impl encoding::Value for Response {} /// SQL server status. #[derive(Debug, PartialEq, Serialize, Deserialize)] pub struct Status { pub server: raft::NodeID, pub raft: raft::Status, pub mvcc: storage::mvcc::Status, } ================================================ FILE: src/sql/engine/engine.rs ================================================ use std::collections::{BTreeMap, BTreeSet}; use crate::errinput; use crate::error::Result; use crate::sql::execution::Session; use crate::sql::types::{Expression, Row, Rows, Table, Value}; use crate::storage::mvcc; /// A SQL engine. This provides low-level CRUD (create, read, update, delete) /// operations for table rows, a schema catalog for accessing and modifying /// table schemas, and interactive SQL sessions that execute client SQL /// statements. 
All engine access is transactional with snapshot isolation. pub trait Engine<'a>: Sized { /// The engine's transaction type. This provides both row-level CRUD operations and /// transactional access to the schema catalog. type Transaction: Transaction + 'a; /// Begins a read-write transaction. fn begin(&'a self) -> Result; /// Begins a read-only transaction. fn begin_read_only(&'a self) -> Result; /// Begins a read-only transaction as of a historical version. fn begin_as_of(&'a self, version: mvcc::Version) -> Result; /// Creates a client session for executing SQL statements. fn session(&'a self) -> Session<'a, Self> { Session::new(self) } } /// A SQL transaction. Executes transactional CRUD operations on table rows. /// Provides snapshot isolation (see `storage::mvcc` module for details). /// /// All methods operate on row batches rather than single rows to amortize the /// cost. With the Raft engine, each call results in a Raft roundtrip, and we'd /// rather not have to do that for every single row that's modified. pub trait Transaction: Catalog { /// The transaction's internal MVCC state. fn state(&self) -> &mvcc::TransactionState; /// Commits the transaction. fn commit(self) -> Result<()>; /// Rolls back the transaction. fn rollback(self) -> Result<()>; /// Deletes table rows by primary key, if they exist. fn delete(&self, table: &str, ids: &[Value]) -> Result<()>; /// Fetches table rows by primary key, if they exist. fn get(&self, table: &str, ids: &[Value]) -> Result>; /// Inserts new table rows. fn insert(&self, table: &str, rows: Vec) -> Result<()>; /// Looks up a set of primary keys by index values. BTreeSet for testing. fn lookup_index(&self, table: &str, column: &str, values: &[Value]) -> Result>; /// Scans a table's rows, optionally applying the given filter. fn scan(&self, table: &str, filter: Option) -> Result; /// Updates table rows by primary key. BTreeMap for testing. 
fn update(&self, table: &str, rows: BTreeMap) -> Result<()>; } /// The catalog stores table schema information. It must be implemented for /// Transaction, and is thus fully transactional. For simplicity, it only /// supports creating and dropping tables -- there are no ALTER TABLE schema /// changes, nor CREATE INDEX. pub trait Catalog { /// Creates a new table. Errors if it already exists. fn create_table(&self, table: Table) -> Result<()>; /// Drops a table. Errors if it does not exist, unless if_exists is true. /// Returns true if the table existed and was deleted. fn drop_table(&self, table: &str, if_exists: bool) -> Result; /// Fetches a table schema, or None if it doesn't exist. fn get_table(&self, table: &str) -> Result>; /// Returns a list of all table schemas. fn list_tables(&self) -> Result>; /// Fetches a table schema, or errors if it does not exist. fn must_get_table(&self, table: &str) -> Result
{ self.get_table(table)?.ok_or_else(|| errinput!("table {table} does not exist")) } } ================================================ FILE: src/sql/engine/local.rs ================================================ use std::borrow::Cow; use std::collections::{BTreeMap, BTreeSet}; use std::slice; use itertools::Itertools as _; use serde::{Deserialize, Serialize}; use super::Catalog; use crate::encoding::{self, Key as _, Value as _}; use crate::errinput; use crate::error::Result; use crate::sql::types::{Expression, Row, Rows, Table, Value}; use crate::storage::{self, mvcc}; /// SQL engine keys, using the Keycode order-preserving encoding. For /// simplicity, table and column names are used directly as identifiers, instead /// of e.g. numeric IDs. It is not possible to change table/column names, so /// this is fine, if somewhat inefficient. /// /// Uses Cow to allow encoding borrowed values but decoding owned values. #[derive(Debug, Deserialize, Serialize)] pub enum Key<'a> { /// A table schema, keyed by table name. The value is a `sql::types::Table`. Table(Cow<'a, str>), /// A column index entry, keyed by table name, column name, and index value. /// The value is a `BTreeSet` of `sql::types::Value` primary key values. Index(Cow<'a, str>, Cow<'a, str>, Cow<'a, Value>), /// A table row, keyed by table name and primary key value. The value is a /// `sql::types::Row`. Row(Cow<'a, str>, Cow<'a, Value>), } impl<'a> encoding::Key<'a> for Key<'a> {} /// Key prefixes, allowing prefix scans of specific parts of the keyspace. These /// must match the keys -- in particular, the enum variant indexes must match, /// since it's part of the encoded key. #[derive(Deserialize, Serialize)] enum KeyPrefix<'a> { /// All table schemas. Table, /// All column index entries, keyed by table and column name. Index(Cow<'a, str>, Cow<'a, str>), /// All table rows, keyed by table name. Row(Cow<'a, str>), } impl<'a> encoding::Key<'a> for KeyPrefix<'a> {} /// A SQL engine using local storage. 
This provides the main SQL storage logic. /// The Raft SQL engine dispatches to this for node-local SQL storage, executing /// the same writes across each nodes' instance of `Local`. pub struct Local { /// The local MVCC storage engine. pub mvcc: mvcc::MVCC, } impl Local { /// Creates a new local SQL engine using the given storage engine. pub fn new(engine: E) -> Self { Self { mvcc: mvcc::MVCC::new(engine) } } /// Resumes a transaction from the given state. This is usually kept within /// `mvcc::Transaction`, but the Raft-based engine can't retain the MVCC /// transaction across requests since it may be executed on different leader /// nodes, so it instead keeps the state client-side in the session. pub fn resume(&self, state: mvcc::TransactionState) -> Result> { Ok(Transaction::new(self.mvcc.resume(state)?)) } /// Gets an unversioned key, or None if it doesn't exist. pub fn get_unversioned(&self, key: &[u8]) -> Result>> { self.mvcc.get_unversioned(key) } /// Sets an unversioned key. pub fn set_unversioned(&self, key: &[u8], value: Vec) -> Result<()> { self.mvcc.set_unversioned(key, value) } } impl super::Engine<'_> for Local { type Transaction = Transaction; fn begin(&self) -> Result { Ok(Self::Transaction::new(self.mvcc.begin()?)) } fn begin_read_only(&self) -> Result { Ok(Self::Transaction::new(self.mvcc.begin_read_only()?)) } fn begin_as_of(&self, version: mvcc::Version) -> Result { Ok(Self::Transaction::new(self.mvcc.begin_as_of(version)?)) } } /// A SQL transaction, wrapping an MVCC transaction. pub struct Transaction { txn: mvcc::Transaction, } impl Transaction { /// Creates a new SQL transaction using the given MVCC transaction. fn new(txn: mvcc::Transaction) -> Self { Self { txn } } /// Returns the transaction's internal state. pub fn state(&self) -> &mvcc::TransactionState { self.txn.state() } /// Fetches the matching primary keys for the given secondary index value, /// or an empty set if there is none. 
fn get_index(&self, table: &str, column: &str, value: &Value) -> Result> { debug_assert!(self.has_index(table, column)?, "no index on {table}.{column}"); Ok(self .txn .get(&Key::Index(table.into(), column.into(), value.into()).encode())? .map(|v| BTreeSet::decode(&v)) .transpose()? .unwrap_or_default()) } /// Fetches a single row by primary key, or None if it doesn't exist. fn get_row(&self, table: &str, id: &Value) -> Result> { self.txn .get(&Key::Row(table.into(), id.into()).encode())? .map(|v| Row::decode(&v)) .transpose() } /// Returns true if a secondary index exists for the given column. fn has_index(&self, table: &str, column: &str) -> Result { let table = self.must_get_table(table)?; Ok(table.columns.iter().find(|c| c.name == column).map(|c| c.index).unwrap_or(false)) } /// Stores a secondary index entry for the given column value, replacing the /// existing entry if any. fn set_index( &self, table: &str, column: &str, value: &Value, ids: BTreeSet, ) -> Result<()> { debug_assert!(self.has_index(table, column)?, "no index on {table}.{column}"); let key = Key::Index(table.into(), column.into(), value.into()).encode(); if ids.is_empty() { self.txn.delete(&key)?; } else { self.txn.set(&key, ids.encode())?; } Ok(()) } /// Returns all tables referencing a table, as (table, column index) pairs. /// This includes any references from the table itself. fn table_references(&self, table: &str) -> Result)>> { Ok(self .list_tables()? 
.into_iter() .map(|t| { let references = t .columns .iter() .enumerate() .filter(|(_, c)| c.references.as_deref() == Some(table)) .map(|(i, _)| i) .collect_vec(); (t, references) }) .filter(|(_, references)| !references.is_empty()) .collect()) } } impl super::Transaction for Transaction { fn state(&self) -> &mvcc::TransactionState { self.txn.state() } fn commit(self) -> Result<()> { self.txn.commit() } fn rollback(self) -> Result<()> { self.txn.rollback() } fn delete(&self, table: &str, ids: &[Value]) -> Result<()> { let table = self.must_get_table(table)?; let indexes = table.columns.iter().enumerate().filter(|(_, c)| c.index).collect_vec(); // Check for foreign key references to the deleted rows. for (source, refs) in self.table_references(&table.name)? { let self_reference = source.name == table.name; for i in refs { let column = &source.columns[i]; let mut source_ids = if i == source.primary_key { // If the reference is from a primary key column, do a lookup. self.get(&source.name, ids)? .into_iter() .map(|row| row.into_iter().nth(i).expect("short row")) .collect() } else { // Otherwise (commonly), do a secondary index lookup. // All foreign keys have a secondary index. self.lookup_index(&source.name, &column.name, ids)? }; // We can ignore any references between the deleted rows, // including a row referencing itself. if self_reference { for id in ids { source_ids.remove(id); } } // Error if the delete would violate referential integrity. if let Some(source_id) = source_ids.first() { let table = source.name; let column = &source.columns[source.primary_key].name; return errinput!("row referenced by {table}.{column}={source_id}"); } } } for id in ids { // Update any secondary index entries. if !indexes.is_empty() && let Some(row) = self.get_row(&table.name, id)? 
{ for (i, column) in indexes.iter().copied() { let mut ids = self.get_index(&table.name, &column.name, &row[i])?; ids.remove(id); self.set_index(&table.name, &column.name, &row[i], ids)?; } } // Delete the row. self.txn.delete(&Key::Row((&table.name).into(), id.into()).encode())?; } Ok(()) } fn get(&self, table: &str, ids: &[Value]) -> Result> { ids.iter().filter_map(|id| self.get_row(table, id).transpose()).collect() } fn insert(&self, table: &str, rows: Vec) -> Result<()> { let table = self.must_get_table(table)?; for row in rows { // Insert the row. table.validate_row(&row, false, self)?; let id = &row[table.primary_key]; self.txn.set(&Key::Row((&table.name).into(), id.into()).encode(), row.encode())?; // Update any secondary index entries. for (i, column) in table.columns.iter().enumerate().filter(|(_, c)| c.index) { let mut ids = self.get_index(&table.name, &column.name, &row[i])?; ids.insert(id.clone()); self.set_index(&table.name, &column.name, &row[i], ids)?; } } Ok(()) } fn lookup_index(&self, table: &str, column: &str, values: &[Value]) -> Result> { debug_assert!(self.has_index(table, column)?, "no index on {table}.{column}"); values.iter().map(|v| self.get_index(table, column, v)).flatten_ok().collect() } fn scan(&self, table: &str, filter: Option) -> Result { // TODO: this could be simpler if process_results() implemented Clone. let rows = self .txn .scan_prefix(&KeyPrefix::Row(table.into()).encode()) .map(|result| result.and_then(|(_, value)| Row::decode(&value))); let Some(filter) = filter else { return Ok(Box::new(rows)); }; let rows = rows.filter_map(move |result| { result .and_then(|row| match filter.evaluate(Some(&row))? 
{ Value::Boolean(true) => Ok(Some(row)), Value::Boolean(false) | Value::Null => Ok(None), value => errinput!("filter returned {value}, expected boolean"), }) .transpose() }); Ok(Box::new(rows)) } fn update(&self, table: &str, rows: BTreeMap) -> Result<()> { let table = self.must_get_table(table)?; for (id, row) in rows { // If the primary key changes, we simply do a delete and insert. // This simplifies constraint validation. if id != row[table.primary_key] { self.delete(&table.name, &[id])?; self.insert(&table.name, vec![row])?; continue; } // Validate the row, but don't write it yet since we may need to // read the existing value to update secondary indexes. table.validate_row(&row, true, self)?; // Update indexes, knowing that the primary key has not changed. let indexes = table.columns.iter().enumerate().filter(|(_, c)| c.index).collect_vec(); if !indexes.is_empty() { let old = self.get(&table.name, slice::from_ref(&id))?.remove(0); for (i, column) in indexes { // If the value didn't change, we don't have to do anything. if old[i] == row[i] { continue; } // Remove the old value from the index entry. let mut ids = self.get_index(&table.name, &column.name, &old[i])?; ids.remove(&id); self.set_index(&table.name, &column.name, &old[i], ids)?; // Insert the new value into the index entry. let mut ids = self.get_index(&table.name, &column.name, &row[i])?; ids.insert(id.clone()); self.set_index(&table.name, &column.name, &row[i], ids)?; } } // Update the row. self.txn.set(&Key::Row((&table.name).into(), (&id).into()).encode(), row.encode())?; } Ok(()) } } impl Catalog for Transaction { fn create_table(&self, table: Table) -> Result<()> { if self.get_table(&table.name)?.is_some() { return errinput!("table {} already exists", table.name); } table.validate(self)?; self.txn.set(&Key::Table((&table.name).into()).encode(), table.encode()) } fn drop_table(&self, table: &str, if_exists: bool) -> Result { let Some(table) = self.get_table(table)? 
else { if if_exists { return Ok(false); } return errinput!("table {table} does not exist"); }; // Check for foreign key references. if let Some((source, refs)) = self.table_references(&table.name)?.iter().find(|(t, _)| t.name != table.name) { return errinput!( "table {} is referenced from {}.{}", table.name, source.name, source.columns[refs[0]].name ); } // Delete the table schema entry. self.txn.delete(&Key::Table((&table.name).into()).encode())?; // Delete the table rows. let prefix = &KeyPrefix::Row((&table.name).into()).encode(); let mut keys = self.txn.scan_prefix(prefix).map_ok(|(key, _)| key); while let Some(key) = keys.next().transpose()? { self.txn.delete(&key)?; } // Delete any secondary index entries. for column in table.columns.iter().filter(|c| c.index) { let prefix = &KeyPrefix::Index((&table.name).into(), (&column.name).into()).encode(); let mut keys = self.txn.scan_prefix(prefix).map_ok(|(key, _)| key); while let Some(key) = keys.next().transpose()? { self.txn.delete(&key)?; } } Ok(true) } fn get_table(&self, table: &str) -> Result> { self.txn.get(&Key::Table(table.into()).encode())?.map(|v| Table::decode(&v)).transpose() } fn list_tables(&self) -> Result> { self.txn .scan_prefix(&KeyPrefix::Table.encode()) .map(|r| r.and_then(|(_, v)| Table::decode(&v))) .collect() } } ================================================ FILE: src/sql/engine/mod.rs ================================================ //! The SQL engine provides SQL data storage and access, as well as session and //! transaction management. The `Local` engine provides node-local on-disk //! storage, while the `Raft` engine submits commands through Raft consensus //! before dispatching to the `Local` engine on each node. 
mod engine;
mod local;
mod raft;

pub use engine::{Catalog, Engine, Transaction};
pub use local::{Key, Local};
pub use raft::{Raft, Status, Write};

================================================
FILE: src/sql/engine/raft.rs
================================================
use std::borrow::Cow;
use std::collections::{BTreeMap, BTreeSet};

use crossbeam::channel::Sender;
use serde::de::DeserializeOwned;
use serde::{Deserialize, Serialize};

use super::{Catalog, Engine as _, Transaction as _};
use crate::encoding::{self, Value as _, bincode};
use crate::errdata;
use crate::error::Result;
use crate::raft;
use crate::sql::types::{Expression, Row, Rows, Table, Value};
use crate::storage::{self, mvcc};

/// A read command, submitted via Raft and executed on the leader. Each command
/// corresponds to a SQL engine method and parameters. Uses Cows to allow
/// borrowed encoding and owned decoding.
#[derive(Debug, Serialize, Deserialize)]
pub enum Read<'a> {
    /// Begins a read-only transaction, optionally as of a past MVCC version.
    BeginReadOnly {
        as_of: Option<mvcc::Version>,
    },
    /// Fetches the MVCC status.
    Status,

    /// Fetches rows by primary key.
    Get {
        txn: Cow<'a, mvcc::TransactionState>,
        table: Cow<'a, str>,
        ids: Cow<'a, [Value]>,
    },
    /// Looks up primary keys by secondary index values.
    LookupIndex {
        txn: Cow<'a, mvcc::TransactionState>,
        table: Cow<'a, str>,
        column: Cow<'a, str>,
        values: Cow<'a, [Value]>,
    },
    /// Scans a table, optionally filtering rows.
    Scan {
        txn: Cow<'a, mvcc::TransactionState>,
        table: Cow<'a, str>,
        filter: Option<Expression>,
    },

    /// Fetches a table schema.
    GetTable {
        txn: Cow<'a, mvcc::TransactionState>,
        table: Cow<'a, str>,
    },
    /// Lists all table schemas.
    ListTables {
        txn: Cow<'a, mvcc::TransactionState>,
    },
}

impl encoding::Value for Read<'_> {}

/// A write command, submitted via Raft and executed on all nodes. Each command
/// corresponds to a SQL engine method and parameters. Uses Cows to allow
/// borrowed encoding and owned decoding.
#[derive(Debug, Serialize, Deserialize)] pub enum Write<'a> { Begin, Commit(Cow<'a, mvcc::TransactionState>), Rollback(Cow<'a, mvcc::TransactionState>), Delete { txn: Cow<'a, mvcc::TransactionState>, table: Cow<'a, str>, ids: Cow<'a, [Value]> }, Insert { txn: Cow<'a, mvcc::TransactionState>, table: Cow<'a, str>, rows: Vec }, Update { txn: Cow<'a, mvcc::TransactionState>, table: Cow<'a, str>, rows: BTreeMap }, CreateTable { txn: Cow<'a, mvcc::TransactionState>, schema: Table }, DropTable { txn: Cow<'a, mvcc::TransactionState>, table: Cow<'a, str>, if_exists: bool }, } impl encoding::Value for Write<'_> {} /// Raft SQL engine status. #[derive(Serialize, Deserialize)] pub struct Status { pub raft: raft::Status, pub mvcc: mvcc::Status, } /// A Raft-based SQL engine. This dispatches to the `Local` engine for local /// storage and processing on each node, but sends read and write commands /// through Raft for distributed consensus. /// /// The `Raft` engine itself is simply a Raft client which sends `raft::Request` /// to the local Raft node for processing. These requests are applied to the /// Raft SQL engine's `State` state machine running below Raft on each node, /// which executes the commands on a `Local` SQL engine using a /// `storage::Engine` for local storage. /// /// For more details on how SQL statements flow through the engine, see the /// `sql` module documentation. pub struct Raft { /// Sends requests to the local Raft node, along with a response channel. tx: Sender<(raft::Request, Sender>)>, } impl Raft { /// The unversioned key used to store the applied index. Just uses a string /// for simplicity. pub const APPLIED_INDEX_KEY: &'static [u8] = b"applied_index"; /// Creates a new Raft-based SQL engine, with a channel to send requests to /// the local Raft node. pub fn new(tx: Sender<(raft::Request, Sender>)>) -> Self { Self { tx } } /// Creates the Raft-managed state machine for the Raft engine. 
Receives /// commands from the Raft engine and executes them on a `Local` engine. pub fn new_state(engine: E) -> Result> { State::new(engine) } /// Executes a request against the Raft cluster, waiting for the response. fn request(&self, request: raft::Request) -> Result { let (response_tx, response_rx) = crossbeam::channel::bounded(1); self.tx.send((request, response_tx))?; response_rx.recv()? } /// Writes through Raft, deserializing the response into the return type. fn write(&self, write: Write) -> Result { match self.request(raft::Request::Write(write.encode()))? { raft::Response::Write(response) => bincode::deserialize(&response), response => errdata!("unexpected Raft write response {response:?}"), } } /// Reads from Raft, deserializing the response into the return type. fn read(&self, read: Read) -> Result { match self.request(raft::Request::Read(read.encode()))? { raft::Response::Read(response) => bincode::deserialize(&response), response => errdata!("unexpected Raft read response {response:?}"), } } /// Raft SQL engine status. pub fn status(&self) -> Result { let raft = match self.request(raft::Request::Status)? { raft::Response::Status(status) => status, response => return errdata!("unexpected Raft status response {response:?}"), }; let mvcc = self.read(Read::Status)?; Ok(Status { raft, mvcc }) } } impl<'a> super::Engine<'a> for Raft { type Transaction = Transaction<'a>; fn begin(&'a self) -> Result { Transaction::begin(self, false, None) } fn begin_read_only(&'a self) -> Result { Transaction::begin(self, true, None) } fn begin_as_of(&'a self, version: mvcc::Version) -> Result { Transaction::begin(self, true, Some(version)) } } /// A Raft SQL engine transaction. /// /// This keeps track of the transaction state in memory. An `mvcc::Transaction` /// normally manages this, but since `mvcc::Transaction` runs below Raft, it /// can't maintain this state between individual requests (which could execute /// on different leaders). 
Instead, it uses `mvcc::Transaction::resume` to /// resume the transaction from the provided transaction state for each request. pub struct Transaction<'a> { /// The Raft SQL engine client, used to communicate with Raft. raft: &'a Raft, /// The MVCC transaction state. state: mvcc::TransactionState, } impl<'a> Transaction<'a> { /// Starts a transaction in the given mode. fn begin(raft: &'a Raft, read_only: bool, as_of: Option) -> Result { assert!(as_of.is_none() || read_only, "can't use as_of without read_only"); // Read-only transactions don't allocate a new MVCC version, so they // don't write anything -- they just grab the current transaction state. // Submit them as reads to avoid a replication roundtrip. let state = if read_only || as_of.is_some() { raft.read(Read::BeginReadOnly { as_of })? } else { raft.write(Write::Begin)? }; Ok(Self { raft, state }) } } impl super::Transaction for Transaction<'_> { fn state(&self) -> &mvcc::TransactionState { &self.state } fn commit(self) -> Result<()> { if self.state.read_only { return Ok(()); // noop } self.raft.write(Write::Commit(self.state.into())) } fn rollback(self) -> Result<()> { if self.state.read_only { return Ok(()); // noop } self.raft.write(Write::Rollback(self.state.into())) } fn delete(&self, table: &str, ids: &[Value]) -> Result<()> { self.raft.write(Write::Delete { txn: (&self.state).into(), table: table.into(), ids: ids.into(), }) } fn get(&self, table: &str, ids: &[Value]) -> Result> { self.raft.read(Read::Get { txn: (&self.state).into(), table: table.into(), ids: ids.into(), }) } fn insert(&self, table: &str, rows: Vec) -> Result<()> { self.raft.write(Write::Insert { txn: (&self.state).into(), table: table.into(), rows }) } fn lookup_index(&self, table: &str, column: &str, values: &[Value]) -> Result> { self.raft.read(Read::LookupIndex { txn: (&self.state).into(), table: table.into(), column: column.into(), values: values.into(), }) } fn scan(&self, table: &str, filter: Option) -> Result { let scan: Vec 
= self.raft.read(Read::Scan { txn: (&self.state).into(), table: table.into(), filter, })?; Ok(Box::new(scan.into_iter().map(Ok))) } fn update(&self, table: &str, rows: BTreeMap) -> Result<()> { self.raft.write(Write::Update { txn: (&self.state).into(), table: table.into(), rows }) } } impl Catalog for Transaction<'_> { fn create_table(&self, schema: Table) -> Result<()> { self.raft.write(Write::CreateTable { txn: (&self.state).into(), schema }) } fn drop_table(&self, table: &str, if_exists: bool) -> Result { self.raft.write(Write::DropTable { txn: (&self.state).into(), table: table.into(), if_exists, }) } fn get_table(&self, table: &str) -> Result> { self.raft.read(Read::GetTable { txn: (&self.state).into(), table: table.into() }) } fn list_tables(&self) -> Result> { self.raft.read(Read::ListTables { txn: (&self.state).into() }) } } /// The state machine for the Raft SQL engine. Receives commands via Raft and /// dispatches to a `Local` SQL engine which does the actual work, using a /// `storage::Engine` for storage. /// /// For simplicity, we don't attempt to stream large requests or responses, /// instead just delivering them as one large chunk. This means that e.g. a full /// table scan will pull the entire table into memory, serialize it, and send it /// across the network as one message, but that's fine for toyDB. pub struct State { /// The local SQL engine, used for actual storage. local: super::Local, /// The last applied index. This tells Raft which command to apply next. applied_index: raft::Index, } impl State { /// Creates a new Raft state maching using the given storage engine for /// local storage. pub fn new(engine: E) -> Result { let local = super::Local::new(engine); let applied_index = local .get_unversioned(Raft::APPLIED_INDEX_KEY)? .map(|b| bincode::deserialize(&b)) .transpose()? .unwrap_or_default(); Ok(State { local, applied_index }) } /// Executes a write command. 
This is executed on all nodes, but the /// response is returned from the Raft leader. /// /// The response is encoded using Bincode. The caller will know what /// response type to expect for each command and deserialize into it. fn write(&self, command: Write) -> Result> { Ok(match command { Write::Begin => self.local.begin()?.state().encode(), Write::Commit(txn) => { bincode::serialize(&self.local.resume(txn.into_owned())?.commit()?) } Write::Rollback(txn) => { bincode::serialize(&self.local.resume(txn.into_owned())?.rollback()?) } Write::Delete { txn, table, ids } => { bincode::serialize(&self.local.resume(txn.into_owned())?.delete(&table, &ids)?) } Write::Insert { txn, table, rows } => { bincode::serialize(&self.local.resume(txn.into_owned())?.insert(&table, rows)?) } Write::Update { txn, table, rows } => { bincode::serialize(&self.local.resume(txn.into_owned())?.update(&table, rows)?) } Write::CreateTable { txn, schema } => { bincode::serialize(&self.local.resume(txn.into_owned())?.create_table(schema)?) } Write::DropTable { txn, table, if_exists } => bincode::serialize( &self.local.resume(txn.into_owned())?.drop_table(&table, if_exists)?, ), }) } } impl raft::State for State { fn get_applied_index(&self) -> raft::Index { self.applied_index } fn apply(&mut self, entry: raft::Entry) -> Result> { assert_eq!(entry.index, self.applied_index + 1, "entry index not after applied index"); let result = match &entry.command { Some(command) => match self.write(Write::decode(command)?) { // Panic on non-deterministic apply failures, to prevent node // state divergence. See `raft::State` docs for details. Err(e) if !e.is_deterministic() => panic!("non-deterministic apply failure: {e}"), result => result, }, // Raft submits noop commands on leader changes. Ignore them, but // record the applied index below. None => Ok(Vec::new()), }; // Persist the applied index. We don't have to flush, because it's ok to // lose a tail of the state machine writes (e.g. 
if the machine // crashes). Raft will replay the log from the last known applied index. self.applied_index = entry.index; self.local.set_unversioned(Raft::APPLIED_INDEX_KEY, bincode::serialize(&entry.index))?; result } fn read(&self, command: Vec) -> Result> { Ok(match Read::decode(&command)? { Read::BeginReadOnly { as_of } => { let txn = match as_of { Some(version) => self.local.begin_as_of(version)?, None => self.local.begin_read_only()?, }; txn.state().encode() } Read::Status => self.local.mvcc.status()?.encode(), Read::Get { txn, table, ids } => { self.local.resume(txn.into_owned())?.get(&table, &ids)?.encode() } Read::LookupIndex { txn, table, column, values } => self .local .resume(txn.into_owned())? .lookup_index(&table, &column, &values)? .encode(), Read::Scan { txn, table, filter } => { // For simplicity, buffer the entire scan. See `State` comment. self.local .resume(txn.into_owned())? .scan(&table, filter)? .collect::>>()? .encode() } Read::GetTable { txn, table } => { self.local.resume(txn.into_owned())?.get_table(&table)?.encode() } Read::ListTables { txn } => { self.local.resume(txn.into_owned())?.list_tables()?.encode() } }) } } ================================================ FILE: src/sql/execution/aggregator.rs ================================================ use std::collections::BTreeMap; use itertools::Itertools as _; use crate::error::Result; use crate::sql::planner::Aggregate; use crate::sql::types::{Expression, Row, Rows, Value}; /// Computes bucketed aggregates for input rows. For example, this query would /// compute COUNT and SUM aggregates bucketed by category and brand: /// /// SELECT COUNT(*), SUM(price) FROM products GROUP BY category, brand pub struct Aggregator { /// GROUP BY expressions. group_by: Vec, /// Aggregates to compute. aggregates: Vec, /// Accumulators indexed by group_by bucket. buckets: BTreeMap, Vec>, } impl Aggregator { /// Creates a new aggregator for the given GROUP BY buckets and aggregates. 
pub fn new(group_by: Vec, aggregates: Vec) -> Self { Self { group_by, aggregates, buckets: BTreeMap::new() } } /// Adds a row to the aggregator. pub fn add(&mut self, row: &Row) -> Result<()> { // Compute the bucket values. let bucket = self.group_by.iter().map(|expr| expr.evaluate(Some(row))).try_collect()?; // Look up the bucket accumulators, or create a new bucket. let accumulators = self .buckets .entry(bucket) .or_insert_with(|| self.aggregates.iter().map(Accumulator::new).collect()) .iter_mut(); // Collect expressions to evaluate. let exprs = self.aggregates.iter().map(|a| a.expr()); // Accumulate the evaluated values. for (accumulator, expr) in accumulators.zip_eq(exprs) { accumulator.add(expr.evaluate(Some(row))?)?; } Ok(()) } /// Adds rows to the aggregator. pub fn add_rows(&mut self, rows: Rows) -> Result<()> { for row in rows { self.add(&row?)?; } Ok(()) } /// Returns a row iterator over the aggregate result. pub fn into_rows(self) -> Rows { // If there were no rows and no group_by expressions, return a row of // empty accumulators (e.g. SELECT COUNT(*) FROM t WHERE FALSE). if self.buckets.is_empty() && self.group_by.is_empty() { let result = self.aggregates.iter().map(Accumulator::new).map(|acc| acc.value()).try_collect(); return Box::new(std::iter::once(result)); } // Emit the group_by and aggregate values for each bucket. We use an // intermediate vec since btree_map::IntoIter doesn't implement Clone // (required by Rows). let buckets = self.buckets.into_iter().collect_vec(); Box::new(buckets.into_iter().map(|(bucket, accumulators)| { bucket .into_iter() .map(Ok) .chain(accumulators.into_iter().map(|acc| acc.value())) .collect() })) } } /// Accumulates aggregate values. Uses an enum rather than a trait since we need /// to keep these in a vector (could use boxed trait objects too). 
#[derive(Clone)]
enum Accumulator {
    Average { count: i64, sum: Value },
    Count(i64),
    Max(Option<Value>),
    Min(Option<Value>),
    Sum(Option<Value>),
}

impl Accumulator {
    /// Creates a new accumulator from an aggregate kind.
    fn new(aggregate: &Aggregate) -> Self {
        match aggregate {
            Aggregate::Average(_) => Self::Average { count: 0, sum: Value::Integer(0) },
            Aggregate::Count(_) => Self::Count(0),
            Aggregate::Max(_) => Self::Max(None),
            Aggregate::Min(_) => Self::Min(None),
            Aggregate::Sum(_) => Self::Sum(None),
        }
    }

    /// Adds a value to the accumulator.
    fn add(&mut self, value: Value) -> Result<()> {
        // Aggregates ignore NULL values.
        if value == Value::Null {
            return Ok(());
        }
        match self {
            Self::Average { sum, count } => (*sum, *count) = (sum.checked_add(&value)?, *count + 1),
            Self::Count(count) => *count += 1,
            Self::Max(max @ None) => *max = Some(value),
            Self::Max(Some(max)) if value > *max => *max = value,
            Self::Max(Some(_)) => {}
            Self::Min(min @ None) => *min = Some(value),
            Self::Min(Some(min)) if value < *min => *min = value,
            Self::Min(Some(_)) => {}
            Self::Sum(sum @ None) => *sum = Some(Value::Integer(0).checked_add(&value)?),
            Self::Sum(Some(sum)) => *sum = sum.checked_add(&value)?,
        }
        Ok(())
    }

    /// Returns the aggregate value. Accumulators that saw no non-NULL values
    /// yield NULL, except COUNT which yields its count.
    fn value(self) -> Result<Value> {
        Ok(match self {
            Self::Average { count: 0, sum: _ } => Value::Null,
            Self::Average { count, sum } => sum.checked_div(&Value::Integer(count))?,
            Self::Count(count) => count.into(),
            Self::Max(Some(value)) | Self::Min(Some(value)) | Self::Sum(Some(value)) => value,
            Self::Max(None) | Self::Min(None) | Self::Sum(None) => Value::Null,
        })
    }
}

================================================
FILE: src/sql/execution/executor.rs
================================================
use std::cmp::Ordering;
use std::collections::{BTreeMap, HashMap};

use itertools::{Itertools as _, izip};

use super::aggregator::Aggregator;
use super::join::{HashJoiner, NestedLoopJoiner};
use crate::errinput;
use crate::error::Result;
use crate::sql::engine::Transaction;
use crate::sql::planner::{Direction, Node, Plan};
use crate::sql::types::{Expression, Label, Row, Rows, Table, Value};

/// Executes statement plans.
///
/// The plan root specifies the action to take (e.g. SELECT, INSERT, UPDATE,
/// etc). It has a nested tree of child nodes that process rows.
///
/// Nodes are executed recursively, and return row iterators. Parent nodes
/// recursively pull input rows from their child nodes, process them, and pass
/// them on to their parent node.
///
/// Below is an example of an (unoptimized) query plan:
///
/// SELECT title, released, genres.name AS genre
/// FROM movies INNER JOIN genres ON movies.genre_id = genres.id
/// WHERE released >= 2000
/// ORDER BY released DESC
///
/// Select
/// └─ Order: movies.released desc
///    └─ Projection: movies.title, movies.released, genres.name as genre
///       └─ Filter: movies.released >= 2000
///          └─ NestedLoopJoin: inner on movies.genre_id = genres.id
///             ├─ Scan: movies
///             └─ Scan: genres
///
/// Rows flow from the tree leaves to the root:
///
/// 1. Scan nodes read rows from movies and genres.
/// 2. NestedLoopJoin joins the rows from movies and genres.
/// 3. Filter discards rows with release dates older than 2000.
/// 4.
Projection picks out the requested column values from the rows. /// 5. Order sorts the rows by release date. /// 6. Select returns the final rows to the client. pub struct Executor<'a, T: Transaction> { /// The transaction used to execute the plan. txn: &'a T, } impl<'a, T: Transaction> Executor<'a, T> { /// Creates a new executor. pub fn new(txn: &'a T) -> Self { Self { txn } } /// Executes a plan, returning an execution result. pub fn execute(&mut self, plan: Plan) -> Result { Ok(match plan { // CREATE TABLE Plan::CreateTable { schema } => { let name = schema.name.clone(); self.txn.create_table(schema)?; ExecutionResult::CreateTable { name } } // DROP TABLE Plan::DropTable { name, if_exists } => { let existed = self.txn.drop_table(&name, if_exists)?; ExecutionResult::DropTable { name, existed } } // DELETE Plan::Delete { table, primary_key, source } => { let source = self.execute_node(source)?; let count = self.delete(&table, primary_key, source)?; ExecutionResult::Delete { count } } // INSERT Plan::Insert { table, column_map, source } => { let source = self.execute_node(source)?; let count = self.insert(table, column_map, source)?; ExecutionResult::Insert { count } } // SELECT Plan::Select(root) => { let columns = (0..root.columns()).map(|i| root.column_label(i)).collect(); let rows = self.execute_node(root)?; ExecutionResult::Select { columns, rows } } // UPDATE Plan::Update { table, primary_key, source, expressions } => { let source = self.execute_node(source)?; let count = self.update(&table.name, primary_key, source, expressions)?; ExecutionResult::Update { count } } }) } /// Recursively executes a query plan node, returning a row iterator. fn execute_node(&mut self, node: Node) -> Result { Ok(match node { // GROUP BY and aggregate functions. 
Node::Aggregate { source, group_by, aggregates } => { let source = self.execute_node(*source)?; let mut aggregator = Aggregator::new(group_by, aggregates); aggregator.add_rows(source)?; aggregator.into_rows() } // WHERE and similar filtering. Node::Filter { source, predicate } => { let source = self.execute_node(*source)?; Box::new(source.filter_map(move |result| { result .and_then(|row| match predicate.evaluate(Some(&row))? { Value::Boolean(true) => Ok(Some(row)), Value::Boolean(false) | Value::Null => Ok(None), value => errinput!("filter returned {value}, expected boolean",), }) .transpose() })) } // JOIN using a hash join. Node::HashJoin { left, left_column, right, right_column, outer } => { let right_columns = right.columns(); let left = self.execute_node(*left)?; let right = self.execute_node(*right)?; Box::new(HashJoiner::new( left, left_column, right, right_column, right_columns, outer, )?) } // Looks up primary keys by secondary index values. Node::IndexLookup { table, column, values, alias: _ } => { let column = table.columns.into_iter().nth(column).expect("invalid column").name; let ids = self.txn.lookup_index(&table.name, &column, &values)?.into_iter().collect_vec(); Box::new(self.txn.get(&table.name, &ids)?.into_iter().map(Ok)) } // Looks up rows by primary key. Node::KeyLookup { table, keys, alias: _ } => { Box::new(self.txn.get(&table.name, &keys)?.into_iter().map(Ok)) } // LIMIT Node::Limit { source, limit } => Box::new(self.execute_node(*source)?.take(limit)), // JOIN using a nested loop join. Node::NestedLoopJoin { left, right, predicate, outer } => { let right_columns = right.columns(); let left = self.execute_node(*left)?; let right = self.execute_node(*right)?; Box::new(NestedLoopJoiner::new(left, right, right_columns, predicate, outer)) } // An empty row iterator. Node::Nothing { .. 
} => Box::new(std::iter::empty()), // OFFSET Node::Offset { source, offset } => Box::new(self.execute_node(*source)?.skip(offset)), // ORDER BY Node::Order { source, key } => { let source = self.execute_node(*source)?; Box::new(Self::order(source, key)?) } // Projects columns from the source, and evaluates expressions. Node::Projection { source, expressions, aliases: _ } => { let source = self.execute_node(*source)?; Box::new(source.map(move |result| { let row = result?; expressions.iter().map(|expr| expr.evaluate(Some(&row))).collect() })) } // Remaps source column indexes to new target column indexes. Node::Remap { source, targets } => { let source = self.execute_node(*source)?; let size = targets.iter().copied().flatten().map(|i| i + 1).max().unwrap_or(0); Box::new(source.map_ok(move |row| { let mut remapped = vec![Value::Null; size]; for (target, value) in targets.iter().copied().zip_eq(row) { if let Some(target) = target { remapped[target] = value; } } remapped })) } // Scans a table, optionally filtering rows. Node::Scan { table, filter, alias: _ } => Box::new(self.txn.scan(&table.name, filter)?), // Emits constant values. Node::Values { rows } => Box::new( rows.into_iter() .map(|row| row.into_iter().map(|expr| expr.evaluate(None)).collect()), ), }) } /// DELETE: deletes rows, taking primary keys from the source at the given /// primary_key column index. Returns the number of rows deleted. fn delete(&self, table: &str, primary_key: usize, source: Rows) -> Result { let ids: Vec = source .map_ok(|row| row.into_iter().nth(primary_key).expect("short row")) .try_collect()?; let count = ids.len() as u64; self.txn.delete(table, &ids)?; Ok(count) } /// INSERT: inserts rows into a table from the given source. /// /// If given, column_map contains the mapping of table → source columns for /// all columns in source. Otherwise, every column in source corresponds to /// those in table, but a tail of source columns may be missing. 
fn insert( &self, table: Table, column_map: Option>, mut source: Rows, ) -> Result { let mut rows = Vec::new(); while let Some(values) = source.next().transpose()? { // Fast path: the row is already complete, with no column mapping. if values.len() == table.columns.len() && column_map.is_none() { rows.push(values); continue; } if values.len() > table.columns.len() { return errinput!("too many values for table {}", table.name); } if let Some(column_map) = &column_map && column_map.len() != values.len() { return errinput!("column and value counts do not match"); } // Map source columns to table columns, and fill in default values. let mut row = Vec::with_capacity(table.columns.len()); for (i, column) in table.columns.iter().enumerate() { if column_map.is_none() && i < values.len() { // Pass through the source column to the table column. row.push(values[i].clone()) } else if let Some(vi) = column_map.as_ref().and_then(|c| c.get(&i)).copied() { // Map the source column to the table column. row.push(values[vi].clone()) } else if let Some(default) = &column.default { // Column not given in source, use the default. row.push(default.clone()) } else { return errinput!("no value given for column {} with no default", column.name); } } rows.push(row); } let count = rows.len() as u64; self.txn.insert(&table.name, rows)?; Ok(count) } /// UPDATE: updates rows passed in from the source. Returns the number of /// rows updated. fn update( &self, table: &str, primary_key: usize, mut source: Rows, expressions: Vec<(usize, Expression)>, ) -> Result { let mut updates = BTreeMap::new(); while let Some(row) = source.next().transpose()? { let mut update = row.clone(); for (column, expr) in &expressions { update[*column] = expr.evaluate(Some(&row))?; } let id = row.into_iter().nth(primary_key).expect("short row"); updates.insert(id, update); } let count = updates.len() as u64; self.txn.update(table, updates)?; Ok(count) } /// Sorts the input rows. 
fn order(source: Rows, order: Vec<(Expression, Direction)>) -> Result { // We can't use sorted_by_cached_key(), since expression evaluation is // fallible, and since we may have to vary the sort direction of each // expression. Collect the rows and pre-computed sort keys into a vec. let mut rows: Vec<(Row, Vec)> = source .map(|result| { result.and_then(|row| { let sort_keys = order.iter().map(|(expr, _)| expr.evaluate(Some(&row))).try_collect()?; Ok((row, sort_keys)) }) }) .try_collect()?; rows.sort_by(|(_, a_keys), (_, b_keys)| { let dirs = order.iter().map(|(_, dir)| dir).copied(); for (a_key, b_key, dir) in izip!(a_keys, b_keys, dirs) { let mut ordering = a_key.cmp(b_key); if dir == Direction::Descending { ordering = ordering.reverse(); } if ordering != Ordering::Equal { return ordering; } } Ordering::Equal }); Ok(Box::new(rows.into_iter().map(|(row, _)| Ok(row)))) } } /// A plan execution result. pub enum ExecutionResult { CreateTable { name: String }, DropTable { name: String, existed: bool }, Delete { count: u64 }, Insert { count: u64 }, Update { count: u64 }, Select { columns: Vec