Showing preview only (1,523K chars total). Download the full file or copy to clipboard to get everything.
Repository: erikgrinaker/toydb
Branch: main
Commit: 473afbdb4aea
Files: 284
Total size: 1.4 MB
Directory structure:
gitextract_nc06cv1f/
├── .github/
│ └── workflows/
│ └── ci.yml
├── .gitignore
├── Cargo.toml
├── LICENSE
├── README.md
├── cluster/
│ ├── run.sh
│ ├── toydb1/
│ │ └── toydb.yaml
│ ├── toydb2/
│ │ └── toydb.yaml
│ ├── toydb3/
│ │ └── toydb.yaml
│ ├── toydb4/
│ │ └── toydb.yaml
│ └── toydb5/
│ └── toydb.yaml
├── config/
│ └── toydb.yaml
├── docs/
│ ├── architecture/
│ │ ├── README.md
│ │ ├── client.md
│ │ ├── encoding.md
│ │ ├── index.md
│ │ ├── mvcc.md
│ │ ├── overview.md
│ │ ├── raft.md
│ │ ├── server.md
│ │ ├── sql-data.md
│ │ ├── sql-execution.md
│ │ ├── sql-optimizer.md
│ │ ├── sql-parser.md
│ │ ├── sql-planner.md
│ │ ├── sql-raft.md
│ │ ├── sql-storage.md
│ │ ├── sql.md
│ │ └── storage.md
│ ├── architecture.md
│ ├── crate/
│ │ ├── Cargo.toml
│ │ ├── README.md
│ │ └── src/
│ │ └── lib.rs
│ ├── examples.md
│ ├── references.md
│ ├── sql.md
│ └── tools/
│ └── update-links.py
├── rust-toolchain
├── rustfmt.toml
├── src/
│ ├── bin/
│ │ ├── toydb.rs
│ │ ├── toydump.rs
│ │ ├── toysql.rs
│ │ └── workload.rs
│ ├── client.rs
│ ├── encoding/
│ │ ├── bincode.rs
│ │ ├── format.rs
│ │ ├── keycode.rs
│ │ └── mod.rs
│ ├── error.rs
│ ├── lib.rs
│ ├── raft/
│ │ ├── log.rs
│ │ ├── message.rs
│ │ ├── mod.rs
│ │ ├── node.rs
│ │ ├── state.rs
│ │ └── testscripts/
│ │ ├── log/
│ │ │ ├── append
│ │ │ ├── commit
│ │ │ ├── get
│ │ │ ├── has
│ │ │ ├── init
│ │ │ ├── scan
│ │ │ ├── scan_apply
│ │ │ ├── splice
│ │ │ ├── status
│ │ │ └── term
│ │ └── node/
│ │ ├── append
│ │ ├── append_base_missing
│ │ ├── append_base_missing_all
│ │ ├── append_commit_quorum
│ │ ├── append_initial
│ │ ├── append_max_entries
│ │ ├── append_pipeline
│ │ ├── append_probe_divergent_first
│ │ ├── append_probe_divergent_long
│ │ ├── append_probe_divergent_short
│ │ ├── append_probe_divergent_single
│ │ ├── append_response_beyond_last_index_panics
│ │ ├── append_response_stale_reject
│ │ ├── election
│ │ ├── election_candidate_behind_leader
│ │ ├── election_candidate_behind_quorum
│ │ ├── election_contested
│ │ ├── election_tie
│ │ ├── election_tie_even
│ │ ├── heartbeat_commits_follower
│ │ ├── heartbeat_converts_candidate
│ │ ├── heartbeat_converts_follower
│ │ ├── heartbeat_converts_follower_leaderless
│ │ ├── heartbeat_converts_leader
│ │ ├── heartbeat_lost_append_duplicate
│ │ ├── heartbeat_lost_append_multiple
│ │ ├── heartbeat_lost_append_single
│ │ ├── heartbeat_lost_read
│ │ ├── heartbeat_match_commits
│ │ ├── heartbeat_multiple_leaders_panic
│ │ ├── heartbeat_old_commit_index
│ │ ├── heartbeat_old_last_index
│ │ ├── heartbeat_probe_divergent
│ │ ├── old_campaign_rejected
│ │ ├── old_campaign_response_ignored
│ │ ├── old_heartbeat_ignored
│ │ ├── request_candidate_abort
│ │ ├── request_follower
│ │ ├── request_follower_campaign_abort
│ │ ├── request_follower_disconnect_stall
│ │ ├── request_follower_leaderless_abort
│ │ ├── request_leader
│ │ ├── request_leader_campaign_abort
│ │ ├── request_leader_change_linearizability
│ │ ├── request_leader_disconnect
│ │ ├── request_leader_read_quorum
│ │ ├── request_leader_read_quorum_sequence
│ │ ├── request_leader_single
│ │ ├── request_status
│ │ ├── request_status_single
│ │ ├── restart
│ │ ├── restart_apply
│ │ ├── restart_commit_recover
│ │ ├── restart_term_vote
│ │ ├── tick_candidate
│ │ ├── tick_follower
│ │ ├── tick_follower_leaderless
│ │ └── tick_leader
│ ├── server.rs
│ ├── sql/
│ │ ├── engine/
│ │ │ ├── engine.rs
│ │ │ ├── local.rs
│ │ │ ├── mod.rs
│ │ │ └── raft.rs
│ │ ├── execution/
│ │ │ ├── aggregator.rs
│ │ │ ├── executor.rs
│ │ │ ├── join.rs
│ │ │ ├── mod.rs
│ │ │ └── session.rs
│ │ ├── mod.rs
│ │ ├── parser/
│ │ │ ├── ast.rs
│ │ │ ├── lexer.rs
│ │ │ ├── mod.rs
│ │ │ └── parser.rs
│ │ ├── planner/
│ │ │ ├── mod.rs
│ │ │ ├── optimizer.rs
│ │ │ ├── plan.rs
│ │ │ └── planner.rs
│ │ ├── testscripts/
│ │ │ ├── expressions/
│ │ │ │ ├── cnf
│ │ │ │ ├── func
│ │ │ │ ├── func_sqrt
│ │ │ │ ├── literals
│ │ │ │ ├── op_compare_equal
│ │ │ │ ├── op_compare_greater
│ │ │ │ ├── op_compare_greater_equal
│ │ │ │ ├── op_compare_is_nan
│ │ │ │ ├── op_compare_is_null
│ │ │ │ ├── op_compare_lesser
│ │ │ │ ├── op_compare_lesser_equal
│ │ │ │ ├── op_compare_not_equal
│ │ │ │ ├── op_logic_and
│ │ │ │ ├── op_logic_not
│ │ │ │ ├── op_logic_or
│ │ │ │ ├── op_math_add
│ │ │ │ ├── op_math_divide
│ │ │ │ ├── op_math_exponentiate
│ │ │ │ ├── op_math_factorial
│ │ │ │ ├── op_math_identity
│ │ │ │ ├── op_math_multiply
│ │ │ │ ├── op_math_negate
│ │ │ │ ├── op_math_remainder
│ │ │ │ ├── op_math_subtract
│ │ │ │ ├── op_precedence
│ │ │ │ └── op_string_like
│ │ │ ├── optimizers/
│ │ │ │ ├── constant_folder
│ │ │ │ ├── filter_pushdown
│ │ │ │ ├── hash_join
│ │ │ │ ├── index_lookup
│ │ │ │ └── short_circuit
│ │ │ ├── queries/
│ │ │ │ ├── aggregate
│ │ │ │ ├── clauses
│ │ │ │ ├── group_by
│ │ │ │ ├── having
│ │ │ │ ├── join_cross
│ │ │ │ ├── join_inner
│ │ │ │ ├── join_outer
│ │ │ │ ├── limit
│ │ │ │ ├── offset
│ │ │ │ ├── order
│ │ │ │ ├── select
│ │ │ │ ├── where_
│ │ │ │ ├── where_index
│ │ │ │ └── where_primary_key
│ │ │ ├── schema/
│ │ │ │ ├── create_table
│ │ │ │ ├── create_table_datatypes
│ │ │ │ ├── create_table_default
│ │ │ │ ├── create_table_index
│ │ │ │ ├── create_table_names
│ │ │ │ ├── create_table_null
│ │ │ │ ├── create_table_primary_key
│ │ │ │ ├── create_table_reference
│ │ │ │ ├── create_table_transaction
│ │ │ │ ├── create_table_unique
│ │ │ │ ├── drop_table
│ │ │ │ ├── drop_table_index
│ │ │ │ ├── drop_table_ref
│ │ │ │ └── drop_table_transaction
│ │ │ ├── transactions/
│ │ │ │ ├── anomaly_dirty_read
│ │ │ │ ├── anomaly_dirty_write
│ │ │ │ ├── anomaly_fuzzy_read
│ │ │ │ ├── anomaly_lost_update
│ │ │ │ ├── anomaly_phantom_read
│ │ │ │ ├── anomaly_read_skew
│ │ │ │ ├── anomaly_write_skew
│ │ │ │ ├── begin
│ │ │ │ ├── commit
│ │ │ │ ├── isolation
│ │ │ │ ├── rollback
│ │ │ │ └── schema
│ │ │ └── writes/
│ │ │ ├── delete
│ │ │ ├── delete_index
│ │ │ ├── delete_reference
│ │ │ ├── delete_where
│ │ │ ├── insert
│ │ │ ├── insert_datatypes
│ │ │ ├── insert_default
│ │ │ ├── insert_index
│ │ │ ├── insert_null
│ │ │ ├── insert_primary_key
│ │ │ ├── insert_reference
│ │ │ ├── insert_unique
│ │ │ ├── update
│ │ │ ├── update_datatypes
│ │ │ ├── update_default
│ │ │ ├── update_expression
│ │ │ ├── update_index
│ │ │ ├── update_null
│ │ │ ├── update_primary_key
│ │ │ ├── update_reference
│ │ │ ├── update_unique
│ │ │ └── update_where
│ │ └── types/
│ │ ├── expression.rs
│ │ ├── mod.rs
│ │ ├── schema.rs
│ │ └── value.rs
│ └── storage/
│ ├── bitcask.rs
│ ├── engine.rs
│ ├── memory.rs
│ ├── mod.rs
│ ├── mvcc.rs
│ └── testscripts/
│ ├── bitcask/
│ │ ├── compact
│ │ ├── compact_open
│ │ ├── log
│ │ └── status
│ ├── engine/
│ │ ├── keys
│ │ ├── point
│ │ ├── scan
│ │ └── scan_prefix
│ ├── memory/
│ │ └── status
│ └── mvcc/
│ ├── anomaly_dirty_read
│ ├── anomaly_dirty_write
│ ├── anomaly_fuzzy_read
│ ├── anomaly_lost_update
│ ├── anomaly_phantom_read
│ ├── anomaly_read_skew
│ ├── anomaly_write_skew
│ ├── bank
│ ├── begin
│ ├── begin_as_of
│ ├── begin_readonly
│ ├── delete
│ ├── delete_conflict
│ ├── get
│ ├── get_isolation
│ ├── resume
│ ├── rollback
│ ├── scan
│ ├── scan_isolation
│ ├── scan_key_version_encoding
│ ├── scan_prefix
│ ├── set
│ ├── set_conflict
│ └── unversioned
└── tests/
├── scripts/
│ ├── anomalies
│ ├── client
│ ├── errors
│ ├── isolation
│ └── queries
├── testcluster.rs
└── tests.rs
================================================
FILE CONTENTS
================================================
================================================
FILE: .github/workflows/ci.yml
================================================
name: CI
on: [push, pull_request, workflow_dispatch]
permissions:
contents: read
jobs:
test:
name: Test
runs-on: ubuntu-latest
timeout-minutes: 10
steps:
- uses: actions/checkout@v3
- uses: dtolnay/rust-toolchain@1.93.1
id: toolchain
with:
components: clippy, rustfmt
- uses: actions/cache@v3
with:
path: target
key: ${{runner.os}}-target-${{steps.toolchain.outputs.cachekey}}-${{hashFiles('Cargo.lock')}}
- run: cargo build --bins --tests
- run: cargo test
- run: cargo clippy --tests --no-deps -- -D warnings
- run: cargo fmt --check
- run: cargo doc --no-deps
env:
RUSTDOCFLAGS: -D warnings
================================================
FILE: .gitignore
================================================
/cluster/toydb*/data
/data
/docs/crate/target
/target
.DS_Store
.vscode/
**/*.rs.bk
================================================
FILE: Cargo.toml
================================================
[package]
name = "toydb"
version = "1.0.0"
description = "A simple distributed SQL database, built for education"
authors = ["Erik Grinaker <erik@grinaker.org>"]
license = "Apache-2.0"
homepage = "https://github.com/erikgrinaker/toydb"
repository = "https://github.com/erikgrinaker/toydb"
edition = "2024"
default-run = "toydb"
publish = false
[lib]
doctest = false
[dependencies]
bincode = { version = "2.0", features = ["serde"] }
clap = { version = "4.5", features = ["cargo", "derive"] }
config = "0.15"
crossbeam = { version = "0.8", features = ["crossbeam-channel"] }
dyn-clone = "1.0"
fs4 = "0.13"
hdrhistogram = "7.5"
itertools = "0.14"
log = "0.4"
petname = "2.0.2"
rand = "0.10"
regex = "1.12"
rustyline = "17.0"
rustyline-derive = "0.11"
serde = { version = "1.0", features = ["derive"] }
serde_bytes = "0.11"
simplelog = "0.12"
uuid = { version = "1.21", features = ["serde", "v4"] }
[dev-dependencies]
escargot = "0.5"
goldenscript = "0.7"
hex = "0.4"
paste = "1.0"
serde_json = "1.0"
tempfile = "3.25"
test-case = "3.3"
test_each_file = "0.3"
================================================
FILE: LICENSE
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: README.md
================================================
# <a><img src="./docs/architecture/images/toydb.svg" height="40" valign="top" /></a> toyDB
Distributed SQL database in Rust, built from scratch as an educational project. Main features:
* [Raft distributed consensus][raft] for linearizable state machine replication.
* [ACID transactions][txn] with MVCC-based snapshot isolation.
* [Pluggable storage engine][storage] with [BitCask][bitcask] and [in-memory][memory] backends.
* [Iterator-based query engine][query] with [heuristic optimization][optimizer] and time-travel
support.
* [SQL interface][sql] including joins, aggregates, and transactions.
toyDB is intended to be simple and understandable, and also functional and correct. Other aspects
like performance, scalability, and availability are non-goals -- these are major sources of
complexity in production-grade databases, and obscure the basic underlying concepts. Shortcuts have
been taken where possible.
I originally wrote toyDB in 2020 to learn more about database internals. Since then, I've spent
several years building real distributed SQL databases at
[CockroachDB](https://github.com/cockroachdb/cockroach) and
[Neon](https://github.com/neondatabase/neon). Based on this experience, I've rewritten toyDB as a
simple illustration of the architecture and concepts behind distributed SQL databases.
[raft]: https://github.com/erikgrinaker/toydb/blob/main/src/raft/mod.rs
[txn]: https://github.com/erikgrinaker/toydb/blob/main/src/storage/mvcc.rs
[storage]: https://github.com/erikgrinaker/toydb/blob/main/src/storage/engine.rs
[bitcask]: https://github.com/erikgrinaker/toydb/blob/main/src/storage/bitcask.rs
[memory]: https://github.com/erikgrinaker/toydb/blob/main/src/storage/memory.rs
[query]: https://github.com/erikgrinaker/toydb/blob/main/src/sql/execution/executor.rs
[optimizer]: https://github.com/erikgrinaker/toydb/blob/main/src/sql/planner/optimizer.rs
[sql]: https://github.com/erikgrinaker/toydb/blob/main/src/sql/parser/parser.rs
## Documentation
* [Architecture guide](docs/architecture/index.md): a guided tour of toyDB's code and architecture.
* [SQL examples](docs/examples.md): walkthrough of toyDB's SQL features.
* [SQL reference](docs/sql.md): reference documentation for toyDB's SQL dialect.
* [References](docs/references.md): research materials used while building toyDB.
## Usage
With a [Rust compiler](https://www.rust-lang.org/tools/install) installed, a local five-node
cluster can be built and started as:
```
$ ./cluster/run.sh
Starting 5 nodes on ports 9601-9605 with data under cluster/*/data/.
To connect to node 1, run: cargo run --release --bin toysql
toydb4 21:03:55 [INFO] Listening on [::1]:9604 (SQL) and [::1]:9704 (Raft)
toydb1 21:03:55 [INFO] Listening on [::1]:9601 (SQL) and [::1]:9701 (Raft)
toydb2 21:03:55 [INFO] Listening on [::1]:9602 (SQL) and [::1]:9702 (Raft)
toydb3 21:03:55 [INFO] Listening on [::1]:9603 (SQL) and [::1]:9703 (Raft)
toydb5 21:03:55 [INFO] Listening on [::1]:9605 (SQL) and [::1]:9705 (Raft)
toydb2 21:03:56 [INFO] Starting new election for term 1
[...]
toydb2 21:03:56 [INFO] Won election for term 1, becoming leader
```
A command-line client can be built and used with node 1 on `localhost:9601`:
```
$ cargo run --release --bin toysql
Connected to toyDB node n1. Enter !help for instructions.
toydb> CREATE TABLE movies (id INTEGER PRIMARY KEY, title VARCHAR NOT NULL);
toydb> INSERT INTO movies VALUES (1, 'Sicario'), (2, 'Stalker'), (3, 'Her');
toydb> SELECT * FROM movies;
1, 'Sicario'
2, 'Stalker'
3, 'Her'
```
toyDB supports most common SQL features, including joins, aggregates, and transactions. Below is an
`EXPLAIN` query plan of a more complex query (fetches all movies from studios that have released any
movie with an IMDb rating of 8 or more):
```
toydb> EXPLAIN SELECT m.title, g.name AS genre, s.name AS studio, m.rating
FROM movies m JOIN genres g ON m.genre_id = g.id,
studios s JOIN movies good ON good.studio_id = s.id AND good.rating >= 8
WHERE m.studio_id = s.id
GROUP BY m.title, g.name, s.name, m.rating, m.released
ORDER BY m.rating DESC, m.released ASC, m.title ASC;
Remap: m.title, genre, studio, m.rating (dropped: m.released)
└─ Order: m.rating desc, m.released asc, m.title asc
└─ Projection: m.title, g.name as genre, s.name as studio, m.rating, m.released
└─ Aggregate: m.title, g.name, s.name, m.rating, m.released
└─ HashJoin: inner on m.studio_id = s.id
├─ HashJoin: inner on m.genre_id = g.id
│ ├─ Scan: movies as m
│ └─ Scan: genres as g
└─ HashJoin: inner on s.id = good.studio_id
├─ Scan: studios as s
└─ Scan: movies as good (good.rating > 8 OR good.rating = 8)
```
## Architecture
toyDB's architecture is fairly typical for a distributed SQL database: a transactional
key/value store managed by a Raft cluster with a SQL query engine on top. See the
[architecture guide](./docs/architecture/index.md) for more details.
[Read the architecture guide](./docs/architecture/index.md)
## Tests
toyDB mainly uses [Goldenscripts](https://github.com/erikgrinaker/goldenscript) for tests. These
script various scenarios, capture events and output, and later assert that the behavior remains the
same. See e.g.:
* [Raft cluster tests](https://github.com/erikgrinaker/toydb/tree/main/src/raft/testscripts/node)
* [MVCC transaction tests](https://github.com/erikgrinaker/toydb/tree/main/src/storage/testscripts/mvcc)
* [SQL execution tests](https://github.com/erikgrinaker/toydb/tree/main/src/sql/testscripts)
* [End-to-end tests](https://github.com/erikgrinaker/toydb/tree/main/tests/scripts)
Run tests with `cargo test`, or have a look at the latest
[CI run](https://github.com/erikgrinaker/toydb/actions/workflows/ci.yml).
## Benchmarks
toyDB is not optimized for performance, but comes with a `workload` benchmark tool that can run
various workloads against a toyDB cluster. For example:
```sh
# Start a 5-node toyDB cluster.
$ ./cluster/run.sh
[...]
# Run a read-only benchmark via all 5 nodes.
$ cargo run --release --bin workload read
Preparing initial dataset... done (0.179s)
Spawning 16 workers... done (0.006s)
Running workload read (rows=1000 size=64 batch=1)...
Time Progress Txns Rate p50 p90 p99 pMax
1.0s 13.1% 13085 13020/s 1.3ms 1.5ms 1.9ms 8.4ms
2.0s 27.2% 27183 13524/s 1.3ms 1.5ms 1.8ms 8.4ms
3.0s 41.3% 41301 13702/s 1.2ms 1.5ms 1.8ms 8.4ms
4.0s 55.3% 55340 13769/s 1.2ms 1.5ms 1.8ms 8.4ms
5.0s 70.0% 70015 13936/s 1.2ms 1.5ms 1.8ms 8.4ms
6.0s 84.7% 84663 14047/s 1.2ms 1.4ms 1.8ms 8.4ms
7.0s 99.6% 99571 14166/s 1.2ms 1.4ms 1.7ms 8.4ms
7.1s 100.0% 100000 14163/s 1.2ms 1.4ms 1.7ms 8.4ms
Verifying dataset... done (0.002s)
```
The available workloads are:
* `read`: single-row primary key lookups.
* `write`: single-row inserts to sequential primary keys.
* `bank`: bank transfers between various customers and accounts. To make things interesting, this
includes joins, secondary indexes, sorting, and conflicts.
For more information about workloads and parameters, run `cargo run --bin workload -- --help`.
Example workload results are listed below. Write performance is atrocious, due to
[fsync](https://en.wikipedia.org/wiki/Sync_(Unix)) and a lack of write batching in the Raft layer.
Disabling fsync, or using the in-memory engine, significantly improves write performance (at the
expense of durability).
| Workload | BitCask | BitCask w/o fsync | Memory |
|----------|-------------|-------------------|-------------|
| `read` | 14163 txn/s | 13941 txn/s | 13949 txn/s |
| `write` | 35 txn/s | 4719 txn/s | 7781 txn/s |
| `bank` | 21 txn/s | 1120 txn/s | 1346 txn/s |
## Debugging
[VSCode](https://code.visualstudio.com) and the [CodeLLDB](https://marketplace.visualstudio.com/items?itemName=vadimcn.vscode-lldb)
extension can be used to debug toyDB, with the debug configuration under `.vscode/launch.json`.
This file is not checked in (`.vscode/` is listed in `.gitignore`), so it must be created locally
first, e.g. by letting CodeLLDB generate it from `Cargo.toml`. Under the "Run and Debug" tab, select
e.g. "Debug executable 'toydb'" or "Debug unit tests in library 'toydb'".
## Credits
The toyDB logo is courtesy of [@jonasmerlin](https://github.com/jonasmerlin).
================================================
FILE: cluster/run.sh
================================================
#!/usr/bin/env bash
#
# This script builds and runs a 5-node toyDB cluster listening on ports
# 9601-9605. Config and data are stored under the toydb* directories.
# To connect a toysql client to node 1 on port 9601, run:
#
# cargo run --release --bin toysql

# Abort on command failures, unset variables, and failures inside pipelines.
set -euo pipefail

# Change into the script directory. $0 is quoted inside the command
# substitution so that a checkout path containing whitespace or glob characters
# is passed to dirname as a single argument instead of being word-split.
cd "$(dirname "$0")"

# Build toyDB using release optimizations.
cargo build --release --bin toydb

# Start nodes 1-5 in the background, prefixing their output with the node ID.
echo "Starting 5 nodes on ports 9601-9605 with data under cluster/*/data/."
echo "To connect to node 1, run: cargo run --release --bin toysql"
echo ""

for ID in 1 2 3 4 5; do
    # Each node runs in its own subshell; stderr is merged into stdout so that
    # sed can prefix every output line with the node name.
    (cargo run -q --release -- -c "toydb$ID/toydb.yaml" 2>&1 | sed -e "s/\\(.*\\)/toydb$ID \\1/g") &
done

# Wait for the background processes to exit. Kill all toyDB processes when the
# script exits (e.g. via Ctrl-C). The negative PID sends SIGTERM to the script's
# process group; errors from kill are discarded.
trap 'kill -TERM -- -$$ 2>/dev/null' INT TERM EXIT
wait
================================================
FILE: cluster/toydb1/toydb.yaml
================================================
id: 1
data_dir: toydb1/data
listen_sql: localhost:9601
listen_raft: localhost:9701
peers:
'2': localhost:9702
'3': localhost:9703
'4': localhost:9704
'5': localhost:9705
================================================
FILE: cluster/toydb2/toydb.yaml
================================================
id: 2
data_dir: toydb2/data
listen_sql: localhost:9602
listen_raft: localhost:9702
peers:
'1': localhost:9701
'3': localhost:9703
'4': localhost:9704
'5': localhost:9705
================================================
FILE: cluster/toydb3/toydb.yaml
================================================
id: 3
data_dir: toydb3/data
listen_sql: localhost:9603
listen_raft: localhost:9703
peers:
'1': localhost:9701
'2': localhost:9702
'4': localhost:9704
'5': localhost:9705
================================================
FILE: cluster/toydb4/toydb.yaml
================================================
id: 4
data_dir: toydb4/data
listen_sql: localhost:9604
listen_raft: localhost:9704
peers:
'1': localhost:9701
'2': localhost:9702
'3': localhost:9703
'5': localhost:9705
================================================
FILE: cluster/toydb5/toydb.yaml
================================================
id: 5
data_dir: toydb5/data
listen_sql: localhost:9605
listen_raft: localhost:9705
peers:
'1': localhost:9701
'2': localhost:9702
'3': localhost:9703
'4': localhost:9704
================================================
FILE: config/toydb.yaml
================================================
# The node ID (must be unique in the cluster), and map of peer IDs and Raft
# addresses (empty for single node).
id: 1
peers: {}
# Addresses to listen for SQL and Raft connections on.
listen_sql: localhost:9601
listen_raft: localhost:9701
# The log level. Valid values are DEBUG, INFO, WARN, and ERROR.
log_level: INFO
# Node data directory. The Raft log is stored in the file "raft", and the SQL
# database in "sql".
data_dir: data
# Storage engine to use for the Raft log and SQL database.
#
# * bitcask (default): an append-only log-structured store.
# * memory: an in-memory store using the Rust standard library's BTreeMap.
storage_raft: bitcask
storage_sql: bitcask
# Whether to fsync writes to disk. Disabling this yields much better write
# performance, but may lose data on host crashes and violate Raft guarantees. It
# only affects Raft log writes (the SQL state machine is never fsynced since it
# can be reconstructed from the Raft log).
fsync: true
# The minimum garbage fraction and bytes to trigger Bitcask log compaction on
# node startup.
compact_threshold: 0.2
compact_min_bytes: 1000000
================================================
FILE: docs/architecture/README.md
================================================
See [`index.md`](index.md).
================================================
FILE: docs/architecture/client.md
================================================
# Client
The toyDB client is in the [`client`](https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/client.rs)
module. It uses the same Bincode-based protocol that we saw in the server section, sending
`toydb::Request` and receiving `toydb::Response`.
## Client Library
The main client library `toydb::Client` is used to communicate with a toyDB server:
https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/client.rs#L15-L24
When initialized, it connects to a toyDB server over TCP, which establishes a SQL session for it:
https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/client.rs#L27-L33
It can then send Bincode-encoded `toydb::Request` to the server, and receive `toydb::Response`
back.
https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/client.rs#L35-L40
In particular, `Client::execute` can be used to execute arbitrary SQL statements in the client's
current session:
https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/client.rs#L42-L56
## `toysql` Binary
However, `toydb::Client` is a programmatic API, and we want a more convenient user interface.
The `toysql` client in [`src/bin/toysql.rs`](https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/bin/toysql.rs)
provides a typical [REPL](https://en.wikipedia.org/wiki/Read–eval–print_loop) (read-evaluate-print loop) where users can enter SQL statements and view the results.
Like `toydb`, `toysql` is a tiny [`clap`](https://docs.rs/clap/latest/clap/) command that takes a
toyDB server address to connect to and starts an interactive shell:
https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/bin/toysql.rs#L29-L53
It first attempts to connect to the toyDB server using the `toydb::Client` client, and then starts
an interactive shell using the [Rustyline](https://docs.rs/rustyline/latest/rustyline/) library.
https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/bin/toysql.rs#L55-L81
The shell is simply a loop that prompts the user to input a SQL statement:
https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/bin/toysql.rs#L216-L250
Each statement is then executed against the server via `toydb::Client::execute`, and the response
is formatted and printed as output:
https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/bin/toysql.rs#L83-L92
https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/bin/toysql.rs#L175-L204
And with that, we have a fully functional SQL database system and can run queries to our heart's
content. Have fun!
---
<p align="center">
← <a href="server.md">Server</a>
</p>
================================================
FILE: docs/architecture/encoding.md
================================================
# Key/Value Encoding
The key/value store uses binary `Vec<u8>` keys and values, so we need an encoding scheme to
translate between in-memory Rust data structures and the on-disk binary data. This is provided by
the [`encoding`](https://github.com/erikgrinaker/toydb/tree/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/encoding)
module, with separate schemes for key and value encoding.
## `Bincode` Value Encoding
Values are encoded using [Bincode](https://github.com/bincode-org/bincode), a third-party binary
encoding scheme for Rust. Bincode is convenient because it can easily encode any arbitrary Rust
data type. But we could also have chosen e.g. [JSON](https://en.wikipedia.org/wiki/JSON),
[Protobuf](https://protobuf.dev), [MessagePack](https://msgpack.org/), or any other encoding.
We won't dwell on the actual binary format here, see the [Bincode specification](https://git.sr.ht/~stygianentity/bincode/tree/trunk/item/docs/spec.md)
for details.
To use a consistent configuration for all encoding and decoding, we provide helper functions in
the [`encoding::bincode`](https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/encoding/bincode.rs)
module which use `bincode::config::standard()`.
https://github.com/erikgrinaker/toydb/blob/0ce1fb34349fda043cb9905135f103bceb4395b4/src/encoding/bincode.rs#L15-L27
Bincode uses the very common [Serde](https://serde.rs) framework for its API. toyDB also provides an
`encoding::Value` helper trait for value types which adds automatic `encode()` and `decode()`
methods:
https://github.com/erikgrinaker/toydb/blob/b57ae6502e93ea06df00d94946a7304b7d60b977/src/encoding/mod.rs#L39-L68
Here's an example of how this can be used to encode and decode an arbitrary `Dog` data type:
```rust
#[derive(serde::Serialize, serde::Deserialize)]
struct Dog {
name: String,
age: u8,
good_boy: bool,
}
impl encoding::Value for Dog {}
let pluto = Dog { name: "Pluto".into(), age: 4, good_boy: true };
let bytes = pluto.encode();
println!("{bytes:02x?}");
// Outputs [05, 50, 6c, 75, 74, 6f, 04, 01]:
//
// * Length of string "Pluto": 05.
// * String "Pluto": 50 6c 75 74 6f.
// * Age 4: 04.
// * Good boy: 01 (true).
let pluto = Dog::decode(&bytes)?; // gives us back Pluto
```
## `Keycode` Key Encoding
Unlike values, keys can't just use any binary encoding like Bincode. As mentioned in the storage
section, the storage engine sorts data by key to enable range scans. The key encoding must therefore
preserve the [lexicographical order](https://en.wikipedia.org/wiki/Lexicographic_order) of the
encoded values: the binary byte slices must sort in the same order as the original values.
As an example of why we can't just use Bincode, consider the strings "house" and "key". These should
be sorted in alphabetical order: "house" before "key". However, Bincode encodes strings prefixed by
their length, so "key" would be sorted before "house" in binary form:
```
03 6b 65 79 ← 3 bytes: key
05 68 6f 75 73 65 ← 5 bytes: house
```
For similar reasons, we can't just encode numbers in their native binary form: the
[little-endian](https://en.wikipedia.org/wiki/Endianness) representation will order very large
numbers before small numbers, and the [sign bit](https://en.wikipedia.org/wiki/Sign_bit) will order
positive numbers before negative numbers. This would violate the natural ordering of numbers.
We also have to be careful with value sequences, which should be ordered element-wise. For example,
the pair ("a", "xyz") should be ordered before ("ab", "cd"), so we can't just encode the strings
one after the other like "axyz" and "abcd" since that would sort ("ab", "cd") first.
toyDB provides an order-preserving encoding called "Keycode" in the [`encoding::keycode`](https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/encoding/keycode.rs)
module. Like Bincode, the Keycode encoding is not self-describing: the binary data does not say what
the data type is, the caller must provide a type to decode into. It only supports a handful of
primitive data types, and only needs to order values of the same type.
Keycode is implemented as a [Serde](https://serde.rs) (de)serializer, which requires a lot of
boilerplate code to satisfy the trait, but we'll just focus on the actual encoding. The encoding
scheme is as follows:
* `bool`: `00` for `false` and `01` for `true`.
https://github.com/erikgrinaker/toydb/blob/2027641004989355c2162bbd9eeefcc991d6b29b/src/encoding/keycode.rs#L113-L117
* `u64`: the [big-endian](https://en.wikipedia.org/wiki/Endianness) binary encoding.
https://github.com/erikgrinaker/toydb/blob/2027641004989355c2162bbd9eeefcc991d6b29b/src/encoding/keycode.rs#L157-L161
* `i64`: the [big-endian](https://en.wikipedia.org/wiki/Endianness) binary encoding, but with the
sign bit flipped to order negative numbers before positive ones.
https://github.com/erikgrinaker/toydb/blob/2027641004989355c2162bbd9eeefcc991d6b29b/src/encoding/keycode.rs#L131-L143
* `f64`: the [big-endian IEEE 754](https://en.wikipedia.org/wiki/Double-precision_floating-point_format)
binary encoding, but with the sign bit flipped, and all bits flipped for negative numbers, to
order negative numbers correctly.
https://github.com/erikgrinaker/toydb/blob/2027641004989355c2162bbd9eeefcc991d6b29b/src/encoding/keycode.rs#L167-L179
* `Vec<u8>`: terminated by `00 00`, with `00` escaped as `00 ff` to disambiguate it.
https://github.com/erikgrinaker/toydb/blob/2027641004989355c2162bbd9eeefcc991d6b29b/src/encoding/keycode.rs#L190-L205
* `String`: like `Vec<u8>`.
https://github.com/erikgrinaker/toydb/blob/2027641004989355c2162bbd9eeefcc991d6b29b/src/encoding/keycode.rs#L185-L188
* `Vec<T>`, `[T]`, `(T,)`: the concatenation of the inner values.
https://github.com/erikgrinaker/toydb/blob/2027641004989355c2162bbd9eeefcc991d6b29b/src/encoding/keycode.rs#L295-L307
* `enum`: the variant's numerical index as a `u8`, then the inner values (if any).
https://github.com/erikgrinaker/toydb/blob/2027641004989355c2162bbd9eeefcc991d6b29b/src/encoding/keycode.rs#L223-L227
Like `encoding::Value`, there is also an `encoding::Key` helper trait:
https://github.com/erikgrinaker/toydb/blob/b57ae6502e93ea06df00d94946a7304b7d60b977/src/encoding/mod.rs#L20-L37
Different kinds of keys are usually represented as enums. For example, if we wanted to store cars
and video games, we could use:
```rust
#[derive(serde::Serialize, serde::Deserialize)]
enum Key {
Car(String, String, u64), // make, model, year
Game(String, u64, Platform), // name, year, platform
}
#[derive(serde::Serialize, serde::Deserialize)]
enum Platform {
PC,
PS5,
Switch,
Xbox,
}
impl encoding::Key for Key {}
let returnal = Key::Game("Returnal".into(), 2021, Platform::PS5);
let bytes = returnal.encode();
println!("{bytes:02x?}");
// Outputs [01, 52, 65, 74, 75, 72, 6e, 61, 6c, 00, 00, 00, 00, 00, 00, 00, 00, 07, e5, 01].
//
// * Key::Game: 01
// * Returnal: 52 65 74 75 72 6e 61 6c 00 00
// * 2021: 00 00 00 00 00 00 07 e5
// * Platform::PS5: 01
let returnal = Key::decode(&bytes)?;
```
Because the keys are sorted in element-wise order, this would allow us to e.g. perform a prefix
scan to fetch all platforms which Returnal (2021) was released on, or perform a range scan to fetch
all models of Nissan Altima released between 2010 and 2015.
---
<p align="center">
← <a href="storage.md">Storage Engine</a> | <a href="mvcc.md">MVCC Transactions</a> →
</p>
================================================
FILE: docs/architecture/index.md
================================================
# toyDB Architecture
toyDB is a simple distributed SQL database, intended to illustrate how such systems are built. The
overall structure is similar to real-world distributed databases, but the design and implementation
has been kept as simple as possible for understandability. Performance and scalability are explicit
non-goals, as these are major sources of complexity in real-world systems.
This guide will walk through toyDB's architecture and code from the bottom up, with plenty of links
to the actual source code.
> ℹ️ View on GitHub with a desktop browser for inline code listings.
* [Overview](overview.md)
* [Properties](overview.md#properties)
* [Components](overview.md#components)
* [Storage Engine](storage.md)
* [`Memory` Storage Engine](storage.md#memory-storage-engine)
* [`BitCask` Storage Engine](storage.md#bitcask-storage-engine)
* [Key/Value Encoding](encoding.md)
* [`Bincode` Value Encoding](encoding.md#bincode-value-encoding)
* [`Keycode` Key Encoding](encoding.md#keycode-key-encoding)
* [MVCC Transactions](mvcc.md)
* [Raft Consensus](raft.md)
* [Log Storage](raft.md#log-storage)
* [State Machine Interface](raft.md#state-machine-interface)
* [Node Roles](raft.md#node-roles)
* [Node Interface and Communication](raft.md#node-interface-and-communication)
* [Leader Election and Terms](raft.md#leader-election-and-terms)
* [Client Requests and Forwarding](raft.md#client-requests-and-forwarding)
* [Write Replication and Application](raft.md#write-replication-and-application)
* [Read Processing](raft.md#read-processing)
* [SQL Engine](sql.md)
* [Data Model](sql-data.md)
* [Data Types](sql-data.md#data-types)
* [Schemas](sql-data.md#schemas)
* [Expressions](sql-data.md#expressions)
* [Storage](sql-storage.md)
* [Key/Value Representation](sql-storage.md#keyvalue-representation)
* [Schema Catalog](sql-storage.md#schema-catalog)
* [Row Storage and Transactions](sql-storage.md#row-storage-and-transactions)
* [Raft Replication](sql-raft.md)
* [Parsing](sql-parser.md)
* [Lexer](sql-parser.md#lexer)
* [Abstract Syntax Tree](sql-parser.md#abstract-syntax-tree)
* [Parser](sql-parser.md#parser)
* [Planning](sql-planner.md)
* [Execution Plan](sql-planner.md#execution-plan)
* [Scope and Name Resolution](sql-planner.md#scope-and-name-resolution)
* [Planner](sql-planner.md#planner)
* [Optimization](sql-optimizer.md)
* [Constant Folding](sql-optimizer.md#constant-folding)
* [Filter Pushdown](sql-optimizer.md#filter-pushdown)
* [Index Lookups](sql-optimizer.md#index-lookups)
* [Hash Join](sql-optimizer.md#hash-join)
* [Short Circuiting](sql-optimizer.md#short-circuiting)
* [Execution](sql-execution.md)
* [Plan Executor](sql-execution.md#plan-executor)
* [Session Management](sql-execution.md#session-management)
* [Server](server.md)
* [Raft Routing](server.md#raft-routing)
* [SQL Service](server.md#sql-service)
* [`toydb` Binary](server.md#toydb-binary)
* [Client](client.md)
* [Client Library](client.md#client-library)
* [`toysql` Binary](client.md#toysql-binary)
---
<p align="center">
<a href="overview.md">Overview</a> →
</p>
================================================
FILE: docs/architecture/mvcc.md
================================================
# MVCC Transactions
Transactions are groups of reads and writes (e.g. to different keys) that are submitted together as
a single unit. For example, a bank transaction that transfers $100 from account A to account B might
consist of this group of reads and writes:
```
a = get(A)
b = get(B)
if a < 100:
error("insufficient balance")
set(A, a - 100)
set(B, b + 100)
```
toyDB provides [ACID](https://en.wikipedia.org/wiki/ACID) transactions, a set of very strong
guarantees:
* **Atomicity:** all of the writes take effect as a single, atomic unit, at the same instant, when
they are _committed_. Other users will never see some of the writes without the others.
* **Consistency:** database constraints are never violated (e.g. referential integrity or uniqueness
constraints). We'll see how this is implemented later in the SQL execution layer.
* **Isolation:** users should appear to have the entire database to themselves, unaffected by other
simultaneous users. Two transactions may conflict, in which case one has to retry, but if a
transaction succeeds then the user knows with certainty that the operations were executed without
interference by anyone else. This eliminates the risk of [race conditions](https://en.wikipedia.org/wiki/Race_condition).
* **Durability:** committed writes are never lost (even if the system crashes).
To illustrate how transactions work, here's an example MVCC test script where two concurrent users
modify a set of bank accounts (there are many [other test scripts](https://github.com/erikgrinaker/toydb/tree/aa14deb71f650249ce1cab8828ed7bcae2c9206e/src/storage/testscripts/mvcc)
there too):
https://github.com/erikgrinaker/toydb/blob/a73e24b7e77671b9f466e0146323cd69c3e27bdf/src/storage/testscripts/mvcc/bank#L1-L69
To provide these guarantees, toyDB uses a common technique called
[Multi-Version Concurrency Control](https://en.wikipedia.org/wiki/Multiversion_concurrency_control)
(MVCC). It is implemented at the key/value storage level, in the [`storage::mvcc`](https://github.com/erikgrinaker/toydb/blob/8f8eae0dcf70b1a0df2e853b1f6600e0c7075340/src/storage/mvcc.rs)
module. It uses a `storage::Engine` for actual data storage.
https://github.com/erikgrinaker/toydb/blob/8f8eae0dcf70b1a0df2e853b1f6600e0c7075340/src/storage/mvcc.rs#L220-L231
MVCC provides an [isolation level](https://en.wikipedia.org/wiki/Isolation_(database_systems)#Isolation_levels)
called [snapshot isolation](https://en.wikipedia.org/wiki/Snapshot_isolation): a transaction sees a
snapshot of the database as it was when the transaction began. Any later changes are invisible to
it.
It does this by storing historical versions of key/value pairs. The version number is simply a
number that's incremented for every new transaction:
https://github.com/erikgrinaker/toydb/blob/8f8eae0dcf70b1a0df2e853b1f6600e0c7075340/src/storage/mvcc.rs#L155-L158
Each transaction has its own unique version number. When it writes a key/value pair it appends its
version number to the key as `Key::Version(&[u8], Version)` (using the Keycode encoding we've seen
previously). If an old version of the key already exists, it will have a different version number
suffix and therefore be stored as a separate key in the storage engine. Deleted keys are versions
with a special tombstone value.
https://github.com/erikgrinaker/toydb/blob/8f8eae0dcf70b1a0df2e853b1f6600e0c7075340/src/storage/mvcc.rs#L183-L189
Here's a simple diagram of what a history of versions 1 to 5 of keys `a` to `d` might look like:
https://github.com/erikgrinaker/toydb/blob/8f8eae0dcf70b1a0df2e853b1f6600e0c7075340/src/storage/mvcc.rs#L11-L26
Additionally, we need to keep track of the currently ongoing (uncommitted) transaction versions,
known as the "active set".
With versioning and the active set, we can summarize the MVCC protocol with a few simple rules:
1. When a new transaction begins, it:
* Obtains the next available version number.
* Takes a snapshot of the active set (other uncommitted transactions).
* Adds its version number to the active set.
2. When the transaction reads a key, it:
* Returns the latest version of the key at or below its own version.
* Ignores versions above its own version.
* Ignores versions in its active set snapshot.
3. When the transaction writes a key, it:
* Looks for a key version above its own version; errors if found.
* Looks for a key version in its active set snapshot; errors if found.
* Writes a key/value pair with its own version.
4. When the transaction commits, it:
* Flushes all writes to disk.
* Removes itself from the active set.
The magic happens when the transaction removes itself from the active set. This is a single, atomic
operation, and when it completes all of its writes immediately become visible to _new_ transactions.
However, ongoing transactions still won't see these writes, because the version is still in their
active set snapshot or at a later version (hence they are isolated from this transaction).
Furthermore, the transaction could see its own uncommitted writes even though no one else could, and
if any writes conflicted with another transaction it would error out and have to retry.
Not only that, this also allows us to do time-travel queries, where we can query the database as it
was at any time in the past: we simply pick a version number to read at.
There are a few more details that we've left out here: transaction rollbacks need to keep track of
the writes and undo them, and read-only queries can avoid allocating new version numbers. We also
don't garbage collect old versions, for simplicity. See the module documentation for more details:
https://github.com/erikgrinaker/toydb/blob/8f8eae0dcf70b1a0df2e853b1f6600e0c7075340/src/storage/mvcc.rs#L1-L140
Let's walk through a simple example with code pointers to get a feel for how this is implemented.
Notice how we don't have to deal with any version numbers when we're using the MVCC API -- this is
an internal MVCC implementation detail.
```rust
// Open a BitCask database in the file "toy.db" with MVCC support.
let path = PathBuf::from("toy.db");
let db = MVCC::new(BitCask::new(path)?);
// Begin a new transaction.
let txn = db.begin()?;
// Read the key "foo", and decode the binary value as a u64 with bincode.
let bytes = txn.get(b"foo")?.expect("foo not found");
let mut value: u64 = bincode::deserialize(&bytes)?;
// Delete "foo".
txn.delete(b"foo")?;
// Add 1 to the value, and write it back to the key "bar".
value += 1;
let bytes = bincode::serialize(&value)?;
txn.set(b"bar", bytes)?;
// Commit the transaction.
txn.commit()?;
```
First, we begin a new transaction with `MVCC::begin()`, which calls through to
`Transaction::begin()`. This obtains a version number stored in `Key::NextVersion` and increments
it, then takes a snapshot of the active set in `Key::ActiveSet` and adds itself to it:
https://github.com/erikgrinaker/toydb/blob/8f8eae0dcf70b1a0df2e853b1f6600e0c7075340/src/storage/mvcc.rs#L368-L391
This returns a `Transaction` object which provides the main key/value API, with get/set/delete
methods. It keeps track of the main state of the transaction: its version number and active set.
https://github.com/erikgrinaker/toydb/blob/8f8eae0dcf70b1a0df2e853b1f6600e0c7075340/src/storage/mvcc.rs#L294-L327
Next, we call `Transaction::get(b"foo")` to read the value of the key `foo`. This finds the latest
version that's visible to us (ignoring future versions and the active set). Recall that we store
multiple versions of each key as `Key::Version(key, version)`. The Keycode encoding ensures that all
versions are stored in sorted order, so we can do a reverse range scan from `Key::Version(b"foo",
self.version)` to `Key::Version(b"foo", 0)` and return the latest version that's visible to us:
https://github.com/erikgrinaker/toydb/blob/8f8eae0dcf70b1a0df2e853b1f6600e0c7075340/src/storage/mvcc.rs#L564-L581
We then call `Transaction::delete(b"foo")` and `Transaction::set(b"bar", value)`. Both of these just
call through to the same `Transaction::write_version()` method, but use `Some(value)` for a regular
key/value pair and `None` as a deletion tombstone:
https://github.com/erikgrinaker/toydb/blob/8f8eae0dcf70b1a0df2e853b1f6600e0c7075340/src/storage/mvcc.rs#L514-L522
To write a new version of a key, we first have to check for conflicts by seeing if there's a
version of the key that's invisible to us -- if there is, we conflicted with a concurrent transaction.
We use a range scan for this, like we did in `Transaction::get()`.
If there are no conflicts, we go on to write `Key::Version(b"foo", self.version)` and encode the
value as an `Option<value>` to accommodate the `None` tombstone marker. We also write a
`Key::TxnWrite(version, key)` to keep track of the keys we've written in case we have to roll back.
https://github.com/erikgrinaker/toydb/blob/8f8eae0dcf70b1a0df2e853b1f6600e0c7075340/src/storage/mvcc.rs#L524-L562
Finally, `Transaction::commit()` will make our transaction take effect and become visible. It does
this simply by removing itself from the active set in `Key::ActiveSet`, and also cleaning up its
`Key::TxnWrite` write tracking. As the comment says, we don't actually have to flush to durable
storage here, because the Raft log will provide durability for us -- we'll get back to this later.
https://github.com/erikgrinaker/toydb/blob/8f8eae0dcf70b1a0df2e853b1f6600e0c7075340/src/storage/mvcc.rs#L466-L485
---
<p align="center">
← <a href="encoding.md">Key/Value Encoding</a> | <a href="raft.md">Raft Consensus</a> →
</p>
================================================
FILE: docs/architecture/overview.md
================================================
# Overview
toyDB consists of a cluster of nodes that execute [SQL](https://en.wikipedia.org/wiki/SQL)
transactions against a replicated state machine. Clients can connect to any node in the cluster and
submit SQL statements. The cluster remains available if a minority of nodes crash or disconnect,
but halts if a majority of nodes fail.
## Properties
* **Distributed:** runs across a cluster of nodes.
* **Highly available:** tolerates failure of a minority of nodes.
* **SQL compliant:** correctly supports most common [SQL](https://en.wikipedia.org/wiki/SQL)
features.
* **Strongly consistent:** committed writes are immediately visible to all readers ([linearizability](https://en.wikipedia.org/wiki/Linearizability)).
* **Transactional:** provides [ACID](https://en.wikipedia.org/wiki/ACID) transactions
* **Atomic:** groups of writes are applied as a single, atomic unit.
* **Consistent:** database constraints and referential integrity are always enforced.
* **Isolated:** concurrent transactions don't affect each other ([snapshot isolation](https://en.wikipedia.org/wiki/Snapshot_isolation)).
* **Durable:** committed writes are never lost.
For simplicity, toyDB is:
* **Not scalable:** every node stores the full dataset, and reads/writes execute on one node.
* **Not reliable:** only handles crash failures, not e.g. partial network partitions or node stalls.
* **Not performant:** data processing is slow, and not optimized at all.
* **Not efficient:** loads entire tables into memory, no compression or garbage collection, etc.
* **Not full-featured:** only basic SQL functionality is implemented.
* **Not backwards compatible:** changes to data formats and protocols will break databases.
* **Not flexible:** nodes can't be added or removed while running, and take a long time to join.
* **Not secure:** there is no authentication, authorization, nor encryption.
## Components
Internally, toyDB is made up of a few main components:
* **Storage engine:** stores data on disk and manages transactions.
* **Raft consensus engine:** replicates data and coordinates cluster nodes.
* **SQL engine:** organizes SQL data, manages SQL sessions, and executes SQL statements.
* **Server:** manages network communication, both with SQL clients and Raft nodes.
* **Client:** provides a SQL user interface and communicates with the server.
This diagram illustrates the internal structure of a single toyDB node:

We will go through each of these components from the bottom up.
---
<p align="center">
← <a href="index.md">toyDB Architecture</a> | <a href="storage.md">Storage Engine</a> →
</p>
================================================
FILE: docs/architecture/raft.md
================================================
# Raft Consensus
[Raft](https://raft.github.io) is a distributed consensus protocol which replicates data across a
cluster of nodes in a consistent and durable manner. It is described in the very readable
[Raft paper](https://raft.github.io/raft.pdf), and in the more comprehensive
[Raft thesis](https://web.stanford.edu/~ouster/cgi-bin/papers/OngaroPhD.pdf).
The toyDB Raft implementation is in the [`raft`](https://github.com/erikgrinaker/toydb/tree/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/raft)
module, and is described in the module documentation:
https://github.com/erikgrinaker/toydb/blob/d96c6dd5ae7c0af55ee609760dcd958c289a44f2/src/raft/mod.rs#L1-L240
Raft is fundamentally the same protocol as [Paxos](https://lamport.azurewebsites.net/pubs/paxos-simple.pdf)
and [Viewstamped Replication](https://pmg.csail.mit.edu/papers/vr-revisited.pdf), but an
opinionated variant designed to be simple, understandable, and practical. It is widely used in the
industry: [CockroachDB](https://www.cockroachlabs.com), [TiDB](https://www.pingcap.com),
[etcd](https://etcd.io), [Consul](https://developer.hashicorp.com/consul), and many others.
Briefly, Raft elects a leader node which coordinates writes and replicates them to followers. Once a
majority (>50%) of nodes have acknowledged a write, it is considered durably committed. It is common
for the leader to also serve reads, since it always has the most recent data and is thus strongly
consistent.
A cluster must have a majority of nodes (known as a [quorum](https://en.wikipedia.org/wiki/Quorum_(distributed_computing)))
live and connected to remain available, otherwise it will not commit writes in order to guarantee
data consistency and durability. Since there can only be one majority in the cluster, this prevents
a [split brain](https://en.wikipedia.org/wiki/Split-brain_(computing)) scenario where two active
leaders can exist concurrently (e.g. during a [network partition](https://en.wikipedia.org/wiki/Network_partition))
and store conflicting values.
The Raft leader appends writes to an ordered command log, which is then replicated to followers.
Once a majority has replicated the log up to a given entry, that log prefix is committed and then
applied to a state machine. This ensures that all nodes will apply the same commands in the same
order and eventually reach the same state (assuming the commands are deterministic). Raft itself
doesn't care what the state machine and commands are, but in toyDB's case it's SQL tables and rows
stored in an MVCC key/value store.
This diagram from the Raft paper illustrates how a Raft node receives a command from a client (1),
adds it to its log and reaches consensus with other nodes (2), then applies it to its state machine
(3) before returning a result to the client (4):
<img src="./images/raft.svg" alt="Raft node" width="400" style="display: block; margin: 30px auto;">
You may notice that Raft is not very scalable, since all reads and writes go via the leader node,
and every node must store the entire dataset. Raft solves replication and availability, but not
scalability. Real-world systems typically provide horizontal scalability by splitting a large
dataset across many separate Raft clusters (i.e. sharding), but this is out of scope for toyDB.
For simplicity, toyDB implements the bare minimum of Raft, and omits optimizations described in
the paper such as state snapshots, log truncation, leader leases, and more. The implementation is
in the [`raft`](https://github.com/erikgrinaker/toydb/blob/d96c6dd5ae7c0af55ee609760dcd958c289a44f2/src/raft/mod.rs)
module, and we'll walk through the main components next.
There is a comprehensive set of Raft test scripts in [`src/raft/testscripts/node`](https://github.com/erikgrinaker/toydb/blob/386153f5c00cb1a88b1ac8489ae132674d96f68a/src/raft/testscripts/node),
which illustrate the protocol in a wide variety of scenarios.
## Log Storage
Raft replicates an ordered command log consisting of `raft::Entry`:
https://github.com/erikgrinaker/toydb/blob/90a6cae47ac20481ac4eb2f20eea50f02e6c2b33/src/raft/log.rs#L10-L28
`index` specifies the position in the log, and `command` contains the binary command to apply to the
state machine. The `term` identifies the leadership term in which the command was proposed: a new
term begins when a new leader election is held (we'll get back to this later).
Entries are appended to the log by the leader and replicated to followers. Once acknowledged by a
quorum, the log up to that index is committed and will never change. Entries that are not yet
committed may be replaced or removed if the leader changes.
The Raft log enforces the following invariants:
https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/log.rs#L80-L91
`raft::Log` implements a Raft log, and stores log entries in a `storage::Engine` key/value store:
https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/log.rs#L43-L116
It also stores some additional metadata that we'll need later: the current term, vote, and commit
index. These are stored as separate keys:
https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/log.rs#L30-L39
Individual entries are appended to the log via `Log::append`, typically when the leader wants to
replicate a new write:
https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/log.rs#L190-L203
Entries can also be appended in bulk via `Log::splice`, typically when entries are replicated to
followers. This also allows replacing existing uncommitted entries, e.g. after a leader change:
https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/log.rs#L269-L343
Committed entries are marked by `Log::commit`, making them immutable and eligible for state machine
application:
https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/log.rs#L205-L222
The log also has methods to read entries from the log, either individually as `Log::get` or by
iterating over a range with `Log::scan`:
https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/log.rs#L224-L267
## State Machine Interface
Raft doesn't know or care what the log commands are, nor what the state machine does with them. It
simply takes `raft::Entry` from the log and gives them to the state machine.
The Raft state machine is represented by the `raft::State` trait. Raft will ask about the last
applied entry via `State::get_applied_index`, and feed it newly committed entries via
`State::apply`. It also allows reads via `State::read`, but we'll get back to that later.
https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/state.rs#L4-L51
The state machine does not have to flush its state to durable storage after each transition; on node
crashes, the state machine is allowed to regress, and will be caught up by replaying the unapplied
log entries. It is also possible to implement a purely in-memory state machine (and in fact, toyDB
allows running the state machine with a `Memory` storage engine).
The state machine must take care to be deterministic: the same commands applied in the same order
must result in the same state across all nodes. This means that a command can't e.g. read the
current time or generate a random number -- these values must be included in the command. It also
means that non-deterministic errors, such as an IO error, must halt command application (in toyDB's
case, we just panic and crash the node).
In toyDB, the state machine is an MVCC key/value store that stores SQL tables and rows, as we'll
see in the SQL Raft replication section.
## Node Roles
In Raft, a node can have one out of three roles:
* **Leader:** replicates writes to followers and serves client requests.
* **Follower:** replicates writes from a leader.
* **Candidate:** campaigns for leadership.
The Raft paper summarizes these roles and transitions in the following diagram (we'll discuss
leader election in detail below):
<img src="./images/raft-states.svg" alt="Raft states" width="400" style="display: block; margin: 30px auto;">
In toyDB, a node is represented by the `raft::Node` enum, with variants for each state:
https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L47-L66
This wraps the `raft::RawNode<Role>` type which contains the inner node state. It is generic over
the role, and uses the [typestate pattern](http://cliffle.com/blog/rust-typestate/) to provide
methods and transitions depending on the node's current role. This enforces state transitions and
invariants at compile time via Rust's type system -- for example, only `RawNode<Candidate>` has an
`into_leader()` method, since only candidates can transition to leaders (when they win an election).
https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L156-L177
The `RawNode::role` field contains role-specific state as structs implementing the `Role` marker
trait:
https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L661-L680
https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L242-L255
https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L523-L531
We'll see what the various fields are used for in the following sections.
## Node Interface and Communication
The `raft::Node` enum has two main methods that drive the node: `tick()` and `step()`. These consume
the current node and return a new node, possibly with a different role.
`tick()` advances time by a logical tick. This is used to measure the passage of time, e.g. to
trigger election timeouts or periodic leader heartbeats. toyDB uses a tick interval of 100
milliseconds (see `raft::TICK_INTERVAL`), and will call `tick()` on the node at this rate.
https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L125-L132
`step()` processes an inbound message from a different node or client:
https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L107-L123
Outbound messages to other nodes are sent via the `RawNode::tx` channel:
https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L171-L172
Nodes are identified by a unique node ID, which is given at node startup:
https://github.com/erikgrinaker/toydb/blob/90a6cae47ac20481ac4eb2f20eea50f02e6c2b33/src/raft/node.rs#L17-L18
Messages are wrapped in a `raft::Envelope` specifying the sender and recipient:
https://github.com/erikgrinaker/toydb/blob/d96c6dd5ae7c0af55ee609760dcd958c289a44f2/src/raft/message.rs#L10-L21
The envelope contains a `raft::Message`, an enum which encodes the Raft message protocol. We won't
dwell on the specific message types here, but discuss them individually in the following sections.
Raft does not require reliable message delivery, so messages may be dropped or reordered at any
time, although toyDB's use of TCP provides stronger delivery guarantees.
https://github.com/erikgrinaker/toydb/blob/d96c6dd5ae7c0af55ee609760dcd958c289a44f2/src/raft/message.rs#L25-L152
This is an entirely synchronous and deterministic model -- the same sequence of calls on a given
node in a given initial state will always produce the same result. This is very convenient for
testing and understandability. We will see in the server section how toyDB drives the node on a
separate thread, provides a network transport for messages, and ticks it at regular intervals.
## Leader Election and Terms
In the steady state, Raft simply has a leader which replicates writes to followers. But to reach
this steady state, we must elect a leader, which is where much of the subtle complexity lies. See
the Raft paper for comprehensive details and safety arguments; we'll summarize it briefly below.
Raft divides time into _terms_. The term is a monotonically increasing number starting at 1. There
can only be one leader in a term (or none if an election fails), and the term can never regress.
Replicated commands belong to the specific term under which they were proposed.
https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L20-L21
Let's walk through an election, where we bootstrap a brand new, empty toyDB cluster with 3 nodes.
Nodes are initialized by calling `Node::new()`. Since this is a new cluster, they are given an empty
`raft::Log` and `raft::State`, at term 0. Nodes start with role `Follower`, but without a leader.
https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L68-L87
https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L266-L290
Now, nothing really happens for a while, as the nodes are waiting to maybe hear from an existing
leader (there is none). Every 100 ms we call `tick()`, until we reach `election_timeout`:
https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L489-L497
Notice how `new()` set `election_timeout` to a random value (in the range `ELECTION_TIMEOUT_RANGE`
of 10-20 ticks, i.e. 1-2 seconds). If all nodes had the same timeout, they would likely campaign for
leadership simultaneously, resulting in an election tie -- Raft uses randomized election timeouts to
avoid such ties.
Once a node reaches `election_timeout` it transitions to role `Candidate`:
https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L292-L312
When it becomes a candidate it campaigns for leadership by increasing its term to 1, voting for
itself, and sending `Message::Campaign` to all peers asking for their vote:
https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L647-L658
In Raft, the term can't regress, and a node can only cast a single vote in each term (even across
restarts), so both of these are persisted to disk via `Log::set_term_vote()`.
When the two other nodes (still in state `Follower`) receive the `Message::Campaign` asking for a
vote, they will first increase their term to 1 (since this is a newer term than their local term 0):
https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L347-L351
They then grant the vote since they haven't yet voted for anyone else in term 1. They persist the
vote to disk via `Log::set_term_vote()` and return a `Message::CampaignResponse { vote: true }` to
the candidate:
https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L424-L449
They also check that the candidate's log is at least as up-to-date as theirs, which is trivially true in
this case since the log is empty. This is necessary to ensure that a leader has all committed
entries (see section 5.4.1 in the Raft paper).
When the candidate receives the `Message::CampaignResponse` it records the vote from each node. Once
it has a quorum (in this case 2 out of 3 votes including its own vote) it becomes leader in term 1:
https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L599-L606
When it becomes leader, it sends a `Message::Heartbeat` to all peers to tell them it is now the
leader in term 1. It also appends an empty entry to its log and replicates it, but we will ignore
this for now (see section 5.4.2 in the Raft paper for why).
https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L563-L583
When the other nodes receive the heartbeat, they become followers of the new leader in its term:
https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L359-L384
From now on, the leader will send periodic `Message::Heartbeat` every 4 ticks (see
`HEARTBEAT_INTERVAL`) to assert its leadership:
https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L945-L953
The followers record when they last received any message from the leader (including heartbeats), and
will hold a new election if they haven't heard from the leader in an election timeout (e.g. due to a
leader crash or network partition):
https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L353-L356
https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L489-L497
This entire process is illustrated in the test script [`election`](https://github.com/erikgrinaker/toydb/blob/cb234a0b776484608118fd9382869ee5bc30d4f0/src/raft/testscripts/node/election),
along with several other test scripts that show e.g. [election ties](https://github.com/erikgrinaker/toydb/blob/cb234a0b776484608118fd9382869ee5bc30d4f0/src/raft/testscripts/node/election_tie),
[contested elections](https://github.com/erikgrinaker/toydb/blob/cb234a0b776484608118fd9382869ee5bc30d4f0/src/raft/testscripts/node/election_contested),
and other scenarios:
https://github.com/erikgrinaker/toydb/blob/cb234a0b776484608118fd9382869ee5bc30d4f0/src/raft/testscripts/node/election#L1-L72
## Client Requests and Forwarding
Once a leader has been elected, we can submit read and write requests to it. This is done by
stepping a `Message::ClientRequest` into the node using the local node ID, with a unique request ID
(toyDB uses UUIDv4), and waiting for an outbound response message with the same ID:
https://github.com/erikgrinaker/toydb/blob/d96c6dd5ae7c0af55ee609760dcd958c289a44f2/src/raft/message.rs#L134-L151
https://github.com/erikgrinaker/toydb/blob/d96c6dd5ae7c0af55ee609760dcd958c289a44f2/src/raft/message.rs#L164-L188
The requests and responses themselves are arbitrary binary data which is interpreted by the state
machine. For our purposes here, let's pretend the requests are:
* `Request::Write("key=value")` → `Response::Write("ok")`
* `Request::Read("key")` → `Response::Read("value")`
The fundamental difference between read and write requests is that write requests are replicated
through Raft and executed on all nodes, while read requests are only executed on the leader without
being appended to the log. It would be possible to execute reads on followers too, for load
balancing, but these reads would be eventually consistent and thus violate linearizability, so toyDB
only executes reads on the leader.
If a request is submitted to a follower, it will be forwarded to the leader and the response
forwarded back to the client (distinguished by the sender/recipient node ID -- a local client always
uses the local node ID):
https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L451-L474
For simplicity, we cancel the request with `Error::Abort` if a request is submitted to a candidate,
and similarly if a follower changes its role to candidate or discovers a new leader. We could have
held on to these and redirected them to a new leader, but we keep it simple and ask the client to
retry.
We'll look at the actual read and write request processing next.
## Write Replication and Application
When the leader receives a write request, it proposes the command for replication to followers. It
keeps track of the in-flight write and its log entry index in `writes`, such that it can respond to
the client with the command result once the entry has been committed and applied.
https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L895-L904
To propose the command, the leader appends it to its log and sends a `Message::Append` to each
follower to replicate it to their logs:
https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L966-L980
In steady state, `Message::Append` just contains the single log entry we appended above:
https://github.com/erikgrinaker/toydb/blob/d96c6dd5ae7c0af55ee609760dcd958c289a44f2/src/raft/message.rs#L87-L108
However, sometimes followers may be lagging behind the leader (e.g. after a crash), or their log may
have diverged from the leader (e.g. unsuccessful proposals from a stale leader after a network
partition). To handle these cases, the leader tracks the replication progress of each follower as
`raft::Progress`:
https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L682-L698
We'll gloss over these cases here (see the Raft paper and the code in `raft::Progress` and
`maybe_send_append()` for details). In the steady state, where each entry is successfully appended
and replicated one at a time, `maybe_send_append()` will fall through to the bottom and send a
single entry:
https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L1068-L1128
The `Message::Append` contains the index/term of the entry immediately before the new entry as
`base_index` and `base_term`. If the follower's log also contains an entry with this index and term
then its log is guaranteed to match (be equal to) the leader's log up to this entry (see section 5.3
in the Raft paper). The follower can then append the new log entry and return a
`Message::AppendResponse` confirming that the entry was appended and that its log matches the
leader's log up to `match_index`:
https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L386-L410
When the leader receives the `Message::AppendResponse`, it will update its view of the follower's
`match_index`.
https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L844-L858
https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L701-L710
Once a quorum of nodes (in our case 2 out of 3 including the leader) have the entry in their log,
the leader can commit the entry and apply it to the state machine. It also looks up the in-flight
write request from `writes` and sends the command result back to the client as
`Message::ClientResponse`:
https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L982-L1032
The leader will also propagate the new commit index to followers via the next heartbeat, so that
they can also apply any pending log entries to their state machine. This isn't strictly necessary,
since reads are executed on the leader and nodes have to apply pending entries before becoming
leaders, but we do it anyway so that they don't fall too far behind on application.
https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L359-L384
This process is illustrated in the test scripts [`append`](https://github.com/erikgrinaker/toydb/blob/cb234a0b776484608118fd9382869ee5bc30d4f0/src/raft/testscripts/node/append) and [`heartbeat_commits_follower`](https://github.com/erikgrinaker/toydb/blob/cb234a0b776484608118fd9382869ee5bc30d4f0/src/raft/testscripts/node/heartbeat_commits_follower)
(along with many other scenarios):
https://github.com/erikgrinaker/toydb/blob/cb234a0b776484608118fd9382869ee5bc30d4f0/src/raft/testscripts/node/append#L1-L43
https://github.com/erikgrinaker/toydb/blob/cb234a0b776484608118fd9382869ee5bc30d4f0/src/raft/testscripts/node/heartbeat_commits_follower#L1-L50
## Read Processing
For linearizable (aka strongly consistent) reads, we must execute read requests on the leader, as
mentioned above. However, this is not sufficient: under e.g. a network partition, a node may think
it's still the leader while in fact a different leader has been elected elsewhere (in a later term)
and executed writes there.
To handle this case, the leader must confirm that it is still the leader for each read, by sending a
`Message::Read` to its followers containing a read sequence number. Only if a quorum confirms that
it is still the leader can the read be executed. This incurs an additional network roundtrip, which
is clearly inefficient, so real-world systems often use leader leases instead (see section 6.4.1 of
the Raft _thesis_, not the paper) -- but it's fine for toyDB.
https://github.com/erikgrinaker/toydb/blob/d96c6dd5ae7c0af55ee609760dcd958c289a44f2/src/raft/message.rs#L125-L132
When the leader receives the read request, it increments the read sequence number, stores the
pending read request in `reads`, and sends a `Message::Read` to all followers:
https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L906-L917
When the followers receive the `Message::Read`, they simply respond with a `Message::ReadResponse`
if it's from their current leader (messages from stale terms are ignored):
https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L342-L346
https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L412-L422
When the leader receives the `Message::ReadResponse` it records it in the peer's `Progress`, and
executes the read once a quorum have confirmed the sequence number:
https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L860-L866
https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L1034-L1066
We now have a Raft-managed state machine with replicated writes and linearizable reads.
---
<p align="center">
← <a href="mvcc.md">MVCC Transactions</a> | <a href="sql.md">SQL Engine</a> →
</p>
================================================
FILE: docs/architecture/server.md
================================================
# Server
Now that we've gone over the individual components, we'll tie them all together in the toyDB
server `toydb::Server`, located in the [`server`](https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/server.rs) module.
The server wraps an inner Raft node `raft::Node`, which manages the SQL state machine, and is
responsible for routing network traffic between the Raft node, its Raft peers, and SQL clients.
https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/server.rs#L27-L44
For the network protocol, the server uses the Bincode encoding that we've discussed in the encoding
section, sent over a TCP connection. There's no need for any further framing, since Bincode knows
how many bytes to expect for each message depending on the type it's decoding into.
The server does not use [async Rust](https://rust-lang.github.io/async-book/) and e.g.
[Tokio](https://tokio.rs), instead opting for regular OS threads. Async Rust can significantly
complicate the code, which would obscure the main concepts, and any efficiency gains would be
entirely irrelevant for toyDB.
Internally in the server, messages are passed around between threads using
[Crossbeam channels](https://docs.rs/crossbeam/latest/crossbeam/channel/index.html).
The main server loop `Server::serve()` listens for inbound TCP connections on port 9705 for Raft
peers and 9605 for SQL clients, and spawns threads to process them. We'll look at Raft and SQL
services separately.
https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/server.rs#L66-L110
## Raft Routing
The heart of the server is the Raft processing thread `Server::raft_route()`. This is responsible
for periodically ticking the Raft node via `raft::Node::tick()`, stepping inbound messages from
Raft peers into the node via `raft::Node::step()`, and sending outbound messages to peers.
It also takes inbound Raft client requests from the `sql::engine::Raft` SQL engine, steps them
into the Raft node via `raft::Node::step()`, and passes responses back to the appropriate client
as the node emits them.
https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/server.rs#L169-L249
When the node starts up, it spawns a `Server::raft_send_peer()` thread for each Raft peer to send
outbound messages to them.
https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/server.rs#L84-L91
These threads continually attempt to connect to the peer via TCP, and then read any outbound
`raft::Envelope(raft::Message)` messages from `Server::raft_route()` via a channel and write the
messages into the TCP connection using Bincode:
https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/server.rs#L146-L167
The server also continually listens for inbound Raft TCP connections from peers in
`Server::raft_accept()`:
https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/server.rs#L112-L134
When an inbound connection is accepted, a `Server::raft_receive_peer()` thread is spawned that reads
Bincode-encoded `raft::Envelope(raft::Message)` messages from the TCP connection and sends them to
`Server::raft_route()` via a channel.
https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/server.rs#L136-L144
The Raft cluster is now fully connected, and the nodes can all talk to each other.
## SQL Service
Next, let's serve some SQL clients. The SQL service uses the enums `toydb::Request` and
`toydb::Response` as a client protocol, again Bincode-encoded over TCP.
The primary request type is `Request::Execute` which executes a SQL statement against a
`sql::execution::Session` and returns a `sql::execution::StatementResult`, as we've seen previously.
https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/server.rs#L312-L337
The server sets up a `sql::engine::Raft` SQL engine, with a Crossbeam channel that's used to send
`raft::Request` Raft client requests to `Server::raft_route()` and onwards to the local
`raft::Node`. It then spawns a `Server::sql_accept()` thread to listen for inbound SQL client
connections:
https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/server.rs#L104-L106
When a SQL client connection is accepted, a new client session `sql::execution::Session` is set up
for the client, and we spawn a `Server::sql_session()` thread to serve the connection:
https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/server.rs#L251-L272
These session threads continually read `Request` messages from the client, execute them against the
SQL session (and ultimately the Raft node), before sending a `Response` back to the client.
https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/server.rs#L274-L309
## `toydb` Binary
The `toydb` binary in `src/bin/toydb.rs` launches the server, and is a thin wrapper around
`toydb::Server`. It is a tiny [`clap`](https://docs.rs/clap/latest/clap/) command:
https://github.com/erikgrinaker/toydb/blob/8f8eae0dcf70b1a0df2e853b1f6600e0c7075340/src/bin/toydb.rs#L82-L89
It first parses a server configuration from the `toydb.yaml` file:
https://github.com/erikgrinaker/toydb/blob/8f8eae0dcf70b1a0df2e853b1f6600e0c7075340/src/bin/toydb.rs#L30-L59
Then it initializes the Raft log storage and SQL state machine:
https://github.com/erikgrinaker/toydb/blob/8f8eae0dcf70b1a0df2e853b1f6600e0c7075340/src/bin/toydb.rs#L105-L133
And finally it launches the `toydb::Server`:
https://github.com/erikgrinaker/toydb/blob/8f8eae0dcf70b1a0df2e853b1f6600e0c7075340/src/bin/toydb.rs#L135-L137
toyDB is now up and running!
---
<p align="center">
← <a href="sql-execution.md">SQL Execution</a> | <a href="client.md">Client</a> →
</p>
================================================
FILE: docs/architecture/sql-data.md
================================================
# SQL Data Model
The SQL data model represents user data in tables and rows. It is made up of data types and schemas,
in the [`sql::types`](https://github.com/erikgrinaker/toydb/tree/686d3971a253bfc9facc2ba1b0e716cff5c109fb/src/sql/types)
module.
## Data Types
toyDB supports four basic scalar data types as `sql::types::DataType`: booleans, integers, floats,
and strings.
https://github.com/erikgrinaker/toydb/blob/b2fe7b76ee634ca6ad31616becabfddb1c03d34b/src/sql/types/value.rs#L15-L27
Specific values are represented as `sql::types::Value`, using the corresponding Rust types. toyDB
also supports SQL `NULL` values, i.e. unknown values, following the rules of
[three-valued logic](https://en.wikipedia.org/wiki/Three-valued_logic).
https://github.com/erikgrinaker/toydb/blob/b2fe7b76ee634ca6ad31616becabfddb1c03d34b/src/sql/types/value.rs#L40-L64
The `Value` type provides basic formatting, conversion, and mathematical operations.
https://github.com/erikgrinaker/toydb/blob/686d3971a253bfc9facc2ba1b0e716cff5c109fb/src/sql/types/value.rs#L68-L79
https://github.com/erikgrinaker/toydb/blob/686d3971a253bfc9facc2ba1b0e716cff5c109fb/src/sql/types/value.rs#L164-L370
It also specifies comparison and ordering semantics, but these are subtly different from the SQL
semantics. For example, in Rust code `Value::Null == Value::Null` yields `true`, while in SQL
`NULL = NULL` yields `NULL`. This mismatch is necessary for the Rust code to properly detect and
process `Null` values, and the desired SQL semantics are implemented during expression evaluation
which we'll cover below.
https://github.com/erikgrinaker/toydb/blob/b2fe7b76ee634ca6ad31616becabfddb1c03d34b/src/sql/types/value.rs#L91-L162
During execution, a row of values is represented as `sql::types::Row`, with multiple rows emitted
via `sql::types::Rows` row iterators:
https://github.com/erikgrinaker/toydb/blob/b2fe7b76ee634ca6ad31616becabfddb1c03d34b/src/sql/types/value.rs#L378-L388
## Schemas
toyDB schemas only support tables. There are no named indexes or constraints, and there's only a
single unnamed database.
Tables are represented by `sql::types::Table`:
https://github.com/erikgrinaker/toydb/blob/c2b0f7f1d6cbf6e2cdc09fc0aec7b050e840ec21/src/sql/types/schema.rs#L12-L25
A table is made up of a set of columns, represented by `sql::types::Column`. These support the data
types described above, along with unique constraints, foreign keys, and secondary indexes.
https://github.com/erikgrinaker/toydb/blob/c2b0f7f1d6cbf6e2cdc09fc0aec7b050e840ec21/src/sql/types/schema.rs#L29-L53
The table name serves as a unique identifier, and can't be changed later. In fact, table schemas
are entirely static: they can only be created or dropped (there are no schema changes).
Table schemas are stored in the catalog, represented by the `sql::engine::Catalog` trait. We'll
revisit the implementation of this trait in the SQL storage section.
https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/sql/engine/engine.rs#L60-L79
Table schemas are validated when created via `Table::validate()`, which enforces invariants and
internal consistency. It uses the catalog to look up information about other tables, e.g. that
foreign key references point to a valid target column in a different table.
https://github.com/erikgrinaker/toydb/blob/c2b0f7f1d6cbf6e2cdc09fc0aec7b050e840ec21/src/sql/types/schema.rs#L98-L170
Table rows are validated via `Table::validate_row()`, which ensures that a `sql::types::Row`
conforms to the schema (e.g. that value types match the column data types). It uses a
`sql::engine::Transaction` to look up other rows in the database, e.g. to check for primary key
conflicts (we'll get back to this later).
https://github.com/erikgrinaker/toydb/blob/c2b0f7f1d6cbf6e2cdc09fc0aec7b050e840ec21/src/sql/types/schema.rs#L172-L236
## Expressions
During SQL execution, we also have to model _expressions_, such as `1 + 2 * 3`. These are
represented as values and operations on them, and can be nested as a tree to represent compound
operations.
https://github.com/erikgrinaker/toydb/blob/9419bcf6aededf0e20b4e7485e2a5fa3e975d79f/src/sql/types/expression.rs#L11-L64
For example, the expression `1 + 2 * 3` (taking [precedence](https://en.wikipedia.org/wiki/Order_of_operations)
into account) is represented as:
```rust
// +
// / \
// 1 *
// / \
// 2 3
Expression::Add(
Expression::Constant(Value::Integer(1)),
Expression::Multiply(
Expression::Constant(Value::Integer(2)),
Expression::Constant(Value::Integer(3)),
),
)
```
An `Expression` can contain two kinds of values: constant values as
`Expression::Constant(sql::types::Value)`, and dynamic values as `Expression::Column(usize)` column
references. The latter will fetch a `sql::types::Value` from a `sql::types::Row` at the specified
index during evaluation.
We'll see later how the SQL parser and planner transform text expressions like `1 + 2 * 3` into an
`Expression`, and how they resolve column names to row indexes, e.g. `price * 0.25` to
`row[3] * 0.25`.
Expressions are evaluated recursively via `Expression::evaluate()`, given a `sql::types::Row` with
input values for column references, and return a final `sql::types::Value` result:
https://github.com/erikgrinaker/toydb/blob/9419bcf6aededf0e20b4e7485e2a5fa3e975d79f/src/sql/types/expression.rs#L73-L208
Many of the comparison operations like `==` are implemented explicitly here instead of using
`sql::types::Value` comparisons. This is where we implement the SQL semantics of special values like
`NULL`, such that `NULL = NULL` yields `NULL` instead of `TRUE`.
For mathematical operations however, we generally dispatch to these methods on `sql::types::Value`:
https://github.com/erikgrinaker/toydb/blob/b2fe7b76ee634ca6ad31616becabfddb1c03d34b/src/sql/types/value.rs#L185-L295
Expression parsing and evaluation is tested via test scripts in
[`sql/testscripts/expressions`](https://github.com/erikgrinaker/toydb/tree/9419bcf6aededf0e20b4e7485e2a5fa3e975d79f/src/sql/testscripts/expressions).
---
<p align="center">
← <a href="sql.md">SQL Engine</a> | <a href="sql-storage.md">SQL Storage</a> →
</p>
================================================
FILE: docs/architecture/sql-execution.md
================================================
# SQL Execution
Now that the planner and optimizer have done all the hard work of figuring out how to execute a
query, it's time to actually execute it.
## Plan Executor
Plan execution is done by `sql::execution::Executor` in the
[`sql::execution`](https://github.com/erikgrinaker/toydb/tree/9419bcf6aededf0e20b4e7485e2a5fa3e975d79f/src/sql/execution)
module, using a `sql::engine::Transaction` to access the SQL storage engine.
https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/execution/executor.rs#L14-L49
The executor takes a `sql::planner::Plan` as input, and will return an `ExecutionResult` depending
on the statement type.
https://github.com/erikgrinaker/toydb/blob/686d3971a253bfc9facc2ba1b0e716cff5c109fb/src/sql/execution/executor.rs#L331-L339
When executing the plan, the executor will branch off depending on the statement type:
https://github.com/erikgrinaker/toydb/blob/686d3971a253bfc9facc2ba1b0e716cff5c109fb/src/sql/execution/executor.rs#L57-L101
We'll focus on `SELECT` queries here, which are the most interesting.
toyDB uses the iterator model (also known as the volcano model) for query execution. In the case of
a `SELECT` query, the result is a row iterator, and pulling from this iterator by calling `next()`
will drive the entire execution pipeline by recursively calling `next()` on the child nodes' row
iterators. This maps very naturally onto Rust's iterators, and we leverage these to construct the
execution pipeline as nested iterators.
Execution itself is fairly straightforward, since we're just doing exactly what the planner tells us
to do in the plan. We call `Executor::execute_node` recursively on each `sql::planner::Node`,
starting with the root node. Each node returns a result row iterator that the parent node can pull
its input rows from, process them, and output the resulting rows via its own row iterator (with the
root node's iterator being returned to the caller):
https://github.com/erikgrinaker/toydb/blob/686d3971a253bfc9facc2ba1b0e716cff5c109fb/src/sql/execution/executor.rs#L103-L104
`Executor::execute_node()` will simply look at the type of `Node`, recursively call
`Executor::execute_node()` on any child nodes, and then process the rows accordingly.
https://github.com/erikgrinaker/toydb/blob/686d3971a253bfc9facc2ba1b0e716cff5c109fb/src/sql/execution/executor.rs#L103-L212
We won't discuss every plan node in detail, but let's consider the movie plan we've looked at
previously:
```
Select
└─ Order: movies.released desc
└─ Projection: movies.title, movies.released, genres.name as genre
└─ HashJoin: inner on movies.genre_id = genres.id
├─ Scan: movies (released >= 2000)
└─ Scan: genres
```
We'll recursively call `execute_node()` until we end up in the two `Scan` nodes. These simply
call through to the SQL engine (either using Raft or local disk) via `Transaction::scan()`, passing
in the scan predicate if any, and return the resulting row iterator:
https://github.com/erikgrinaker/toydb/blob/686d3971a253bfc9facc2ba1b0e716cff5c109fb/src/sql/execution/executor.rs#L203-L204
`HashJoin` will then join the output rows from the `movies` and `genres` iterators by using a
hash join. This builds an in-memory table for `genres` and then iterates over `movies`, joining
the rows:
https://github.com/erikgrinaker/toydb/blob/686d3971a253bfc9facc2ba1b0e716cff5c109fb/src/sql/execution/executor.rs#L128-L141
https://github.com/erikgrinaker/toydb/blob/889aef9f24c0fa4d58e314877fa17559a9f3d5d2/src/sql/execution/join.rs#L103-L183
The `Projection` node will simply evaluate the (trivial) column expressions using each joined
row as input:
https://github.com/erikgrinaker/toydb/blob/686d3971a253bfc9facc2ba1b0e716cff5c109fb/src/sql/execution/executor.rs#L179-L186
And finally the `Order` node will sort the results (which requires buffering them all in memory):
https://github.com/erikgrinaker/toydb/blob/686d3971a253bfc9facc2ba1b0e716cff5c109fb/src/sql/execution/executor.rs#L173-L177
https://github.com/erikgrinaker/toydb/blob/686d3971a253bfc9facc2ba1b0e716cff5c109fb/src/sql/execution/executor.rs#L298-L328
The output row iterator of `Order` is returned via `ExecutionResult::Select`, and the caller can now
go ahead and pull the resulting rows from it.
## Session Management
The entry point to the SQL engine is the `sql::execution::Session`, which represents a single user
session. It is obtained via `sql::engine::Engine::session()`.
https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/sql/execution/session.rs#L14-L21
The session takes a series of raw SQL statement strings as input and parses them:
https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/sql/execution/session.rs#L29-L33
For each statement, it returns a result depending on the kind of statement:
https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/sql/execution/session.rs#L132-L148
The session itself performs transaction control. It handles `BEGIN`, `COMMIT`, and `ROLLBACK`
statements, and modifies the transaction accordingly.
https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/sql/execution/session.rs#L34-L70
Any other statements are processed by the SQL planner, optimizer, and executor as we've seen in
previous sections.
https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/sql/execution/session.rs#L77-L83
These statements are always executed using the session's current transaction. If there is no active
transaction, the session will create a new, implicit transaction for each statement.
https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/sql/execution/session.rs#L87-L112
And with that, we have a fully functional SQL engine!
---
<p align="center">
← <a href="sql-optimizer.md">SQL Optimization</a> | <a href="server.md">Server</a> →
</p>
================================================
FILE: docs/architecture/sql-optimizer.md
================================================
# SQL Optimization
[Query optimization](https://en.wikipedia.org/wiki/Query_optimization) attempts to improve query
performance and efficiency by altering the execution plan. This is a deep and complex field, and
we can only scratch the surface here.
toyDB's query optimizer is very basic -- it only has a handful of rudimentary heuristic
optimizations to illustrate how the process works. Real-world optimizers use much more sophisticated
methods, including statistical analysis, cost estimation, adaptive execution, etc.
The optimizers are located in the [`sql::planner::optimizer`](https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/planner/optimizer.rs) module.
An optimizer `sql::planner::Optimizer` just takes in a plan node `sql::planner::Node` (the root node
in the plan), and returns an optimized node:
https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/planner/optimizer.rs#L20-L25
Optimizations are always implemented as recursive node transformations. To help with this, `Node`
has the helper methods `Node::transform` and `Node::transform_expressions` which recurse into a node
or expression tree and call a given transformation closure on each node, as either
[pre-order](https://en.wikipedia.org/wiki/Tree_traversal#Pre-order,_NLR) or
[post-order](https://en.wikipedia.org/wiki/Tree_traversal#Post-order,_LRN) transforms:
https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/planner/plan.rs#L269-L371
A technique that's often useful during optimization is to convert expressions into
[conjunctive normal form](https://en.wikipedia.org/wiki/Conjunctive_normal_form), i.e. "an AND of
ORs". For example, the two following expressions are equivalent, but the latter is in conjunctive
normal form (it's a chain of ANDs):
```
(a AND b) OR (c AND d) → (a OR c) AND (a OR d) AND (b OR c) AND (b OR d)
```
This is useful because we can often move each AND operand independently around in the plan tree
and still get the same result -- we'll see this in action later. Expressions are converted into
conjunctive normal form via `Expression::into_cnf`, which is implemented using
[De Morgan's laws](https://en.wikipedia.org/wiki/De_Morgan%27s_laws):
https://github.com/erikgrinaker/toydb/blob/9419bcf6aededf0e20b4e7485e2a5fa3e975d79f/src/sql/types/expression.rs#L289-L351
We'll have a brief look at all of toyDB's optimizers, which are listed here in the order they're
applied:
https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/planner/optimizer.rs#L9-L18
Test scripts for the optimizers are in [`src/sql/testscripts/optimizers`](https://github.com/erikgrinaker/toydb/tree/9419bcf6aededf0e20b4e7485e2a5fa3e975d79f/src/sql/testscripts/optimizers),
and show how query plans evolve as each optimizer is applied.
## Constant Folding
The `ConstantFolding` optimizer performs [constant folding](https://en.wikipedia.org/wiki/Constant_folding).
This pre-evaluates constant expressions in the plan during planning, instead of evaluating them
for every row during execution.
https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/planner/optimizer.rs#L27-L30
For example, consider the query `SELECT 1 + 2 * 3 - foo FROM bar`. There is no point in
re-evaluating `1 + 2 * 3` for every row in `bar`, because the result is always the same, so we can
just evaluate this once during planning, transforming the expression into `7 - foo`.
Concretely, this plan:
```
Select
└─ Projection: 1 + 2 * 3 - bar.foo
└─ Scan: bar
```
Should be transformed into this plan:
```
Select
└─ Projection: 7 - bar.foo
└─ Scan: bar
```
To do this, `ConstantFolding` simply checks whether an `Expression` tree contains an
`Expression::Column` node -- if it doesn't, then it must be a constant expression (since that's the
only dynamic value in an expression), and we can evaluate it with a `None` input row and replace the
original expression node with an `Expression::Constant` node.
This is done recursively for each plan node, and recursively for each expression node (so it does
this for `SELECT`, `WHERE`, `ORDER BY`, and all other parts of the query). Notably, it does a
post-order expression transform, so it starts at the expression leaf nodes and attempts to transform
each expression node as it moves back up the tree -- this allows it to iteratively evaluate constant
parts as far as possible for each branch.
https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/planner/optimizer.rs#L32-L56
Additionally, `ConstantFolding` also short-circuits logical expressions. For example, the expression
`foo AND FALSE` will always be `FALSE`, regardless of what `foo` is, so we can replace it with
`FALSE`:
https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/planner/optimizer.rs#L58-L84
As the code comment mentions though, this doesn't fold optimally: it doesn't attempt to rearrange
expressions, which would require knowledge of precedence rules. For example, `(1 + foo) - 2` could
be folded into `foo - 1` by first rearranging it as `foo + (1 - 2)`, but we don't do this currently.
## Filter Pushdown
The `FilterPushdown` optimizer attempts to push filter predicates as far down into the plan as
possible, to reduce the number of rows each node has to process.
https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/planner/optimizer.rs#L90-L95
Recall the `movies` query plan from the planning section:
```
Select
└─ Order: movies.released desc
└─ Projection: movies.title, movies.released, genres.name as genre
└─ Filter: movies.released >= 2000
└─ NestedLoopJoin: inner on movies.genre_id = genres.id
├─ Scan: movies
└─ Scan: genres
```
Even though we're filtering on `released >= 2000`, the `Scan` node still has to read all movies from
disk and send them via Raft, and the `NestedLoopJoin` node still has to join all of them. It would
be nice if we could push this filtering into the `NestedLoopJoin` and `Scan` nodes and avoid this
extra work, and this is exactly what `FilterPushdown` does.
The only plan nodes that have predicates that can be pushed down are `Filter` nodes and
`NestedLoopJoin` nodes, so we recurse through the plan tree and look for these nodes, attempting
to push down.
https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/planner/optimizer.rs#L97-L110
When it encounters the `Filter` node, it will extract the predicate and attempt to push it down
into its `source` node:
https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/planner/optimizer.rs#L139-L153
If the source node is a `Filter`, `NestedLoopJoin`, or `Scan` node, then we can push the predicate
down into it by `AND`ing it with the existing predicate (if any).
https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/planner/optimizer.rs#L112-L137
In our case, we were able to push the `Filter` into the `NestedLoopJoin`, and our plan now looks
like this:
```
Select
└─ Order: movies.released desc
└─ Projection: movies.title, movies.released, genres.name as genre
└─ NestedLoopJoin: inner on movies.genre_id = genres.id AND movies.released >= 2000
├─ Scan: movies
└─ Scan: genres
```
But we're still not done, as we'd like to push `movies.released >= 2000` down into the `Scan` node.
Pushdown for join nodes is a little more tricky, because we can only push down parts of the
expression that reference one of the source nodes.
We first have to convert the expression into conjunctive normal form, i.e. an AND of ORs, as we've
discussed previously. This allows us to examine and push down each AND part in isolation, because it
has the same effect regardless of whether it is evaluated in the `NestedLoopJoin` node or one of
the source nodes. Our expression is already in conjunctive normal form, though.
We then look at each AND part, and check which side of the join it has column references for. If it
only references one of the sides, then the expression can be pushed down into it. We also make some
effort here to move primary/foreign key constants across to both sides, but we'll gloss over that.
https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/planner/optimizer.rs#L155-L247
This allows us to push down the `movies.released >= 2000` predicate into the corresponding `Scan`
node, significantly reducing the amount of data transferred across Raft:
```
Select
└─ Order: movies.released desc
└─ Projection: movies.title, movies.released, genres.name as genre
└─ NestedLoopJoin: inner on movies.genre_id = genres.id
├─ Scan: movies (released >= 2000)
└─ Scan: genres
```
## Index Lookups
The `IndexLookup` optimizer uses primary key or secondary index lookups instead of full table
scans where possible.
https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/planner/optimizer.rs#L250-L252
The optimizer itself is fairly straightforward. It assumes that `FilterPushdown` has already pushed
predicates down into `Scan` nodes, so it only needs to examine these. It converts the predicate into
conjunctive normal form, and looks for any parts that are direct column lookups -- i.e.
`column = value` (possibly a long OR chain of these).
If it finds any, and the column is either a primary key or secondary index column, then we convert
the `Scan` node into either a `KeyLookup` or `IndexLookup` node respectively. If there are any
further AND predicates remaining, we add a parent `Filter` node to keep these predicates.
For example, the following plan:
```
Select
└─ Scan: movies ((id = 1 OR id = 7 OR id = 3) AND released >= 2000)
```
Will be transformed into one that does individual key lookups rather than a full table scan:
```
Select
└─ Filter: movies.released >= 2000
└─ KeyLookup: movies (1, 3, 7)
```
The code is as outlined above:
https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/planner/optimizer.rs#L254-L303
Helped by `Expression::is_column_lookup()` and `Expression::into_column_values()`:
https://github.com/erikgrinaker/toydb/blob/9419bcf6aededf0e20b4e7485e2a5fa3e975d79f/src/sql/types/expression.rs#L363-L421
## Hash Join
The `HashJoin` optimizer will replace a `NestedLoopJoin` with a `HashJoin` where possible.
https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/planner/optimizer.rs#L305-L307
A [nested loop join](https://en.wikipedia.org/wiki/Nested_loop_join) is a very inefficient O(n²)
algorithm, which iterates over all rows in the right source for each row in the left source to see
if they match. However, it is completely general, and can join on arbitrarily complex predicates.
In the common case where the join predicate is an equality comparison such as
`movies.genre_id = genres.id` (i.e. an [equijoin](https://en.wikipedia.org/wiki/Relational_algebra#θ-join_and_equijoin)),
then we can instead use a [hash join](https://en.wikipedia.org/wiki/Hash_join). This scans the right
table once, builds an in-memory hash table from it, and for each left row it looks up any right rows
in the hash table. This is a much more efficient O(n) algorithm.
In our previous movie example, we are in fact doing an equijoin:
```
Select
└─ Order: movies.released desc
└─ Projection: movies.title, movies.released, genres.name as genre
└─ NestedLoopJoin: inner on movies.genre_id = genres.id
├─ Scan: movies (released >= 2000)
└─ Scan: genres
```
And so our `NestedLoopJoin` can be replaced by a `HashJoin`:
```
Select
└─ Order: movies.released desc
└─ Projection: movies.title, movies.released, genres.name as genre
└─ HashJoin: inner on movies.genre_id = genres.id
├─ Scan: movies (released >= 2000)
└─ Scan: genres
```
The `HashJoin` optimizer is extremely simple: if the join predicate is an equijoin, use a hash join.
This isn't always a good idea (the right source can be huge and we can run out of memory for the
hash table), but we keep it simple.
https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/planner/optimizer.rs#L309-L348
Of course there are many other join algorithms out there, and one of the harder problems in SQL
optimization is how to efficiently perform large N-way multijoins. We don't attempt to tackle these
problems here -- the `HashJoin` optimizer is just a very simple example of such join optimization.
## Short Circuiting
The `ShortCircuit` optimizer tries to find nodes that can't possibly do any useful work, and either
removes them from the plan, or replaces them with trivial nodes that don't do anything. It is kind
of similar to the `ConstantFolding` optimizer in spirit, but works on plan nodes rather than
expression nodes.
https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/planner/optimizer.rs#L350-L354
For example, `Filter` nodes with a `TRUE` predicate won't actually filter anything:
```
Select
└─ Filter: true
└─ Scan: movies
```
So we can just remove them:
```
Select
└─ Scan: movies
```
Similarly, `Filter` nodes with a `FALSE` predicate will never emit anything:
```
Select
└─ Filter: false
└─ Scan: movies
```
There's no point doing a scan in this case, so we can just replace it with a `Nothing` node that
does no work and doesn't emit anything:
```
Select
└─ Nothing
```
The optimizer tries to find a bunch of such patterns. This can also tidy up query plans a fair bit
by removing unnecessary cruft.
https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/planner/optimizer.rs#L356-L438
---
<p align="center">
← <a href="sql-planner.md">SQL Planning</a> | <a href="sql-execution.md">SQL Execution</a> →
</p>
================================================
FILE: docs/architecture/sql-parser.md
================================================
# SQL Parsing
We finally arrive at SQL. The SQL parser is the first stage in processing SQL queries and
statements, located in the [`sql::parser`](https://github.com/erikgrinaker/toydb/tree/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/parser)
module.
The SQL parser's job is to take a raw SQL string and turn it into a structured form that's more
convenient to work with. In doing so, it will validate that the string is in fact valid SQL
_syntax_. However, it doesn't know if the SQL statement actually makes sense -- it has no idea which
tables or columns exist, what their data types are, and so on. That's the job of the planner, which
we'll look at later.
For example, let's say the parser is given the following SQL query:
```sql
SELECT name, price, price * 25 / 100 AS vat
FROM products JOIN categories ON products.category_id = categories.id
WHERE categories.code = 'BLURAY' AND stock > 0
ORDER BY price DESC
LIMIT 10
```
It will generate a structure that looks something like this (in simplified syntax):
```rust
// A SELECT statement.
Statement::Select {
// SELECT name, price, price * 25 / 100 AS vat
select: [
(Column("name"), None),
(Column("price"), None),
(
Divide(
Multiply(Column("price"), Integer(25)),
Integer(100)
),
Some("vat"),
),
]
// FROM products JOIN categories ON products.category_id = categories.id
from: [
Join {
left: Table("products"),
right: Table("categories"),
type: Inner,
predicate: Some(
Equal(
Column("products.category_id"),
Column("categories.id"),
)
)
}
]
// WHERE categories.code = 'BLURAY' AND stock > 0
where: Some(
And(
Equal(
Column("categories.code"),
String("BLURAY"),
),
GreaterThan(
Column("stock"),
Integer(0),
)
)
)
// ORDER BY price DESC
order: [
(Column("price"), Descending),
]
// LIMIT 10
limit: Some(Integer(10))
}
```
Let's have a look at how this happens.
## Lexer
We begin with the `sql::parser::Lexer`, which takes the raw SQL string and performs
[lexical analysis](https://en.wikipedia.org/wiki/Lexical_analysis) to convert it into a sequence of
tokens. These tokens are things like number, string, identifier, SQL keyword, and so on.
This preprocessing is useful to deal with some of the "noise" of SQL text, such as whitespace,
string quotes, identifier normalization, and so on. It also specifies which symbols and keywords are
valid in our SQL queries. This makes the parser's life a lot easier.
The lexer doesn't care about SQL structure at all, only that the individual pieces (tokens) of a
string are well-formed. For example, the following input string:
```
'foo' ) 3.14 SELECT + x
```
Will result in these tokens:
```
String("foo") CloseParen Number("3.14") Keyword(Select) Plus Ident("x")
```
Tokens and keywords are represented by the `sql::parser::Token` and `sql::parser::Keyword` enums
respectively:
https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/parser/lexer.rs#L8-L47
https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/parser/lexer.rs#L86-L155
The lexer takes an input string and emits tokens as an iterator:
https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/parser/lexer.rs#L311-L337
It does this by repeatedly attempting to scan the next token until it reaches the end of the string
(or errors). It can determine the kind of token by looking at the first character:
https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/parser/lexer.rs#L358-L373
And then scan across the following characters as appropriate to generate a valid token. For example,
this is how a quoted string (e.g. `'foo'`) is lexed into a `Token::String` (including handling of
any escaped quotes inside the string):
https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/parser/lexer.rs#L435-L451
These tokens become the input to the parser.
## Abstract Syntax Tree
The end result of the parsing process will be an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree)
(AST), which is a structured representation of a SQL statement, located in the
[`sql::parser::ast`](https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/parser/ast.rs) module.
The root of this tree is the `sql::parser::ast::Statement` enum, which represents all the different
kinds of SQL statements that we support, along with their contents:
https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/parser/ast.rs#L6-L145
The nested tree structure is particularly apparent with expressions, which represent values and
operations on them. Consider for example the expression `2 * 3 - 4 / 2`, which evaluates to `4`.
We've seen in the data model section how such expressions are represented as
`sql::types::Expression`, but before we get there we have to parse them. The parser has its own
representation `sql::parser::ast::Expression` -- this is necessary e.g. because in the AST, we
represent columns as names rather than numeric indexes (we don't know yet which columns exist or
what their names are, we'll get to that during planning).
https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/parser/ast.rs#L147-L170
https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/parser/ast.rs#L204-L234
For example, `2 * 3 - 4 / 2` is represented as:
```rust
Expression::Operator(Operator::Subtract(
// The left-hand operand of -
Expression::Operator(Operator::Multiply(
// The left-hand operand of *
Expression::Literal(Literal::Integer(2)),
// The right-hand operand of *
Expression::Literal(Literal::Integer(3)),
)),
// The right-hand operand of -
Expression::Operator(Operator::Divide(
// The left-hand operand of /
Expression::Literal(Literal::Integer(4)),
// The right-hand operand of /
Expression::Literal(Literal::Integer(2)),
)),
))
```
## Parser
The parser, `sql::parser::Parser`, takes lexer tokens as input and builds an `ast::Statement`
from them:
https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/parser/parser.rs#L9-L32
We can determine the kind of statement we're parsing simply by looking at the first keyword:
https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/parser/parser.rs#L109-L130
Let's see how a `SELECT` statement is parsed. The different clauses in a `SELECT` (e.g. `FROM`,
`WHERE`, etc.) must always be given in a specific order, and they always begin with the appropriate
keyword, so we can simply try to parse each clause in the expected order:
https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/parser/parser.rs#L330-L342
Parsing each clause is also just a matter of parsing the expected parts in order. For example, the
initial `SELECT` clause is just a comma-separated list of expressions with an optional alias:
https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/parser/parser.rs#L344-L365
The `FROM` clause is a comma-separated list of table names, optionally joined with other tables:
https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/parser/parser.rs#L367-L427
And the `WHERE` clause is just a predicate expression to filter by:
https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/parser/parser.rs#L429-L435
Expression parsing is where this gets tricky, because we have to respect the rules of operator
precedence and associativity. For example, according to mathematical order of operations (aka
"PEMDAS") the expression `2 * 3 - 4 / 2` must be parsed as `(2 * 3) - (4 / 2)` which yields 4, not
`2 * (3 - 4) / 2` which yields -1.
toyDB does this using the [precedence climbing algorithm](https://en.wikipedia.org/wiki/Operator-precedence_parser#Precedence_climbing_method),
which is a fairly simple and compact algorithm as far as these things go. In a nutshell, it will
greedily and recursively group operators together as long as their precedence is the same or higher
than that of the operators preceding them (hence "precedence climbing"). For example:
```
----- ----- Precedence 2: * and /
------------- Precedence 1: -
2 * 3 - 4 / 2
```
The algorithm is documented in more detail on `Parser::parse_expression()`:
https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/parser/parser.rs#L501-L696
---
<p align="center">
← <a href="sql-raft.md">SQL Raft Replication</a> | <a href="sql-planner.md">SQL Planning</a> →
</p>
================================================
FILE: docs/architecture/sql-planner.md
================================================
# SQL Planning
The SQL planner in the [`sql::planner`](https://github.com/erikgrinaker/toydb/tree/c64012e29c5712d6fe028d3d5375a98b8faea266/src/sql/planner)
module takes a SQL statement AST from the parser and generates an execution plan for it. We won't
actually execute it just yet though, only figure out how to execute it.
## Execution Plan
A plan is represented by the `sql::planner::Plan` enum. The variant specifies the operation to
execute (e.g. `SELECT`, `INSERT`, `UPDATE`, `DELETE`):
https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/planner/plan.rs#L15-L73
Below the root, the plan is typically made up of a tree of nested `sql::planner::Node`. Each node
emits a stream of SQL rows as output, and may take streams of input rows from child nodes.
https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/planner/plan.rs#L106-L175
Here is an example, taken from the `Plan` code comment above:
```sql
SELECT title, released, genres.name AS genre
FROM movies INNER JOIN genres ON movies.genre_id = genres.id
WHERE released >= 2000
ORDER BY released
```
Which results in this query plan:
```
Select
└─ Order: movies.released desc
└─ Projection: movies.title, movies.released, genres.name as genre
└─ Filter: movies.released >= 2000
└─ NestedLoopJoin: inner on movies.genre_id = genres.id
├─ Scan: movies
└─ Scan: genres
```
Rows flow from the tree leaves to the root:
1. `Scan` nodes read rows from the tables `movies` and `genres`.
2. `NestedLoopJoin` joins the rows from `movies` and `genres`.
3. `Filter` discards rows with release dates older than 2000.
4. `Projection` picks out the requested column values from the rows.
5. `Order` sorts the rows by release date.
6. `Select` returns the final rows to the client.
## Scope and Name Resolution
One of the main jobs of the planner is to resolve column names to column indexes in the input rows
of each node.
In the query example above, the `WHERE released >= 2000` filter may refer to a column `released`
from either the joined `movies` table or the `genres` table. The planner needs to figure out which
table has a `released` column, and also figure out which column number in the `NestedLoopJoin`
output rows corresponds to the `released` column (for example column number 2).
This job is further complicated by the fact that many nodes can alias, reorder, or drop columns,
and some nodes may also refer to columns that shouldn't be part of the result at all (for example,
it's possible to `ORDER BY` a column that won't be output by a `SELECT` projection at all, but
the `Order` node still needs access to the column data to sort by it).
The planner uses a `sql::planner::Scope` to keep track of which column names are currently visible,
and which column indexes they refer to. For each node the planner builds, starting from the leaves,
it creates a new `Scope` that contains the currently visible columns, tracking how they are modified
and rearranged by each node.
https://github.com/erikgrinaker/toydb/blob/6f6cec4db10bc015a37ee47ff6c7dae383147dd5/src/sql/planner/planner.rs#L577-L610
When an AST expression refers to a column name, the planner can use `Scope::lookup_column()` to find
out which column number the expression should take its input value from.
https://github.com/erikgrinaker/toydb/blob/6f6cec4db10bc015a37ee47ff6c7dae383147dd5/src/sql/planner/planner.rs#L660-L686
## Planner
The planner itself is `sql::planner::Planner`. It uses a `sql::engine::Catalog` to look up
information about tables and columns from storage.
https://github.com/erikgrinaker/toydb/blob/6f6cec4db10bc015a37ee47ff6c7dae383147dd5/src/sql/planner/planner.rs#L12-L20
To build an execution plan, the planner first looks at the `ast::Statement` kind to determine
what kind of plan to build:
https://github.com/erikgrinaker/toydb/blob/6f6cec4db10bc015a37ee47ff6c7dae383147dd5/src/sql/planner/planner.rs#L28-L47
Let's build this `SELECT` plan from above:
```sql
SELECT title, released, genres.name AS genre
FROM movies INNER JOIN genres ON movies.genre_id = genres.id
WHERE released >= 2000
ORDER BY released
```
Which should result in this plan:
```
Select
└─ Order: movies.released asc
└─ Projection: movies.title, movies.released, genres.name as genre
└─ Filter: movies.released >= 2000
└─ NestedLoopJoin: inner on movies.genre_id = genres.id
├─ Scan: movies
└─ Scan: genres
```
The planner is given the following (simplified) AST from the parser as input:
```rust
// A SELECT statement.
Statement::Select {
// SELECT title, released, genres.name AS genre
select: [
(Column("title"), None),
(Column("released"), None),
(Column("genres.name"), "genre"),
]
// FROM movies INNER JOIN genres ON movies.genre_id = genres.id
from: [
Join {
left: Table("movies"),
right: Table("genres"),
type: Inner,
predicate: Some(
Equal(
Column("movies.genre_id"),
Column("genres.id"),
)
)
}
]
// WHERE released >= 2000
where: Some(
GreaterThanOrEqual(
Column("released"),
Integer(2000),
)
)
// ORDER BY released
order: [
(Column("released"), Ascending),
]
}
```
The first thing `Planner::build_select` does is to create an empty scope (which will track column
names and indexes) and build the `FROM` clause which will generate the initial input rows:
https://github.com/erikgrinaker/toydb/blob/6f6cec4db10bc015a37ee47ff6c7dae383147dd5/src/sql/planner/planner.rs#L170-L179
https://github.com/erikgrinaker/toydb/blob/6f6cec4db10bc015a37ee47ff6c7dae383147dd5/src/sql/planner/planner.rs#L283-L289
`Planner::build_from()` first encounters the `ast::From::Join` item, which joins `movies` and
`genres`. This will build a `Node::NestedLoopJoin` plan node for the join, which is the simplest and
most straightforward join algorithm -- it simply iterates over all rows in the `genres` table for
every row in the `movies` table and emits the joined rows (we'll see how to optimize it with a
better join algorithm later).
https://github.com/erikgrinaker/toydb/blob/6f6cec4db10bc015a37ee47ff6c7dae383147dd5/src/sql/planner/planner.rs#L319-L344
It first recurses into `Planner::build_from()` to build each of the `ast::From::Table` nodes for
each table. This will look up the table schemas in the catalog, add them to the current scope, and
build a `Node::Scan` node which will emit all rows from each table. The `Node::Scan` nodes are
placed into the `Node::NestedLoopJoin` above.
https://github.com/erikgrinaker/toydb/blob/6f6cec4db10bc015a37ee47ff6c7dae383147dd5/src/sql/planner/planner.rs#L312-L317
While building the `Node::NestedLoopJoin`, it also needs to convert the join expression
`movies.genre_id = genres.id` into a proper `sql::types::Expression`. This is done by
`Planner::build_expression()`:
https://github.com/erikgrinaker/toydb/blob/6f6cec4db10bc015a37ee47ff6c7dae383147dd5/src/sql/planner/planner.rs#L493-L568
Expression building is mostly a direct translation from an `ast::Expression` variant to a
corresponding `sql::types::Expression` variant (for example from
`ast::Expression::Operator(ast::Operator::Equal)` to `sql::types::Expression::Equal`). However, as
mentioned earlier, `ast::Expression` contains column references by name, while
`sql::types::Expression` contains column references as row indexes. This name resolution is done
here, by looking up the column names in the scope:
https://github.com/erikgrinaker/toydb/blob/6f6cec4db10bc015a37ee47ff6c7dae383147dd5/src/sql/planner/planner.rs#L521-L523
The expression we're building is the join predicate of `Node::NestedLoopJoin`, so it operates on
joined rows containing all columns of `movies` then all columns of `genres`. It also operates on all
combinations of joined rows (the [Cartesian product](https://en.wikipedia.org/wiki/Cartesian_product)),
and the purpose of the join predicate is to determine which joined rows to actually keep. For
example, the full set of joined rows that are evaluated might be:
| movies.id | movies.title | movies.released | movies.genre_id | genres.id | genres.name |
|-----------|--------------|-----------------|-----------------|-----------|-------------|
| 1 | Sicario | 2015 | 2 | 1 | Drama |
| 1 | Sicario | 2015 | 2 | 2 | Action |
| 2 | 21 Grams | 2003 | 1 | 1 | Drama |
| 2 | 21 Grams | 2003 | 1 | 2 | Action |
| 3 | Heat | 1995 | 2 | 1 | Drama |
| 3 | Heat | 1995 | 2 | 2 | Action |
The join predicate should pick out the rows where `movies.genre_id = genres.id`. The scope will
reflect the column layout in the example above, and can resolve the column names to zero-based row
indexes as `#3 = #4`, which will be the final built `Expression`.
Now that we've built the `FROM` clause into a `Node::NestedLoopJoin` of two `Node::Scan` nodes, we
move on to the `WHERE` clause. This simply builds the `WHERE` expression `released >= 2000`, like
we've already seen with the join predicate, and creates a `Node::Filter` node which takes its input
rows from the `Node::NestedLoopJoin` and filters them by the given expression. Again, the scope
keeps track of which input columns we're getting from the join node and resolves the `released`
column reference in the expression.
https://github.com/erikgrinaker/toydb/blob/6f6cec4db10bc015a37ee47ff6c7dae383147dd5/src/sql/planner/planner.rs#L202-L206
We then build the `SELECT` clause, which emits the `title, released, genres.name AS genre` columns.
This is just a list of expressions that are built in the current scope and placed into a
`Node::Projection` (the expressions could be arbitrarily complex). However, we also have to make
sure to update the scope with the final three columns that are output to subsequent nodes, taking
into account the `genre` alias for the original `genres.name` column (we won't dwell on the "hidden
columns" mentioned there -- they're not relevant for our query).
https://github.com/erikgrinaker/toydb/blob/6f6cec4db10bc015a37ee47ff6c7dae383147dd5/src/sql/planner/planner.rs#L214-L234
Finally, we build the `ORDER BY` clause. Again, this just builds a trivial expression for `released`
and places it into a `Node::Order` node which takes input rows from the `Node::Projection` and
sorts them by the order expression.
https://github.com/erikgrinaker/toydb/blob/6f6cec4db10bc015a37ee47ff6c7dae383147dd5/src/sql/planner/planner.rs#L245-L252
And that's it. The `Node::Order` is placed into the root `Plan::Select`, and we have our final plan.
```
Select
└─ Order: movies.released asc
└─ Projection: movies.title, movies.released, genres.name as genre
└─ Filter: movies.released >= 2000
└─ NestedLoopJoin: inner on movies.genre_id = genres.id
├─ Scan: movies
└─ Scan: genres
```
We'll see how to execute it soon, but first we should optimize it to see if we can make it run
faster -- in particular, to see if we can avoid reading all movies from storage, and if we can do
better than the very slow nested loop join.
---
<p align="center">
← <a href="sql-parser.md">SQL Parsing</a> | <a href="sql-optimizer.md">SQL Optimization</a> →
</p>
================================================
FILE: docs/architecture/sql-raft.md
================================================
# SQL Raft Replication
toyDB uses Raft to replicate SQL storage across a cluster of nodes (see the Raft section for
details). All nodes will store a full copy of the SQL database, and the Raft leader will replicate
writes across nodes and execute reads.
Recall the Raft state machine interface `raft::State`:
https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/state.rs#L4-L51
In toyDB, the state machine is just a `sql::engine::Local` storage engine with a thin wrapper:
https://github.com/erikgrinaker/toydb/blob/c2b0f7f1d6cbf6e2cdc09fc0aec7b050e840ec21/src/sql/engine/raft.rs#L278-L291
Raft will submit read and write commands to this state machine as binary `Vec<u8>` data, so we have
to represent the methods of `sql::engine::Engine` as binary Raft commands. We do this as two
enums, `sql::engine::raft::Read` and `sql::engine::raft::Write`, which we'll Bincode-encode:
https://github.com/erikgrinaker/toydb/blob/c2b0f7f1d6cbf6e2cdc09fc0aec7b050e840ec21/src/sql/engine/raft.rs#L16-L71
Notice that almost all requests include a `mvcc::TransactionState`. Most of the useful methods of
`sql::engine::Engine` are on the `sql::engine::Transaction`, but unlike the `Local` engine, below
Raft we can't hold on to a `Transaction` object in memory between each command -- nodes may restart
and leadership may move, and we want client transactions to keep working despite this. Instead, we
will use the client-supplied `mvcc::TransactionState` to reconstruct a `Transaction` for every
command via `mvcc::Transaction::resume()` and call methods on it.
When the state machine receives a write command, it decodes it as a `Write` and calls the
appropriate `Local` method. The result is Bincode-encoded and returned to the caller, who knows what
return type to expect for a given command. The state machine also keeps track of the Raft applied
index of each command as a separate key in the key/value store.
https://github.com/erikgrinaker/toydb/blob/c2b0f7f1d6cbf6e2cdc09fc0aec7b050e840ec21/src/sql/engine/raft.rs#L346-L367
https://github.com/erikgrinaker/toydb/blob/c2b0f7f1d6cbf6e2cdc09fc0aec7b050e840ec21/src/sql/engine/raft.rs#L306-L338
Similarly, read commands are decoded as a `Read` and the appropriate `Local` method is called:
https://github.com/erikgrinaker/toydb/blob/c2b0f7f1d6cbf6e2cdc09fc0aec7b050e840ec21/src/sql/engine/raft.rs#L369-L404
That's the state machine running below Raft. But how do we actually send these commands to Raft and
receive results? That's handled by the `sql::engine::Raft` implementation, which uses a channel to
send requests to the local Raft node (we'll see how this plumbing works in the server section):
https://github.com/erikgrinaker/toydb/blob/c2b0f7f1d6cbf6e2cdc09fc0aec7b050e840ec21/src/sql/engine/raft.rs#L80-L95
The channel takes a `raft::Request` containing binary Raft client requests and a return channel
where the Raft node can send back a `raft::Response`. The Raft engine has a few convenience methods
to send requests and receive responses, for both read and write requests:
https://github.com/erikgrinaker/toydb/blob/c2b0f7f1d6cbf6e2cdc09fc0aec7b050e840ec21/src/sql/engine/raft.rs#L114-L135
And the implementation of the `sql::engine::Engine` and `sql::engine::Transaction` traits simply
send these requests via Raft:
https://github.com/erikgrinaker/toydb/blob/c2b0f7f1d6cbf6e2cdc09fc0aec7b050e840ec21/src/sql/engine/raft.rs#L194-L276
One thing to note here is that we don't support streaming data via Raft, so e.g. the
`Transaction::scan` method will buffer the entire result in a `Vec`. With a full table scan, this
will load the entire table into memory -- that's unfortunate, but we keep it simple.
To summarize, this is what happens when `Transaction::insert()` is called to insert a row via Raft:
1. `sql::engine::raft::Transaction::insert()`: called to insert a row.
2. `sql::engine::raft::Write::Insert`: enum representation of the insert command.
3. `raft::Request::Write`: raft request containing the Bincode-encoded `Write::Insert` command.
4. `sql::engine::raft::Engine::tx`: sends the `Request::Write` and response channel to Raft.
5. `raft::Node::step()`: the `Request::Write` is given to Raft in a `Message::ClientRequest`.
6. Raft does its replication thing, and commits the command's log entry.
7. `raft::State::apply()`: the Bincode-encoded `Write::Insert` is passed to the state machine.
8. `sql::engine::raft::State::apply()`: decodes the command to a `Write::Insert`.
9. `sql::engine::raft::State::local`: contains the `Local` engine on each node.
10. `sql::engine::local::Engine::resume()`: called to obtain the SQL/MVCC transaction.
11. `sql::engine::local::Transaction::insert()`: the row is inserted to the local engine.
12. `raft::RawNode::tx`: the `Ok(())` result is sent as a Bincode-encoded `Message::ClientResponse`.
13. `sql::engine::raft::Transaction::insert()`: receives the result and returns it to the caller.
The plumbing here will be covered in more detail in the server section.
---
<p align="center">
← <a href="sql-storage.md">SQL Storage</a> | <a href="sql-parser.md">SQL Parsing</a> →
</p>
================================================
FILE: docs/architecture/sql-storage.md
================================================
# SQL Storage
The SQL storage engine, in the [`sql::engine`](https://github.com/erikgrinaker/toydb/tree/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/engine)
module, stores tables and rows. toyDB has two SQL storage implementations:
* `sql::engine::Local`: local storage using a `storage::Engine` key/value store.
* `sql::engine::Raft`: Raft-replicated storage, using `Local` on each node below Raft.
These implement the `sql::engine::Engine` trait, which specifies the SQL storage API. SQL execution
can use either simple local storage or Raft-replicated storage -- toyDB itself always uses the
Raft-replicated engine, but many tests use a local in-memory engine.
The `sql::engine::Engine` trait is fully transactional, based on the `storage::MVCC` transaction
engine discussed previously. As such, the trait just has a few methods that begin transactions --
the storage logic itself is implemented in the transaction, which we'll cover next. The trait
also has a `session()` method to start SQL sessions for query execution, which we'll revisit in the
execution section.
https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/sql/engine/engine.rs#L9-L29
Here, we'll only look at the `Local` engine, and we'll discuss Raft replication afterwards. `Local`
itself is just a thin wrapper around a `storage::MVCC<storage::Engine>` to create transactions:
https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/engine/local.rs#L50-L97
## Key/Value Representation
`Local` uses a `storage::Engine` key/value store to store SQL table schemas, table rows, and
secondary index entries. But how do we represent these as keys and values?
The keys are represented by the `sql::engine::Key` enum, and encoded using the Keycode encoding
that we've discussed in the encoding section:
https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/engine/local.rs#L15-L31
The values are encoded using the Bincode encoding, where the value type is given by the key:
* `Key::Table` → `sql::types::Table` (table schemas)
* `Key::Index` → `BTreeSet<sql::types::Value>` (indexed primary keys)
* `Key::Row` → `sql::types::Row` (table rows)
Recall that the Keycode encoding will store keys in sorted order. This means that all `Key::Table`
entries come first, then all `Key::Index`, then all `Key::Row`. These are further grouped and
sorted by their fields.
For example, consider these SQL tables containing movies and genres, with a secondary index on
`movies.genre_id` for fast lookups of movies with a given genre:
```sql
CREATE TABLE genres (
id INTEGER PRIMARY KEY,
name STRING NOT NULL
);
CREATE TABLE movies (
id INTEGER PRIMARY KEY,
title STRING NOT NULL,
released INTEGER NOT NULL,
genre_id INTEGER NOT NULL INDEX REFERENCES genres
);
INSERT INTO genres VALUES (1, 'Drama'), (2, 'Action');
INSERT INTO movies VALUES
(1, 'Sicario', 2015, 2),
(2, '21 Grams', 2003, 1),
(3, 'Heat', 1995, 2);
```
This would result in the following illustrated keys and values, in the given order:
```
/Table/genres → Table { name: "genres", primary_key: 0, columns: ... }
/Table/movies → Table { name: "movies", primary_key: 0, columns: ... }
/Index/movies/genre_id/Integer(1) → BTreeSet { Integer(2) }
/Index/movies/genre_id/Integer(2) → BTreeSet { Integer(1), Integer(3) }
/Row/genres/Integer(1) → Row { Integer(1), String("Drama") }
/Row/genres/Integer(2) → Row { Integer(2), String("Action") }
/Row/movies/Integer(1) → Row { Integer(1), String("Sicario"), Integer(2015), Integer(2) }
/Row/movies/Integer(2) → Row { Integer(2), String("21 Grams"), Integer(2003), Integer(1) }
/Row/movies/Integer(3) → Row { Integer(3), String("Heat"), Integer(1995), Integer(2) }
```
Thus, if we want to do a full table scan of the `movies` table, we just do a prefix scan of
`/Row/movies/`. If we want to do a secondary index lookup of all movies with `genre_id = 2`, we
fetch `/Index/movies/genre_id/Integer(2)` and find that movies with `id = {1,3}` have this genre.
To help with prefix scans, the valid key prefixes are represented as `sql::engine::KeyPrefix`:
https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/engine/local.rs#L35-L48
For a look at the actual on-disk binary storage format, see the test scripts under
[`src/sql/testscripts/writes`](https://github.com/erikgrinaker/toydb/tree/c2b0f7f1d6cbf6e2cdc09fc0aec7b050e840ec21/src/sql/testscripts/writes),
which output the logical and raw binary representation of write operations.
## Schema Catalog
The `sql::engine::Catalog` trait is used to store table schemas, i.e. `sql::types::Table`. It has a
handful of methods for creating, dropping and fetching tables (recall that toyDB does not support
schema changes). The `Table::name` field is used as a unique table identifier throughout.
https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/sql/engine/engine.rs#L60-L79
The `Catalog` trait is also fully transactional, as it must be implemented on a transaction via the
`type Transaction: Transaction + Catalog` trait bound on `sql::engine::Engine`.
Creating a table is straightforward: insert a key/value pair with a Keycode-encoded `Key::Table`
for the key and a Bincode-encoded `sql::types::Table` for the value. We first check that the
table doesn't already exist, and validate the table schema using `Table::validate()`.
https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/engine/local.rs#L340-L347
Similarly, fetching and listing tables is straightforward: just key/value gets or scans using the
appropriate keys.
https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/engine/local.rs#L390-L399
Dropping tables is a bit more involved, since we have to perform some validation and also delete the
actual table rows and any secondary index entries, but it's not terribly complicated:
https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/engine/local.rs#L349-L388
## Row Storage and Transactions
The workhorse of the SQL storage engine is the `Transaction` trait, which provides
[CRUD](https://en.wikipedia.org/wiki/Create,_read,_update_and_delete) operations (create, read,
update, delete) on table rows and secondary index entries. For performance (especially with Raft),
it operates on row batches rather than individual rows.
https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/sql/engine/engine.rs#L31-L58
The `Local::Transaction` implementation is just a wrapper around an MVCC transaction, and the
commit/rollback methods just call straight through to it:
https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/engine/local.rs#L99-L102
https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/engine/local.rs#L182-L192
To insert new rows into a table, we first have to perform some validation: check that the table
exists and validate the rows against the table schema (including checking for e.g. primary key
conflicts and foreign key references). We then store the rows as key/value pairs, using a
`Key::Row` with the table name and primary key value. And finally, we update secondary index entries
(if any).
https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/engine/local.rs#L252-L268
Row updates are similar to inserts, but in the case of a primary key change we instead delete the
old row and insert a new one, for simplicity. Secondary index updates also have to update both the
old and new entries.
https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/engine/local.rs#L296-L337
Row deletions are also similar: validate that the deletion is safe (e.g. check that there are no
foreign key references to it), then delete the `Key::Row` keys and any secondary index entries:
https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/engine/local.rs#L194-L246
To fetch rows by primary key, we simply call through to key/value gets using the appropriate
`Key::Row`:
https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/engine/local.rs#L248-L250
https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/engine/local.rs#L127-L133
Similarly, index lookups fetch a `Key::Index` for the indexed value, returning matching primary
keys:
https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/engine/local.rs#L270-L273
https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/engine/local.rs#L115-L125
Scanning table rows just performs a prefix scan with the appropriate `KeyPrefix::Row`, returning a
row iterator. This can optionally also do row filtering via filter pushdowns, which we'll revisit
when we look at the SQL optimizer.
https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/engine/local.rs#L275-L294
And with that, we can now store and retrieve SQL tables and rows on disk. Let's see how to replicate
it across nodes via Raft.
---
<p align="center">
← <a href="sql-data.md">SQL Data Model</a> | <a href="sql-raft.md">SQL Raft Replication</a> →
</p>
================================================
FILE: docs/architecture/sql.md
================================================
# SQL Engine
The SQL engine provides support for the SQL query language, and is the main database interface. It
uses a key/value store for data storage, MVCC for transactions, and Raft for replication. The SQL
engine itself consists of several distinct components that form a pipeline:
> Client → Session → Lexer → Parser → Planner → Optimizer → Executor → Storage
The SQL engine is located in the [`sql`](https://github.com/erikgrinaker/toydb/tree/b2fe7b76ee634ca6ad31616becabfddb1c03d34b/src/sql)
module. We'll discuss each of the components in a bottom-up manner.
The SQL engine is tested as a whole by test scripts under
[`src/sql/testscripts`](https://github.com/erikgrinaker/toydb/tree/9419bcf6aededf0e20b4e7485e2a5fa3e975d79f/src/sql/testscripts).
These typically take a raw SQL string as input, execute it against an in-memory storage engine,
and output the result along with intermediate state such as the query plan, storage operations,
and binary key/value data.
---
<p align="center">
← <a href="raft.md">Raft Consensus</a> | <a href="sql-data.md">SQL Data Model</a> →
</p>
================================================
FILE: docs/architecture/storage.md
================================================
# Storage Engine
toyDB uses an embedded [key/value store](https://en.wikipedia.org/wiki/Key–value_database) for data
storage, located in the [`storage`](https://github.com/erikgrinaker/toydb/tree/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/storage)
module. This stores arbitrary keys and values as binary byte strings. The storage engine doesn't
know or care what the keys and values contain -- we'll see later how the SQL data model, with tables
and rows, is mapped onto this key/value structure.
The storage engine supports simple set/get/delete operations on individual keys. It does not itself
support transactions -- this is built on top, and we'll get back to it shortly.
Keys are stored in sorted order. This allows range scans, where we can iterate over all key/value
pairs between two specific keys, or with a specific key prefix. This will be needed by other
components in the system, e.g. to scan all rows in a specific SQL table, to scan all versions of an
MVCC key, to scan the tail of the Raft log, etc.
The storage engine is pluggable: there are multiple implementations, and the user can choose which
one to use in the config file. These implement the `storage::Engine` trait:
https://github.com/erikgrinaker/toydb/blob/4804df254034c51f367d1380d389d80695cd7054/src/storage/engine.rs#L8-L58
Let's look at the existing storage engine implementations.
## `Memory` Storage Engine
The simplest storage engine is the `storage::Memory` engine. This is a trivial implementation which
stores data in memory using the Rust standard library's
[`BTreeMap`](https://doc.rust-lang.org/std/collections/struct.BTreeMap.html), without persisting
it to disk. It is primarily used for testing.
Since this is just a wrapper around the `BTreeMap` we can include it in its entirety here:
https://github.com/erikgrinaker/toydb/blob/8f8eae0dcf70b1a0df2e853b1f6600e0c7075340/src/storage/memory.rs#L8-L77
## `BitCask` Storage Engine
The main storage engine is `storage::BitCask`. This is a very simple variant of
[BitCask](https://riak.com/assets/bitcask-intro.pdf), used in the [Riak](https://riak.com/)
database. It is kind of like the [LSM-tree](https://en.wikipedia.org/wiki/Log-structured_merge-tree)'s
baby cousin.
https://github.com/erikgrinaker/toydb/blob/3e467512dca55843f0b071b3e239f14724f59a41/src/storage/bitcask.rs#L15-L55
toyDB's BitCask implementation uses a single append-only log file for storage. To write a key/value
pair, we simply append it to the file. To delete a key, we append a special tombstone value. When
reading a key, the last entry for that key in the file is used.
The file format for a key/value pair is simply:
1. The key length, as a big-endian `u32` (4 bytes).
2. The value length, as a big-endian `i32` (4 bytes). -1 if tombstone.
3. The binary key (n bytes).
4. The binary value (n bytes).
For example, the key/value pair `foo=bar` would be written as follows (in hexadecimal):
```
keylen valuelen key value
00000003 00000003 666f6f 626172
```
Because the data file is a simple log, we don't need a separate [write-ahead log](https://en.wikipedia.org/wiki/Write-ahead_logging)
for crash recovery -- the data file _is_ the write-ahead log.
To quickly look up key/value pairs when reading, we maintain an in-memory `KeyDir` index which maps
a key to the latest value's position in the file. All keys must therefore fit in memory.
https://github.com/erikgrinaker/toydb/blob/3e467512dca55843f0b071b3e239f14724f59a41/src/storage/bitcask.rs#L57-L65
We initially generate this index by scanning through the entire file when it is opened:
https://github.com/erikgrinaker/toydb/blob/3e467512dca55843f0b071b3e239f14724f59a41/src/storage/bitcask.rs#L267-L332
To write a key, we append it to the file and update the `KeyDir`:
https://github.com/erikgrinaker/toydb/blob/3e467512dca55843f0b071b3e239f14724f59a41/src/storage/bitcask.rs#L155-L159
https://github.com/erikgrinaker/toydb/blob/3e467512dca55843f0b071b3e239f14724f59a41/src/storage/bitcask.rs#L342-L366
To delete a key, we append a tombstone value instead:
https://github.com/erikgrinaker/toydb/blob/3e467512dca55843f0b071b3e239f14724f59a41/src/storage/bitcask.rs#L122-L126
To read a value for a key, we look up the key's file location in the `KeyDir` index (if the key
exists), and then read it from the file:
https://github.com/erikgrinaker/toydb/blob/3e467512dca55843f0b071b3e239f14724f59a41/src/storage/bitcask.rs#L334-L340
The `KeyDir` uses an inner stdlib `BTreeMap` to keep track of keys. This allows range scans, where
we iterate over a sorted set of keys between the range bounds, loading each key from the file:
https://github.com/erikgrinaker/toydb/blob/3e467512dca55843f0b071b3e239f14724f59a41/src/storage/bitcask.rs#L144-L146
https://github.com/erikgrinaker/toydb/blob/3e467512dca55843f0b071b3e239f14724f59a41/src/storage/bitcask.rs#L207-L225
As keys are updated and deleted, we'll keep accumulating old versions in the log file. To remove
these, the log file is compacted on startup. This writes out the latest value of every live
key/value pair to a new file, and replaces the old file. The keys are written in sorted order, to
make later scans faster.
https://github.com/erikgrinaker/toydb/blob/3e467512dca55843f0b071b3e239f14724f59a41/src/storage/bitcask.rs#L172-L195
---
<p align="center">
← <a href="overview.md">Overview</a> | <a href="encoding.md">Key/Value Encoding</a> →
</p>
================================================
FILE: docs/architecture.md
================================================
Moved to [`architecture/index.md`](architecture/index.md).
================================================
FILE: docs/crate/Cargo.toml
================================================
[package]
name = "toydb"
version = "1.0.1"
description = "A simple distributed SQL database, built for education"
authors = ["Erik Grinaker <erik@grinaker.org>"]
license = "Apache-2.0"
homepage = "https://github.com/erikgrinaker/toydb"
repository = "https://github.com/erikgrinaker/toydb"
edition = "2024"
================================================
FILE: docs/crate/README.md
================================================
# toyDB
toyDB is a distributed SQL database in Rust, built from scratch as an educational project. Main
features:
* Raft distributed consensus for linearizable state machine replication.
* ACID transactions with MVCC-based snapshot isolation.
* Pluggable storage engine with BitCask and in-memory backends.
* Iterator-based query engine with heuristic optimization and time-travel support.
* SQL interface including joins, aggregates, and transactions.
toyDB is not distributed as a crate, see <https://github.com/erikgrinaker/toydb> for more.
This crate used to contain the [joydb](https://crates.io/crates/joydb) database. Thanks to Serhii
Potapov for donating the crate name.
================================================
FILE: docs/crate/src/lib.rs
================================================
//! This crate is just a simple README.md placeholder. toydb is not intended to be used as a
//! library, and is not distributed as a crate. See <https://github.com/erikgrinaker/toydb>.
================================================
FILE: docs/examples.md
================================================
# SQL Examples
The following examples demonstrate some of toyDB's SQL features. For more details, see the
[SQL reference](sql.md).
- [Setup](#setup)
- [Creating Tables and Data](#creating-tables-and-data)
- [Constraints and Referential Integrity](#constraints-and-referential-integrity)
- [Basic SQL Queries](#basic-sql-queries)
- [Expressions](#expressions)
- [Joins](#joins)
- [Explain](#explain)
- [Aggregates](#aggregates)
- [Transactions](#transactions)
- [Time-Travel Queries](#time-travel-queries)
## Setup
To start a five-node cluster on the local machine (requires a working
[Rust compiler](https://www.rust-lang.org/tools/install)), run:
```
$ ./cluster/run.sh
toydb2 19:06:28 [ INFO] Listening on 0.0.0.0:9602 (SQL) and 0.0.0.0:9702 (Raft)
toydb2 19:06:28 [ERROR] Failed connecting to Raft peer 127.0.0.1:9705: Connection refused
toydb5 19:06:28 [ INFO] Listening on 0.0.0.0:9605 (SQL) and 0.0.0.0:9705 (Raft)
[...]
toydb5 19:06:29 [ INFO] Voting for toydb-d in term 1 election
toydb3 19:06:29 [ INFO] Voting for toydb-d in term 1 election
toydb4 19:06:29 [ INFO] Won election for term 1, becoming leader
```
In a separate terminal, start a `toysql` client and check the server status:
```
$ cargo run --release --bin toysql
Connected to toyDB node "toydb-a". Enter !help for instructions.
toydb> !status
Server: 5 (leader 4 in term 1 with 5 nodes)
Raft log: 1 committed, 0 applied, 0.000 MB (hybrid storage)
Node logs: 1:1 2:1 3:1 4:1 5:1
SQL txns: 0 active, 0 total (bitcask storage)
```
The cluster is shut down by pressing Ctrl-C. Data is saved under `cluster/toydb?/data/`,
delete the contents to start over.
## Creating Tables and Data
As a basis for later examples, we'll create a small movie database. The following SQL statements
can be pasted into `toysql`:
```sql
CREATE TABLE genres (
id INTEGER PRIMARY KEY,
name STRING NOT NULL
);
INSERT INTO genres VALUES
(1, 'Science Fiction'),
(2, 'Action'),
(3, 'Drama'),
(4, 'Comedy');
CREATE TABLE studios (
id INTEGER PRIMARY KEY,
name STRING NOT NULL
);
INSERT INTO studios VALUES
(1, 'Mosfilm'),
(2, 'Lionsgate'),
(3, 'StudioCanal'),
(4, 'Warner Bros'),
(5, 'Focus Features');
CREATE TABLE movies (
id INTEGER PRIMARY KEY,
title STRING NOT NULL,
studio_id INTEGER NOT NULL INDEX REFERENCES studios,
genre_id INTEGER NOT NULL INDEX REFERENCES genres,
released INTEGER NOT NULL,
rating FLOAT
);
INSERT INTO movies VALUES
(1, 'Stalker', 1, 1, 1979, 8.2),
(2, 'Sicario', 2, 2, 2015, 7.6),
(3, 'Primer', 3, 1, 2004, 6.9),
(4, 'Heat', 4, 2, 1995, 8.2),
(5, 'The Fountain', 4, 1, 2006, 7.2),
(6, 'Solaris', 1, 1, 1972, 8.1),
(7, 'Gravity', 4, 1, 2013, 7.7),
(8, '21 Grams', 5, 3, 2003, 7.7),
(9, 'Birdman', 4, 4, 2014, 7.7),
(10, 'Inception', 4, 1, 2010, 8.8),
(11, 'Lost in Translation', 5, 4, 2003, 7.7),
(12, 'Eternal Sunshine of the Spotless Mind', 5, 3, 2004, 8.3);
```
toyDB supports some basic datatypes, as well as primary keys, foreign keys, and column indexes.
For more information on these, see the [SQL reference](sql.md). Schema changes such as
`ALTER TABLE` are not supported, only `CREATE TABLE` and `DROP TABLE`.
The tables can be inspected via the `!tables` and `!table` commands:
```sql
toydb> !tables
genres
movies
studios
toydb> !table genres
CREATE TABLE genres (
id INTEGER PRIMARY KEY,
name STRING NOT NULL
)
```
## Constraints and Referential Integrity
Schemas enforce referential integrity and other constraints:
```sql
toydb> DROP TABLE studios;
Error: Table studios is referenced by table movies column studio_id
toydb> DELETE FROM studios WHERE id = 1;
Error: Primary key 1 is referenced by table movies column studio_id
toydb> UPDATE movies SET id = 1;
Error: Primary key 1 already exists for table movies
toydb> INSERT INTO movies VALUES (13, 'Nebraska', 6, 3, 2013, 7.7);
Error: Referenced primary key 6 in table studios does not exist
toydb> INSERT INTO movies VALUES (13, 'Nebraska', NULL, 3, 2013, 7.7);
Error: NULL value not allowed for column studio_id
toydb> INSERT INTO movies VALUES (13, 'Nebraska', 'Unknown', 3, 2013, 7.7);
Error: Invalid datatype STRING for INTEGER column studio_id
```
## Basic SQL Queries
Most basic SQL query functionality is supported:
```sql
toydb> SELECT * FROM studios;
1|Mosfilm
2|Lionsgate
3|StudioCanal
4|Warner Bros
5|Focus Features
toydb> SELECT title, rating FROM movies WHERE released >= 2000 ORDER BY rating DESC LIMIT 3;
Inception|8.8
Eternal Sunshine of the Spotless Mind|8.3
Gravity|7.7
```
Column headers can be enabled with `!headers on`:
```sql
toydb> !headers on
Headers enabled
toydb> SELECT id, name AS genre FROM genres;
id|genre
1|Science Fiction
2|Action
3|Drama
4|Comedy
```
## Expressions
All common mathematical operators are implemented:
```sql
toydb> SELECT 1 + 2 * 3;
7
toydb> SELECT (1 + 2) * 4 / -3;
-4
toydb> SELECT 3! + 7 % 4 - 2 ^ 3;
1
```
64-bit floating point arithmetic is also supported, including infinity and NaN:
```sql
toydb> SELECT 3.14 * 2.718;
8.53452
toydb> SELECT 1.0 / 0.0;
inf
toydb> SELECT 1e10 ^ 8;
100000000000000000000000000000000000000000000000000000000000000000000000000000000
toydb> SELECT 1e10 ^ 8 / INFINITY, 1e10 ^ 1e10, INFINITY / INFINITY;
0|inf|NaN
```
And of course three-valued logic:
```sql
toydb> SELECT TRUE AND TRUE, TRUE AND FALSE, TRUE AND NULL, FALSE AND NULL;
TRUE|FALSE|NULL|FALSE
toydb> SELECT TRUE OR FALSE, FALSE OR FALSE, TRUE OR NULL, FALSE OR NULL;
TRUE|FALSE|TRUE|NULL
toydb> SELECT NOT TRUE, NOT FALSE, NOT NULL;
FALSE|TRUE|NULL
```
Which would be useless without comparison operators for all types:
```sql
toydb> SELECT 3 > 1, 3 <= 1, 3 = 3.0;
TRUE|FALSE|TRUE
toydb> SELECT 'a' = 'A', 'foo' > 'bar', '👍' != '👎';
FALSE|TRUE|TRUE
toydb> SELECT INFINITY > -INFINITY, NULL = NULL;
TRUE|NULL
```
## Joins
No SQL database would be complete without joins, and toyDB supports most join types such as
inner joins (both implicit and explicit):
```sql
toydb> SELECT m.id, m.title, g.name FROM movies m JOIN genres g ON m.genre_id = g.id LIMIT 4;
1|Stalker|Science Fiction
2|Sicario|Action
3|Primer|Science Fiction
4|Heat|Action
toydb> SELECT m.id, m.title, g.name FROM movies m, genres g WHERE m.genre_id = g.id LIMIT 4;
1|Stalker|Science Fiction
2|Sicario|Action
3|Primer|Science Fiction
4|Heat|Action
```
Left and right outer joins:
```sql
toydb> SELECT s.id, s.name, g.name FROM studios s LEFT JOIN genres g ON s.id = g.id;
1|Mosfilm|Science Fiction
2|Lionsgate|Action
3|StudioCanal|Drama
4|Warner Bros|Comedy
5|Focus Features|NULL
toydb> SELECT g.id, g.name, s.name FROM genres g RIGHT JOIN studios s ON g.id = s.id;
1|Science Fiction|Mosfilm
2|Action|Lionsgate
3|Drama|StudioCanal
4|Comedy|Warner Bros
NULL|NULL|Focus Features
```
And cross joins (both implicit and explicit):
```sql
toydb> SELECT g.name, s.name FROM genres g, studios s WHERE s.name < 'S';
Science Fiction|Mosfilm
Science Fiction|Lionsgate
Science Fiction|Focus Features
Action|Mosfilm
Action|Lionsgate
Action|Focus Features
Drama|Mosfilm
Drama|Lionsgate
Drama|Focus Features
Comedy|Mosfilm
Comedy|Lionsgate
Comedy|Focus Features
```
We can join on arbitrary predicates, such as joining movies with any genres whose name is
ordered after the movie's title:
```sql
toydb> SELECT m.title, g.name
FROM movies m JOIN genres g ON g.name > m.title
ORDER BY m.title, g.name;
21 Grams|Action
21 Grams|Comedy
21 Grams|Drama
21 Grams|Science Fiction
Birdman|Comedy
Birdman|Drama
Birdman|Science Fiction
Eternal Sunshine of the Spotless Mind|Science Fiction
Gravity|Science Fiction
Heat|Science Fiction
Inception|Science Fiction
Lost in Translation|Science Fiction
Primer|Science Fiction
```
And we can join multiple tables, even using the same table multiple times - like in this example
where we find all science fiction movies released since 2000 by studios that have released any
movie rated 8 or higher:
```sql
toydb> SELECT m.id, m.title, g.name AS genre, m.released, s.name AS studio
FROM movies m JOIN genres g ON m.genre_id = g.id,
studios s JOIN movies good ON good.studio_id = s.id AND good.rating >= 8
WHERE m.studio_id = s.id AND m.released >= 2000 AND g.id = 1
ORDER BY m.title ASC;
7|Gravity|Science Fiction|2013|Warner Bros
10|Inception|Science Fiction|2010|Warner Bros
5|The Fountain|Science Fiction|2006|Warner Bros
```
## Explain
When optimizing complex queries with several joins, it can often be useful to inspect the query
plan via an `EXPLAIN` query:
```sql
toydb> EXPLAIN
SELECT m.id, m.title, g.name AS genre, m.released, s.name AS studio
FROM movies m JOIN genres g ON m.genre_id = g.id,
studios s JOIN movies good ON good.studio_id = s.id AND good.rating >= 8
WHERE m.studio_id = s.id AND m.released >= 2000 AND g.id = 1
ORDER BY m.title ASC;
Order: m.title asc
└─ Projection: m.id, m.title, g.name, m.released, s.name
└─ HashJoin: inner on m.studio_id = s.id
├─ HashJoin: inner on m.genre_id = g.id
│ ├─ Filter: m.released > 2000 OR m.released = 2000
│ │ └─ IndexLookup: movies as m column genre_id (1)
│ └─ KeyLookup: genres as g (1)
└─ HashJoin: inner on s.id = good.studio_id
├─ Scan: studios as s
└─ Scan: movies as good (good.rating > 8 OR good.rating = 8)
```
Here, we can see that the planner does a primary key lookup on `genres` and an index lookup on
`movies.genre_id`, filtering the resulting movies by release year and joining them. It also
does full table scans of `studios` and `movies` (to find the good movies) and joins them, pushing
the `rating >= 8` filter down to the `movies` table scan. The results of these two joins are also
joined to produce the final result, which is then formatted and sorted.
## Aggregates
Most basic aggregate functions are supported:
```sql
toydb> SELECT COUNT(*), MIN(rating), MAX(rating), AVG(rating), SUM(rating) FROM movies;
12|6.9|8.8|7.841666666666668|94.10000000000001
```
We can group by values and filter the aggregate results:
```sql
toydb> SELECT s.id, s.name, AVG(m.rating) AS average
FROM movies m JOIN studios s ON m.studio_id = s.id
GROUP BY s.id, s.name
HAVING average > 7.8
ORDER BY average DESC, s.name ASC;
1|Mosfilm|8.149999999999999
4|Warner Bros|7.919999999999999
5|Focus Features|7.900000000000001
```
And we can combine aggregate functions with arbitrary expressions, both inside and outside:
```sql
toydb> SELECT s.id, s.name, ((MAX(rating^2) - MIN(rating^2)) / AVG(rating^2)) ^ (0.5) AS spread
FROM movies m JOIN studios s ON m.studio_id = s.id
GROUP BY s.id, s.name
HAVING MAX(rating) - MIN(rating) > 0.5
ORDER BY spread DESC;
4|Warner Bros|0.6373540990222496
5|Focus Features|0.39194971607693424
```
## Transactions
toyDB supports ACID transactions via MVCC-based snapshot isolation. This provides atomic
transactions with good isolation, without taking out locks or blocking reads on writes. As a basic
example, the below transaction is rolled back without taking effect, as opposed to `COMMIT`
which would make it permanent:
```sql
toydb> BEGIN;
Began transaction 131
toydb:131> INSERT INTO genres VALUES (5, 'Western');
toydb:131> SELECT * FROM genres;
1|Science Fiction
2|Action
3|Drama
4|Comedy
5|Western
toydb:131> ROLLBACK;
Rolled back transaction 131
toydb> SELECT * FROM genres;
1|Science Fiction
2|Action
3|Drama
4|Comedy
```
We'll demonstrate transactions by covering most common transaction anomalies given two
concurrent sessions, and show how toyDB prevents these anomalies in all cases but one. In these
examples, the left half is user A and the right is user B. Time flows downwards such that
commands on the same line happen at the same time.
**Dirty write:** an uncommitted write by A should not be affected by a concurrent B write.
```sql
a> BEGIN;
a> INSERT INTO genres VALUES (5, 'Western');
b> INSERT INTO genres VALUES (5, 'Romance');
Error: Serialization failure, retry transaction
a> SELECT * FROM genres WHERE id = 5;
5|Western
```
The serialization failure here occurs because the first write always wins. This may not be an
optimal strategy, but it is correct in terms of preventing serialization anomalies.
**Dirty read:** an uncommitted write by A should not be visible to B until committed.
```sql
a> BEGIN;
a> INSERT INTO genres VALUES (5, 'Western');
b> SELECT * FROM genres WHERE id = 5;
No rows returned
a> COMMIT;
b> SELECT * FROM genres WHERE id = 5;
5|Western
```
**Lost update:** when A and B both read a value, before updating it in turn, the first write should
not be overwritten by the second.
```sql
a> BEGIN; b> BEGIN;
a> SELECT title, rating FROM movies WHERE id = 2; b> SELECT title, rating FROM movies WHERE id = 2;
Sicario|7.6 Sicario|7.6
a> UPDATE movies SET rating = 7.8 WHERE id = 2;
b> UPDATE movies SET rating = 7.7 WHERE id = 2;
Error: Serialization failure, retry transaction
a> COMMIT;
```
**Fuzzy read:** B should not see a value suddenly change in its transaction, even if A commits a
new value.
```sql
a> BEGIN; b> BEGIN;
b> SELECT * FROM genres WHERE id = 1;
1|Science Fiction
a> UPDATE genres SET name = 'Scifi' WHERE id = 1;
a> COMMIT;
b> SELECT * FROM genres WHERE id = 1;
1|Science Fiction
b> COMMIT;
b> SELECT * FROM genres WHERE id = 1;
1|Scifi
```
**Read skew:** if A reads two values, and B modifies the second value in between the reads, A
should see the old second value.
```sql
a> BEGIN;
a> SELECT * FROM genres WHERE id = 2;
2|Action
b> BEGIN;
b> UPDATE genres SET name = 'Drama' WHERE id = 2;
b> UPDATE genres SET name = 'Action' WHERE id = 3;
b> COMMIT;
a> SELECT * FROM genres WHERE id = 3;
3|Drama
```
**Phantom read:** when A runs a query with a predicate, and B commits a matching write, A should
not see the write when rerunning it.
```sql
a> BEGIN;
a> SELECT * FROM genres WHERE id > 2;
3|Drama
4|Comedy
b> INSERT INTO genres VALUES (5, 'Western');
a> SELECT * FROM genres WHERE id > 2;
3|Drama
4|Comedy
```
**Write skew:** when A reads row X and writes it to row Y, B should not concurrently be able to
read row Y and write it to row X.
```sql
a> BEGIN; b> BEGIN;
a> SELECT * FROM genres WHERE id = 2;
2|Action
b> SELECT * FROM genres WHERE id = 3;
3|Drama
b> UPDATE genres SET name = 'Drama' WHERE id = 2;
a> UPDATE genres SET name = 'Action' WHERE id = 3;
a> COMMIT; b> COMMIT;
```
Here, the writes actually go through. This anomaly is not protected against by snapshot isolation,
and thus not by toyDB either - doing so would require implementing serializable snapshot isolation.
However, this is the only common serialization anomaly not handled by toyDB, and is not among the
most severe.
## Time-Travel Queries
Since toyDB uses MVCC for transactions and keeps all historical versions, the state of the database
can be queried at any arbitrary point in the past. toyDB uses incremental transaction IDs as
logical timestamps:
```sql
toydb> SELECT * FROM genres;
1|Science Fiction
2|Drama
3|Action
4|Comedy
toydb> BEGIN;
Began transaction 173
toydb:173> UPDATE genres SET name = 'Scifi' WHERE id = 1;
toydb:173> INSERT INTO genres VALUES (5, 'Western');
toydb:173> COMMIT;
Committed transaction 173
toydb> SELECT * FROM genres;
1|Scifi
2|Drama
3|Action
4|Comedy
5|Western
toydb> BEGIN READ ONLY AS OF SYSTEM TIME 172;
Began read-only transaction 175 in snapshot at version 172
toydb@172> SELECT * FROM genres;
1|Science Fiction
2|Drama
3|Action
4|Comedy
```
================================================
FILE: docs/references.md
================================================
# References
This is the main research material I used while building toyDB. It is a subset of my
[reading list](https://github.com/erikgrinaker/readings).
## Introduction
Andy Pavlo's CMU lectures are an absolutely fantastic introduction to database internals:
- 🎥 [CMU 15-445 Intro to Database Systems](https://www.youtube.com/playlist?list=PLSE8ODhjZXjbohkNBWQs_otTrBTrjyohi) (A Pavlo 2019)
- 🎥 [CMU 15-721 Advanced Database Systems](https://www.youtube.com/playlist?list=PLSE8ODhjZXjasmrEd2_Yi1deeE360zv5O) (A Pavlo 2020)
Martin Kleppmann has written an excellent overview of database technologies and concepts, while Alex
Petrov goes in depth on implementation of storage engines and distributed systems algorithms:
- 📖 [Designing Data-Intensive Applications](https://dataintensive.net/) (M Kleppmann 2017)
- 📖 [Database Internals](https://www.databass.dev) (A Petrov 2019)
## Raft
The Raft consensus algorithm is described in a very readable paper by Diego Ongaro, and in a talk
given by his advisor John Ousterhout:
- 📄 [In Search of an Understandable Consensus Algorithm](https://raft.github.io/raft.pdf) (D Ongaro, J Ousterhout 2014)
- 🎥 [Designing for Understandability: The Raft Consensus Algorithm](https://www.youtube.com/watch?v=vYp4LYbnnW8) (J Ousterhout 2016)
However, Raft has several subtle pitfalls, and Jon Gjengset's student guide was very helpful in
drawing attention to these:
- 🔗 [Students' Guide to Raft](https://thesquareplanet.com/blog/students-guide-to-raft/) (J Gjengset 2016)
## Parsing
Thorsten Ball has written a very enjoyable hands-on introduction to parsers where he implements
first an interpreter and then a compiler for the made-up Monkey programming language (in Go):
- 📖 [Writing An Interpreter In Go](https://interpreterbook.com) (T Ball 2016)
- 📖 [Writing A Compiler In Go](https://compilerbook.com) (T Ball 2018)
The toyDB expression parser is inspired by a blog post by Eli Bendersky describing the precedence
climbing algorithm, which is the algorithm I found the most elegant:
- 💬 [Parsing Expressions by Precedence Climbing](https://eli.thegreenplace.net/2012/08/02/parsing-expressions-by-precedence-climbing) (E Bendersky 2012)
## Transactions
Jepsen (i.e. Kyle Kingsbury) has an excellent overview of consistency and isolation models, which
is very helpful in making sense of the jungle of overlapping and ill-defined terms:
- 🔗 [Consistency Models](https://jepsen.io/consistency) (Jepsen 2016)
For more background on this, in particular on how snapshot isolation provided by the MVCC
transaction engine used in toyDB does not fit into the traditional SQL isolation levels, the
following classic papers were useful:
- 📄 [A Critique of ANSI SQL Isolation Levels](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/tr-95-51.pdf) (H Berenson et al 1995)
- 📄 [Generalized Isolation Level Definitions](http://pmg.csail.mit.edu/papers/icde00.pdf) (A Adya, B Liskov, P O'Neil 2000)
As for actually implementing MVCC, I found blog posts to be the most helpful:
- 💬 [Implementing Your Own Transactions with MVCC](https://levelup.gitconnected.com/implementing-your-own-transactions-with-mvcc-bba11cab8e70) (E Chance 2015)
- 💬 [How Postgres Makes Transactions Atomic](https://brandur.org/postgres-atomicity) (B Leach 2017)
================================================
FILE: docs/sql.md
================================================
# SQL Reference
## Data Types
The following data types are supported:
* `BOOLEAN` (`BOOL`): logical truth values, i.e. true and false.
* `FLOAT` (`DOUBLE`): 64-bit signed floating point numbers, using [IEEE 754 `binary64`](https://en.wikipedia.org/wiki/binary64) encoding. Supports magnitudes of 10⁻³⁰⁷ to 10³⁰⁸ with 53-bit precision (~15 significant figures), as well as the special values infinity and NaN.
* `INTEGER` (`INT`): 64-bit signed integer numbers with a range of ±2⁶³-1.
* `STRING` (`TEXT`, `VARCHAR`): UTF-8 encoded strings.
In addition, the special `NULL` value is used for an unknown value, following the rules of [three-valued logic](https://en.wikipedia.org/wiki/Three-valued_logic).
Numeric types are not interchangeable; a float value (even without a fractional part) cannot be stored in an integer column and vice-versa.
## SQL Syntax
### Keywords
Keywords are reserved words with special meaning in SQL statements. They are case-insensitive, and must be quoted with `"` to be used as identifiers. The complete list is:
`AS`, `ASC`, `AND`, `BEGIN`, `BOOL`, `BOOLEAN`, `BY`, `COMMIT`, `CREATE`, `CROSS`, `DEFAULT`, `DELETE`, `DESC`, `DOUBLE`, `DROP`, `EXISTS`, `EXPLAIN`, `FALSE`, `FLOAT`, `FROM`, `GROUP`, `HAVING`, `IF`, `INDEX`, `INFINITY`, `INNER`, `INSERT`, `INT`, `INTEGER`, `INTO`, `IS`, `JOIN`, `KEY`, `LEFT`, `LIKE`, `LIMIT`, `NAN`, `NOT`, `NULL`, `OF`, `OFFSET`, `ON`, `ONLY`, `OR`, `ORDER`, `OUTER`, `PRIMARY`, `READ`, `REFERENCES`, `RIGHT`, `ROLLBACK`, `SELECT`, `SET`, `STRING`, `SYSTEM`, `TABLE`, `TEXT`, `TIME`, `TRANSACTION`, `TRUE`, `UNIQUE`, `UPDATE`, `VALUES`, `VARCHAR`, `WHERE`, `WRITE`
### Identifiers
Identifiers are names for database objects such as tables and columns. Unless quoted with `"`, they must begin with a Unicode letter followed by any combination of letters, numbers, and `_`, and cannot be reserved keywords. `""` can be used to escape a double quote character. They are always converted to lowercase.
### Constants
#### Named constants
The following keywords evaluate to constants:
* `FALSE`: the boolean false value.
* `INFINITY`: the floating-point value for infinity.
* `NAN`: the floating-point value for NaN (not a number).
* `NULL`: an unknown value.
* `TRUE`: the boolean true value.
#### String literals
String literals are surrounded by single quotes `'`, and can contain any valid UTF-8 character. Single quotes must be escaped by an additional single quote, i.e. `''`, no other escape sequences are supported. For example:
```
'A string with ''quotes'' and emojis 😀'
```
#### Numeric literals
Sequences of digits `0-9` are parsed as a 64-bit signed integer. Numbers with decimal points or in scientific notation are parsed as 64-bit floating point numbers. The following pattern is supported:
```
999[.[999]][e[+-]999]
```
The `-` prefix operator can be used to take negative numbers.
### Expressions
Expressions can be used wherever a value is expected, e.g. as `SELECT` columns and `INSERT` values. They are made up of constants, column references, operator invocations, and function calls.
Column references can either be unqualified, e.g. `name`, or prefixed with the relation identifier separated by `.`, e.g. `person.name`. Unqualified identifiers must be unambiguous.
## SQL Operators
### Logical operators
Logical operators apply standard logic operations on boolean operands.
* `AND`: the logical conjunction, e.g. `TRUE AND TRUE` yields `TRUE`.
* `OR`: the logical disjunction, e.g. `TRUE OR FALSE` yields `TRUE`.
* `NOT`: the logical negation, e.g. `NOT TRUE` yields `FALSE`.
The complete truth tables are:
| `AND` | `TRUE` | `FALSE` | `NULL` |
|-------------|---------|---------|---------|
| **`TRUE`** | `TRUE` | `FALSE` | `NULL` |
| **`FALSE`** | `FALSE` | `FALSE` | `FALSE` |
| **`NULL`** | `NULL` | `FALSE` | `NULL` |
| `OR` | `TRUE` | `FALSE` | `NULL` |
|-------------|--------|---------|--------|
| **`TRUE`** | `TRUE` | `TRUE` | `TRUE` |
| **`FALSE`** | `TRUE` | `FALSE` | `NULL` |
| **`NULL`** | `TRUE` | `NULL` | `NULL` |
| `NOT` | |
|-------------|---------|
| **`TRUE`** | `FALSE` |
| **`FALSE`** | `TRUE` |
| **`NULL`** | `NULL` |
### Comparison operators
Comparison operators compare values of the same data type, and return `TRUE` if the comparison holds or `FALSE` otherwise. `INTEGER` and `FLOAT` values are interchangeable. `STRING` comparisons use the string's byte values, i.e. case-sensitive with `'B' < 'a'` due to their UTF-8 code points. `FALSE` is considered lesser than `TRUE`. Comparison with `NULL` always yields `NULL` (even `NULL = NULL`).
Binary operators:
* `=`: equality, e.g. `1 = 1` yields `TRUE`.
* `!=`: inequality, e.g. `1 != 2` yields `TRUE`.
* `>`: greater than, e.g. `2 > 1` yields `TRUE`.
* `>=`: greater than or equal, e.g. `1 >= 1` yields `TRUE`.
* `<`: lesser than, e.g. `1 < 2` yields `TRUE`.
* `<=`: lesser than or equal, e.g. `1 <= 1` yields `TRUE`.
Unary operators:
* `IS NULL`: checks if the value is `NULL`, e.g. `NULL IS NULL` yields `TRUE`.
* `IS NOT NULL`: checks if the value is not `NULL`, e.g. `TRUE IS NOT NULL` yields `TRUE`.
* `IS NAN`: checks if the value is a float `NAN`, e.g. `NAN IS NAN` yields `TRUE`. Errors on
non-float datatypes, except `NULL` which yields `NULL`.
* `IS NOT NAN`: checks if the value is not a float `NAN`, e.g. `3.14 IS NOT NAN` yields `TRUE`.
### Mathematical operators
Mathematical operators apply standard math operations on numeric (`INTEGER` or `FLOAT`) operands. If either operand is a `FLOAT`, both operands are converted to `FLOAT` and the result is a `FLOAT`. If either operand is `NULL`, the result is `NULL`. The special values `INFINITY` and `NAN` are handled according to the IEEE 754 spec.
For `INTEGER` operands, failure conditions such as overflow and division by zero yield an error. For `FLOAT` operands, these return `INFINITY` or `NAN` as appropriate.
Binary operators:
* `+`: addition, e.g. `1 + 2` yields `3`.
* `-`: subtraction, e.g. `3 - 2` yields `1`.
* `*`: multiplication, e.g. `3 * 2` yields `6`.
* `/`: division, e.g. `6 / 2` yields `3`.
* `^`: exponentiation, e.g. `2 ^ 4` yields `16`.
* `%`: remainder, e.g. `8 % 3` yields `2`. Unlike modulo, the result has the sign of the dividend.
Unary operators:
* `+` (prefix): identity, e.g. `+1` yields `1`.
* `-` (prefix): negation, e.g. `- -2` yields `2`.
* `!` (postfix): factorial, e.g. `5!` yields `120`.
### String operators
String operators operate on string operands.
* `LIKE`: compares a string with the given pattern, using `%` as multi-character wildcard and `_` as single-character wildcard, returning `TRUE` if the string matches the pattern - e.g. `'abc' LIKE 'a%'` yields `TRUE`.
### Operator precedence
The operator precedence (order of operations) is as follows:
| Precedence | Operator | Associativity |
|------------|-------------------------|---------------|
| 10 | `+`, `-` (prefix) | Right |
| 9 | `!` (postfix) | Left |
| 8 | `^` | Right |
| 7 | `*`, `/`, `%` | Left |
| 6 | `+`, `-` | Left |
| 5 | `>`, `>=`, `<`, `<=` | Left |
| 4 | `=`, `!=`, `LIKE`, `IS` | Left |
| 3 | `NOT` | Right |
| 2 | `AND` | Left |
| 1 | `OR` | Left |
Precedence can be overridden by wrapping an expression in parentheses, e.g. `(1 + 2) * 3`.
### Functions
* `sqrt(expr)`: returns the square root of a numerical argument.
### Aggregate functions
Aggregate functions aggregate an expression across all rows, optionally grouped into buckets given by `GROUP BY`, and results can be filtered via `HAVING`.
* `AVG(expr)`: returns the average of numerical values.
* `COUNT(expr)`: returns the number of rows for which ***`expr`*** evaluates to a non-`NULL` value. `COUNT(*)` can be used to count all rows.
* `MAX(expr)`: returns the maximum value, according to the datatype's ordering.
* `MIN(expr)`: returns the minimum value, according to the datatype's ordering.
* `SUM(expr)`: returns the sum of numerical values.
## SQL Statements
### `BEGIN`
Starts a new [transaction](#transactions).
<pre>
BEGIN [ TRANSACTION ] [ READ ONLY | READ WRITE ] [ AS OF SYSTEM TIME <b><i>txn_id</i></b> ]
</pre>
* ***`txn_id`***: A past transaction ID to run a read-only transaction for, for time-travel queries.
### `COMMIT`
Commits an active [transaction](#transactions).
### `CREATE TABLE`
Creates a new table.
<pre>
CREATE TABLE <b><i>table_name</i></b> (
[ <b><i>column_name</i></b> <b><i>data_type</i></b> [ <b><i>column_constraint</i></b> [ ... ] ] [ INDEX ] [, ... ] ]
)
where <b><i>column_constraint</i></b> is:
{ NOT NULL | NULL | PRIMARY KEY | DEFAULT <b><i>expr</i></b> | REFERENCES <b><i>ref_table</i></b> | UNIQUE }
</pre>
* ***`table_name`***: The name of the table. Must be a [valid identifier](#identifiers). Errors if a table with this name already exists.
* ***`column_name`***: The name of the column. Must be a [valid identifier](#identifiers), and unique within the table.
* ***`data_type`***: The data type of the column, see [data types](#data-types) for valid types.
* `NOT NULL`: The column may not contain `NULL` values.
* `NULL`: The column may contain `NULL` values. This is the default.
* `PRIMARY KEY`: The column should act as a primary key, i.e. the main row identifier. A table must have exactly one primary key column, and it must be unique and non-nullable.
* `DEFAULT` ***`expr`***: Specifies a default value for the column when `INSERT` statements do not give a value. ***`expr`*** can be any constant expression of an appropriate data type, e.g. `'abc'` or `1 + 2 * 3`. For nullable columns, the default value is `NULL` unless specified otherwise.
* `REFERENCES` ***`ref_table`***: The column is a foreign key to ***`ref_table`***'s primary key, enforcing referential integrity.
* `UNIQUE`: The column may only contain unique (distinct) values. `NULL` values are not considered equal, thus a `UNIQUE` column which allows `NULL` may contain multiple `NULL` values. `PRIMARY KEY` columns are implicitly `UNIQUE`.
* `INDEX`: Create an index for the column.
#### Example
```sql
CREATE TABLE movie (
id INTEGER PRIMARY KEY,
title STRING NOT NULL,
release_year INTEGER INDEX,
imdb_id STRING INDEX UNIQUE,
bluray BOOLEAN NOT NULL DEFAULT TRUE
)
```
### `DELETE`
Deletes rows in a table.
<pre>
DELETE FROM <b><i>table_name</i></b>
[ WHERE <b><i>predicate</i></b> ]
</pre>
Deletes rows where ***`predicate`*** evaluates to `TRUE`, or all rows if no `WHERE` clause is given.
* ***`table_name`***: the table to delete from. Errors if it does not exist.
* ***`predicate`***: an expression which determines which rows to delete by evaluating to `TRUE`. Must evaluate to a `BOOLEAN` or `NULL`, otherwise an error is returned.
#### Example
```sql
DELETE FROM movie
WHERE release_year < 2000 AND bluray = FALSE
```
### `DROP TABLE`
Deletes a table and all contained data. Errors if the table does not
exist, unless `IF EXISTS` is given.
<pre>
DROP TABLE [ IF EXISTS ] <b><i>table_name</i></b>
</pre>
* ***`table_name`***: the table to delete.
### `EXPLAIN`
Outputs the execution plan for the given statement.
<pre>
EXPLAIN [ <b><i>statement</i></b> ]
</pre>
### `INSERT`
Inserts rows into a table.
<pre>
INSERT INTO <b><i>table_name</i></b>
[ ( <b><i>column_name</i></b> [, ... ] ) ]
VALUES ( <b><i>expression</i></b> [, ... ] ) [, ... ]
</pre>
If column names are given, an identical number of values must be given. If no column names are given, values must be given in the table's column order. Omitted columns will get a default value if specified, otherwise an error will be returned.
* ***`table_name`***: the table to insert into. Errors if it does not exist.
* ***`column_name`***: a column to insert into in the given table. Errors if it does not exist.
* ***`expression`***: an expression to insert into the corresponding column. Must be a constant expression, i.e. it cannot refer to table columns.
#### Example
```sql
INSERT INTO movie
(id, title, release_year)
VALUES
(1, 'Sicario', 2015),
(2, 'Stalker', 1979),
(3, 'Her', 2013)
```
### `ROLLBACK`
Rolls back an active [transaction](#transactions).
### `SELECT`
Selects rows from a table.
<pre>
SELECT [ * | <b><i>expression</i></b> [ [ AS ] <b><i>output_name</i></b> [, ...] ] ]
[ FROM <b><i>from_item</i></b> [, ...] ]
[ WHERE <b><i>predicate</i></b> ]
[ GROUP BY <b><i>group_expr</i></b> [, ...] ]
[ HAVING <b><i>having_expr</i></b> ]
[ ORDER BY <b><i>order_expr</i></b> [ ASC | DESC ] [, ...] ]
[ LIMIT <b><i>count</i></b> ]
[ OFFSET <b><i>start</i></b> ]
where <b><i>from_item</i></b> is one of:
<b><i>table_name</i></b> [ [ AS ] <b><i>alias</i></b> ]
<b><i>from_item</i></b> <b><i>join_type</i></b> <b><i>from_item</i></b> [ ON <b><i>join_predicate</i></b> ]
where <b><i>join_type</i></b> is one of:
CROSS JOIN
[ INNER ] JOIN
LEFT [ OUTER ] JOIN
RIGHT [ OUTER ] JOIN
</pre>
Fetches rows or expressions, either from table ***`table_name`*** (if given) or generated.
* ***`expression`***: [expression](#expressions) to fetch (can be a simple column name).
* ***`output_name`***: output column [identifier](#identifiers), defaults to column name (if single column) otherwise nothing (displayed as `?`).
* ***`table_name`***: table to fetch rows from.
* ***`alias`***: table alias.
* ***`predicate`***: only return rows for which this [expression](#expressions) evaluates to `TRUE`.
* ***`group_expr`***: an expression to group aggregates by. Non-aggregate `SELECT` expressions must either reference a column given in `group_expr`, be identical with a `group_expr`, or have an `output_name` that is referenced by a `group_expr` column.
* ***`having_expr`***: only return aggregate results for which this [expression](#expressions) evaluates to `TRUE`.
* ***`order_expr`***: order rows by this expression (can be a simple column name).
* ***`count`***: maximum number of rows to return. Must be a constant integer expression.
* ***`start`***: number of rows to skip. Must be a constant integer expression.
* ***`join_predicate`***: only return rows for which this [expression](#expressions) evaluates to `TRUE`.
Join types:
* `CROSS JOIN`: returns the Cartesian product of the joined tables. Does not accept a join predicate (`ON` clause).
* `INNER JOIN`: returns the rows of the tables' Cartesian product for which ***`join_predicate`*** evaluates to `TRUE`.
* `LEFT OUTER JOIN`: returns the rows joined on the ***`join_predicate`***, or for any rows in the left table that do not have a match in the right table a single row is returned with the right table's columns set to `NULL`.
* `RIGHT OUTER JOIN`: the same as a `LEFT OUTER JOIN` but with the left and right tables switched.
#### Example
```sql
SELECT id, title, 2020 - released AS age
FROM movies
WHERE released >= 2000 AND ultrahd
ORDER BY released DESC, title ASC
LIMIT 10
OFFSET 10
```
### `UPDATE`
Updates rows in a table.
<pre>
UPDATE <b><i>table_name</i></b>
SET <b><i>column_name</i></b> = <b><i>expression</i></b> | DEFAULT [, ... ]
[ WHERE <b><i>predicate</i></b> ]
</pre>
Updates columns given by ***`column_name`*** to the corresponding ***`expression`*** for all rows where ***`predicate`*** evaluates to `TRUE`. If no `WHERE` clause is given, all rows are updated.
* ***`table_name`***: the table to update. Errors if it does not exist.
* ***`column_name`***: a column to update. Errors if it does not exist.
* ***`expression`***: an expression whose evaluated value will be set for the corresponding column and row. Expressions can refer to column values, and must evaluate to the same datatype as the updated column. Using `DEFAULT` will set the column's default value, if any.
* ***`predicate`***: an expression which determines which rows to update by evaluating to `TRUE`. Must evaluate to a `BOOLEAN` or `NULL`, otherwise an error is returned.
#### Example
```sql
UPDATE movie
SET bluray = TRUE
WHERE release_year >= 2000 AND bluray = FALSE
```
## Transactions
toyDB supports ACID transactions using MVCC-based snapshot isolation, protecting from the following anomalies: dirty writes, dirty reads, lost updates, fuzzy reads, read skew, and phantom reads. However, write skew anomalies are possible since serializable snapshot isolation is not implemented.
A new transaction is started with `BEGIN`, and ended with either `COMMIT` (atomically writing all changes) or `ROLLBACK` (discarding all changes). If any conflicts occur between concurrent transactions, the lowest transaction ID wins and the others will fail with a serialization error and must retry.
All past data is versioned and retained, and can be queried as of a given transaction ID via `BEGIN TRANSACTION READ ONLY AS OF SYSTEM TIME <txn_id>`.
A transaction is still valid for use if a contained statement returns an error. It is up to the client to take appropriate action.
================================================
FILE: docs/tools/update-links.py
================================================
#!/usr/bin/env python3
#
# Updates GitHub code links to the latest commit SHA.
import os, re, sys, argparse
import requests
GITHUB_API = "https://api.github.com"
def get_latest_sha(owner, repo, path, token, timeout=30):
    """Return the SHA of the newest commit on `main` that touched `path`.

    Queries the GitHub "list commits" endpoint for the repository
    `owner/repo`, filtered by `path` and limited to a single result.

    Args:
        owner: Repository owner (user or organisation) name.
        repo: Repository name.
        path: File path inside the repository to look up.
        token: Optional GitHub API token. When truthy it is sent in the
            `Authorization` header; otherwise the request is unauthenticated.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout a stalled connection would hang the tool
            indefinitely.

    Returns:
        The commit SHA string, or None if the API returned no commits.

    Raises:
        requests.HTTPError: If the API responds with an error status.
    """
    url = f"{GITHUB_API}/repos/{owner}/{repo}/commits"
    headers = {}
    if token:
        headers["Authorization"] = f"token {token}"
    params = {"path": path, "sha": "main", "per_page": 1}
    resp = requests.get(url, headers=headers, params=params, timeout=timeout)
    resp.raise_for_status()
    data = resp.json()
    # An empty list means no commit on main touches this path.
    return data[0]["sha"] if data else None
def process_markdown(text, token):
    """Rewrite GitHub blob links in `text` to point at the latest commit SHA.

    Every URL of the form
    `https://github.com/<owner>/<repo>/blob/<sha>/<path>` is looked up via
    `get_latest_sha`, and its `<sha>` segment is replaced when a different,
    newer SHA is returned. Lookups are cached per (owner, repo, path) so a
    file linked several times is only queried once.

    Progress messages are written to stderr so that they never mix with the
    rewritten markdown when the tool is used as a stdin/stdout filter.

    Args:
        text: Markdown source to process.
        token: Optional GitHub API token, passed through to `get_latest_sha`.

    Returns:
        The text with outdated SHAs replaced; all other content is unchanged.
    """
    pattern = re.compile(
        r"https://github\.com/(?P<owner>[^/]+)/(?P<repo>[^/]+)/blob/"
        r"(?P<oldsha>[0-9a-f]{7,40})/(?P<path>[^#)\s]+)"
    )
    cache = {}

    def replacer(m):
        url = m.group(0)
        print(f"Checking {url}", file=sys.stderr)
        owner, repo, oldsha, path = m.group("owner", "repo", "oldsha", "path")
        key = (owner, repo, path)
        print(f"Key: {key}", file=sys.stderr)
        if key not in cache:
            cache[key] = get_latest_sha(owner, repo, path, token)
        newsha = cache[key]
        if newsha and newsha != oldsha:
            print(f"Updating {url} to {newsha}", file=sys.stderr)
            # Splice only the SHA segment. A plain str.replace would also
            # rewrite the same hex string if it appears in the file path.
            start = m.start("oldsha") - m.start()
            end = m.end("oldsha") - m.start()
            return url[:start] + newsha + url[end:]
        return url

    return pattern.sub(replacer, text)
def main():
    """Command-line entry point.

    With a file argument, the file is read, its GitHub blob links are
    updated, and the result is written back in place. Without one, the
    markdown is read from stdin and the updated text is written to stdout.
    The GitHub API token is taken from the GITHUB_TOKEN environment variable,
    if set.
    """
    p = argparse.ArgumentParser(description="Update GitHub blob links to latest SHAs")
    p.add_argument("file", nargs="?", help="Markdown file to update (defaults to stdin/stdout)")
    args = p.parse_args()
    token = os.getenv("GITHUB_TOKEN")
    if args.file:
        # Read via a context manager so the handle is closed before the same
        # path is reopened for writing.
        with open(args.file, encoding="utf-8") as f:
            text = f.read()
        updated = process_markdown(text, token)
        with open(args.file, "w", encoding="utf-8") as f:
            f.write(updated)
    else:
        text = sys.stdin.read()
        sys.stdout.write(process_markdown(text, token))


if __name__ == "__main__":
    main()
================================================
FILE: rust-toolchain
================================================
1.93.1
================================================
FILE: rustfmt.toml
================================================
use_small_heuristics = "Max"
================================================
FILE: src/bin/toydb.rs
================================================
//! The toyDB server. Takes configuration from a config file (default
//! config/toydb.yaml) or corresponding TOYDB_ environment variables. Listens
//! for SQL clients (default port 9601) and Raft connections from other toyDB
//! peers (default port 9701). The Raft log and SQL database are stored at
//! data/raft and data/sql by default.
//!
//! Use the toysql command-line client to connect to the server.
#![warn(clippy::all)]
use std::collections::HashMap;
use std::path::Path;
use clap::Parser as _;
use serde::Deserialize;
use toydb::Server;
use toydb::errinput;
use toydb::error::Result;
use toydb::raft;
use toydb::sql;
use toydb::storage;
/// Parses the command-line arguments and runs the server command. If the
/// command fails, the error is printed to stderr and the process exits with
/// a non-zero status so that scripts and supervisors can detect the failure.
fn main() {
    if let Err(error) = Command::parse().run() {
        eprintln!("Error: {error}");
        // Returning normally from main would report success (exit status 0).
        std::process::exit(1);
    }
}
/// The toyDB server configuration. Can be provided via config file (default
/// config/toydb.yaml) or TOYDB_ environment variables. The defaults noted
/// on each field below are the ones set by `Config::load`.
#[derive(Debug, Deserialize)]
struct Config {
/// The node ID. Must be unique in the cluster. Defaults to 1.
id: raft::NodeID,
/// The other nodes in the cluster, and their Raft TCP addresses, keyed by
/// node ID. `Config::load` sets no default for this field.
peers: HashMap<raft::NodeID, String>,
/// The Raft listen address. Defaults to localhost:9701.
listen_raft: String,
/// The SQL listen address. Defaults to localhost:9601.
listen_sql: String,
/// The log level. Defaults to "info".
log_level: String,
/// The path to this node's data directory. The Raft log is stored in
/// the file "raft", and the SQL state machine in "sql". Defaults to "data".
data_dir: String,
/// The Raft storage engine: bitcask or memory. Defaults to bitcask.
storage_raft: String,
/// The SQL storage engine: bitcask or memory. Defaults to bitcask.
storage_sql: String,
/// If false, don't fsync Raft log writes to disk. Disabling this
/// will yield much better write performance, but may lose data on
/// host crashes which compromises Raft safety guarantees. Defaults to true.
fsync: bool,
/// The garbage fraction threshold at which to trigger compaction.
/// Defaults to 0.2.
compact_threshold: f64,
/// The minimum bytes of garbage before triggering compaction.
/// Defaults to 1,000,000.
compact_min_bytes: u64,
}
impl Config {
    /// Builds the server configuration from built-in defaults, the given
    /// config file, and TOYDB_-prefixed environment variables, then
    /// deserializes the result into a `Config`.
    fn load(file: &str) -> Result<Self> {
        // Stage 1: register a default for every setting except `peers`.
        let defaults = config::Config::builder()
            .set_default("id", "1")?
            .set_default("listen_sql", "localhost:9601")?
            .set_default("listen_raft", "localhost:9701")?
            .set_default("log_level", "info")?
            .set_default("data_dir", "data")?
            .set_default("storage_raft", "bitcask")?
            .set_default("storage_sql", "bitcask")?
            .set_default("fsync", true)?
            .set_default("compact_threshold", 0.2)?
            .set_default("compact_min_bytes", 1_000_000)?;

        // Stage 2: add the config file, then the environment, as sources.
        // NOTE(review): presumably later sources take precedence over earlier
        // ones -- confirm against the config crate documentation.
        let file_source = config::File::with_name(file);
        let env_source = config::Environment::with_prefix("TOYDB");
        let settings = defaults.add_source(file_source).add_source(env_source).build()?;

        // Stage 3: map the merged settings onto this struct.
        let loaded: Self = settings.try_deserialize()?;
        Ok(loaded)
    }
}
// Command-line interface of the toyDB server binary. The argument parser is
// generated by the `clap::Parser` derive below.
// NOTE(review): clap's derive presumably uses the `///` doc comments on this
// struct and its fields as help text, so treat them as user-facing strings
// -- confirm against the clap derive documentation before rewording them.
/// The toyDB server command.
#[derive(clap::Parser)]
#[command(about = "Starts a toyDB server.", version, propagate_version = true)]
struct Command {
// Settable with -c or --config; the value is a path kept as a plain String.
/// The configuration file path.
#[arg(short = 'c', long, default_value = "config/toydb.yaml")]
config: String,
}
impl Command {
/// Runs the toyDB server.
fn run(self) -> Result<()> {
// Load the c
gitextract_nc06cv1f/
├── .github/
│ └── workflows/
│ └── ci.yml
├── .gitignore
├── Cargo.toml
├── LICENSE
├── README.md
├── cluster/
│ ├── run.sh
│ ├── toydb1/
│ │ └── toydb.yaml
│ ├── toydb2/
│ │ └── toydb.yaml
│ ├── toydb3/
│ │ └── toydb.yaml
│ ├── toydb4/
│ │ └── toydb.yaml
│ └── toydb5/
│ └── toydb.yaml
├── config/
│ └── toydb.yaml
├── docs/
│ ├── architecture/
│ │ ├── README.md
│ │ ├── client.md
│ │ ├── encoding.md
│ │ ├── index.md
│ │ ├── mvcc.md
│ │ ├── overview.md
│ │ ├── raft.md
│ │ ├── server.md
│ │ ├── sql-data.md
│ │ ├── sql-execution.md
│ │ ├── sql-optimizer.md
│ │ ├── sql-parser.md
│ │ ├── sql-planner.md
│ │ ├── sql-raft.md
│ │ ├── sql-storage.md
│ │ ├── sql.md
│ │ └── storage.md
│ ├── architecture.md
│ ├── crate/
│ │ ├── Cargo.toml
│ │ ├── README.md
│ │ └── src/
│ │ └── lib.rs
│ ├── examples.md
│ ├── references.md
│ ├── sql.md
│ └── tools/
│ └── update-links.py
├── rust-toolchain
├── rustfmt.toml
├── src/
│ ├── bin/
│ │ ├── toydb.rs
│ │ ├── toydump.rs
│ │ ├── toysql.rs
│ │ └── workload.rs
│ ├── client.rs
│ ├── encoding/
│ │ ├── bincode.rs
│ │ ├── format.rs
│ │ ├── keycode.rs
│ │ └── mod.rs
│ ├── error.rs
│ ├── lib.rs
│ ├── raft/
│ │ ├── log.rs
│ │ ├── message.rs
│ │ ├── mod.rs
│ │ ├── node.rs
│ │ ├── state.rs
│ │ └── testscripts/
│ │ ├── log/
│ │ │ ├── append
│ │ │ ├── commit
│ │ │ ├── get
│ │ │ ├── has
│ │ │ ├── init
│ │ │ ├── scan
│ │ │ ├── scan_apply
│ │ │ ├── splice
│ │ │ ├── status
│ │ │ └── term
│ │ └── node/
│ │ ├── append
│ │ ├── append_base_missing
│ │ ├── append_base_missing_all
│ │ ├── append_commit_quorum
│ │ ├── append_initial
│ │ ├── append_max_entries
│ │ ├── append_pipeline
│ │ ├── append_probe_divergent_first
│ │ ├── append_probe_divergent_long
│ │ ├── append_probe_divergent_short
│ │ ├── append_probe_divergent_single
│ │ ├── append_response_beyond_last_index_panics
│ │ ├── append_response_stale_reject
│ │ ├── election
│ │ ├── election_candidate_behind_leader
│ │ ├── election_candidate_behind_quorum
│ │ ├── election_contested
│ │ ├── election_tie
│ │ ├── election_tie_even
│ │ ├── heartbeat_commits_follower
│ │ ├── heartbeat_converts_candidate
│ │ ├── heartbeat_converts_follower
│ │ ├── heartbeat_converts_follower_leaderless
│ │ ├── heartbeat_converts_leader
│ │ ├── heartbeat_lost_append_duplicate
│ │ ├── heartbeat_lost_append_multiple
│ │ ├── heartbeat_lost_append_single
│ │ ├── heartbeat_lost_read
│ │ ├── heartbeat_match_commits
│ │ ├── heartbeat_multiple_leaders_panic
│ │ ├── heartbeat_old_commit_index
│ │ ├── heartbeat_old_last_index
│ │ ├── heartbeat_probe_divergent
│ │ ├── old_campaign_rejected
│ │ ├── old_campaign_response_ignored
│ │ ├── old_heartbeat_ignored
│ │ ├── request_candidate_abort
│ │ ├── request_follower
│ │ ├── request_follower_campaign_abort
│ │ ├── request_follower_disconnect_stall
│ │ ├── request_follower_leaderless_abort
│ │ ├── request_leader
│ │ ├── request_leader_campaign_abort
│ │ ├── request_leader_change_linearizability
│ │ ├── request_leader_disconnect
│ │ ├── request_leader_read_quorum
│ │ ├── request_leader_read_quorum_sequence
│ │ ├── request_leader_single
│ │ ├── request_status
│ │ ├── request_status_single
│ │ ├── restart
│ │ ├── restart_apply
│ │ ├── restart_commit_recover
│ │ ├── restart_term_vote
│ │ ├── tick_candidate
│ │ ├── tick_follower
│ │ ├── tick_follower_leaderless
│ │ └── tick_leader
│ ├── server.rs
│ ├── sql/
│ │ ├── engine/
│ │ │ ├── engine.rs
│ │ │ ├── local.rs
│ │ │ ├── mod.rs
│ │ │ └── raft.rs
│ │ ├── execution/
│ │ │ ├── aggregator.rs
│ │ │ ├── executor.rs
│ │ │ ├── join.rs
│ │ │ ├── mod.rs
│ │ │ └── session.rs
│ │ ├── mod.rs
│ │ ├── parser/
│ │ │ ├── ast.rs
│ │ │ ├── lexer.rs
│ │ │ ├── mod.rs
│ │ │ └── parser.rs
│ │ ├── planner/
│ │ │ ├── mod.rs
│ │ │ ├── optimizer.rs
│ │ │ ├── plan.rs
│ │ │ └── planner.rs
│ │ ├── testscripts/
│ │ │ ├── expressions/
│ │ │ │ ├── cnf
│ │ │ │ ├── func
│ │ │ │ ├── func_sqrt
│ │ │ │ ├── literals
│ │ │ │ ├── op_compare_equal
│ │ │ │ ├── op_compare_greater
│ │ │ │ ├── op_compare_greater_equal
│ │ │ │ ├── op_compare_is_nan
│ │ │ │ ├── op_compare_is_null
│ │ │ │ ├── op_compare_lesser
│ │ │ │ ├── op_compare_lesser_equal
│ │ │ │ ├── op_compare_not_equal
│ │ │ │ ├── op_logic_and
│ │ │ │ ├── op_logic_not
│ │ │ │ ├── op_logic_or
│ │ │ │ ├── op_math_add
│ │ │ │ ├── op_math_divide
│ │ │ │ ├── op_math_exponentiate
│ │ │ │ ├── op_math_factorial
│ │ │ │ ├── op_math_identity
│ │ │ │ ├── op_math_multiply
│ │ │ │ ├── op_math_negate
│ │ │ │ ├── op_math_remainder
│ │ │ │ ├── op_math_subtract
│ │ │ │ ├── op_precedence
│ │ │ │ └── op_string_like
│ │ │ ├── optimizers/
│ │ │ │ ├── constant_folder
│ │ │ │ ├── filter_pushdown
│ │ │ │ ├── hash_join
│ │ │ │ ├── index_lookup
│ │ │ │ └── short_circuit
│ │ │ ├── queries/
│ │ │ │ ├── aggregate
│ │ │ │ ├── clauses
│ │ │ │ ├── group_by
│ │ │ │ ├── having
│ │ │ │ ├── join_cross
│ │ │ │ ├── join_inner
│ │ │ │ ├── join_outer
│ │ │ │ ├── limit
│ │ │ │ ├── offset
│ │ │ │ ├── order
│ │ │ │ ├── select
│ │ │ │ ├── where_
│ │ │ │ ├── where_index
│ │ │ │ └── where_primary_key
│ │ │ ├── schema/
│ │ │ │ ├── create_table
│ │ │ │ ├── create_table_datatypes
│ │ │ │ ├── create_table_default
│ │ │ │ ├── create_table_index
│ │ │ │ ├── create_table_names
│ │ │ │ ├── create_table_null
│ │ │ │ ├── create_table_primary_key
│ │ │ │ ├── create_table_reference
│ │ │ │ ├── create_table_transaction
│ │ │ │ ├── create_table_unique
│ │ │ │ ├── drop_table
│ │ │ │ ├── drop_table_index
│ │ │ │ ├── drop_table_ref
│ │ │ │ └── drop_table_transaction
│ │ │ ├── transactions/
│ │ │ │ ├── anomaly_dirty_read
│ │ │ │ ├── anomaly_dirty_write
│ │ │ │ ├── anomaly_fuzzy_read
│ │ │ │ ├── anomaly_lost_update
│ │ │ │ ├── anomaly_phantom_read
│ │ │ │ ├── anomaly_read_skew
│ │ │ │ ├── anomaly_write_skew
│ │ │ │ ├── begin
│ │ │ │ ├── commit
│ │ │ │ ├── isolation
│ │ │ │ ├── rollback
│ │ │ │ └── schema
│ │ │ └── writes/
│ │ │ ├── delete
│ │ │ ├── delete_index
│ │ │ ├── delete_reference
│ │ │ ├── delete_where
│ │ │ ├── insert
│ │ │ ├── insert_datatypes
│ │ │ ├── insert_default
│ │ │ ├── insert_index
│ │ │ ├── insert_null
│ │ │ ├── insert_primary_key
│ │ │ ├── insert_reference
│ │ │ ├── insert_unique
│ │ │ ├── update
│ │ │ ├── update_datatypes
│ │ │ ├── update_default
│ │ │ ├── update_expression
│ │ │ ├── update_index
│ │ │ ├── update_null
│ │ │ ├── update_primary_key
│ │ │ ├── update_reference
│ │ │ ├── update_unique
│ │ │ └── update_where
│ │ └── types/
│ │ ├── expression.rs
│ │ ├── mod.rs
│ │ ├── schema.rs
│ │ └── value.rs
│ └── storage/
│ ├── bitcask.rs
│ ├── engine.rs
│ ├── memory.rs
│ ├── mod.rs
│ ├── mvcc.rs
│ └── testscripts/
│ ├── bitcask/
│ │ ├── compact
│ │ ├── compact_open
│ │ ├── log
│ │ └── status
│ ├── engine/
│ │ ├── keys
│ │ ├── point
│ │ ├── scan
│ │ └── scan_prefix
│ ├── memory/
│ │ └── status
│ └── mvcc/
│ ├── anomaly_dirty_read
│ ├── anomaly_dirty_write
│ ├── anomaly_fuzzy_read
│ ├── anomaly_lost_update
│ ├── anomaly_phantom_read
│ ├── anomaly_read_skew
│ ├── anomaly_write_skew
│ ├── bank
│ ├── begin
│ ├── begin_as_of
│ ├── begin_readonly
│ ├── delete
│ ├── delete_conflict
│ ├── get
│ ├── get_isolation
│ ├── resume
│ ├── rollback
│ ├── scan
│ ├── scan_isolation
│ ├── scan_key_version_encoding
│ ├── scan_prefix
│ ├── set
│ ├── set_conflict
│ └── unversioned
└── tests/
├── scripts/
│ ├── anomalies
│ ├── client
│ ├── errors
│ ├── isolation
│ └── queries
├── testcluster.rs
└── tests.rs
SYMBOL INDEX (962 symbols across 40 files)
FILE: docs/tools/update-links.py
function get_latest_sha (line 10) | def get_latest_sha(owner, repo, path, token):
function process_markdown (line 21) | def process_markdown(text, token):
function main (line 41) | def main():
FILE: src/bin/toydb.rs
function main (line 24) | fn main() {
type Config (line 33) | struct Config {
method load (line 63) | fn load(file: &str) -> Result<Self> {
type Command (line 85) | struct Command {
method run (line 93) | fn run(self) -> Result<()> {
FILE: src/bin/toydump.rs
function main (line 13) | fn main() {
type Command (line 22) | struct Command {
method run (line 35) | fn run(self) -> Result<()> {
FILE: src/bin/toysql.rs
function main (line 23) | fn main() {
type Command (line 32) | struct Command {
method run (line 46) | fn run(self) -> Result<()> {
type Shell (line 56) | struct Shell {
method new (line 69) | fn new(host: &str, port: u16) -> Result<Self> {
method execute (line 84) | fn execute(&mut self, input: &str) -> Result<()> {
method execute_command (line 95) | fn execute_command(&mut self, input: &str) -> Result<()> {
method execute_sql (line 176) | fn execute_sql(&mut self, statement: &str) -> Result<()> {
method prompt (line 207) | fn prompt(&mut self) -> rustyline::Result<String> {
method run (line 217) | fn run(&mut self) -> Result<()> {
type InputValidator (line 258) | struct InputValidator;
method validate (line 261) | fn validate(&self, ctx: &mut ValidationContext) -> rustyline::Result<Val...
method validate_while_typing (line 276) | fn validate_while_typing(&self) -> bool {
FILE: src/bin/workload.rs
function main (line 28) | fn main() {
type Command (line 43) | struct Command {
type Subcommand (line 52) | enum Subcommand {
type Runner (line 60) | struct Runner {
method run (line 85) | fn run<W: Workload>(self, workload: W) -> Result<()> {
type Workload (line 184) | trait Workload: std::fmt::Display {
method prepare (line 189) | fn prepare(&self, client: &mut Client, rng: &mut StdRng) -> Result<()>;
method generate (line 192) | fn generate(&self, rng: StdRng) -> Result<impl Iterator<Item = Self::I...
method execute (line 196) | fn execute(client: &mut Client, item: &Self::Item) -> Result<()>;
method verify (line 199) | fn verify(&self, _client: &mut Client, _txns: usize) -> Result<()> {
type Item (line 230) | type Item = HashSet<u64>;
method prepare (line 232) | fn prepare(&self, client: &mut Client, rng: &mut StdRng) -> Result<()> {
method generate (line 253) | fn generate(&self, rng: StdRng) -> Result<impl Iterator<Item = Self::I...
method execute (line 261) | fn execute(client: &mut Client, item: &Self::Item) -> Result<()> {
method verify (line 272) | fn verify(&self, client: &mut Client, _: usize) -> Result<()> {
type Item (line 324) | type Item = Vec<(u64, String)>;
method prepare (line 326) | fn prepare(&self, client: &mut Client, _: &mut StdRng) -> Result<()> {
method generate (line 334) | fn generate(&self, rng: StdRng) -> Result<impl Iterator<Item = Self::I...
method execute (line 338) | fn execute(client: &mut Client, item: &Self::Item) -> Result<()> {
method verify (line 352) | fn verify(&self, client: &mut Client, txns: usize) -> Result<()> {
type Item (line 415) | type Item = (u64, u64, u64);
method prepare (line 417) | fn prepare(&self, client: &mut Client, rng: &mut StdRng) -> Result<()> {
method generate (line 461) | fn generate(&self, rng: StdRng) -> Result<impl Iterator<Item = Self::I...
method execute (line 473) | fn execute(client: &mut Client, item: &Self::Item) -> Result<()> {
method verify (line 518) | fn verify(&self, client: &mut Client, _: usize) -> Result<()> {
type Read (line 209) | struct Read {
method fmt (line 224) | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
type ReadGenerator (line 280) | struct ReadGenerator {
type Item (line 287) | type Item = <Read as Workload>::Item;
method next (line 289) | fn next(&mut self) -> Option<Self::Item> {
type Write (line 307) | struct Write {
method fmt (line 318) | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
type WriteGenerator (line 361) | struct WriteGenerator {
type Item (line 369) | type Item = <Write as Workload>::Item;
method next (line 371) | fn next(&mut self) -> Option<Self::Item> {
type Bank (line 390) | struct Bank {
method fmt (line 409) | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
FILE: src/client.rs
type Client (line 17) | pub struct Client {
method connect (line 28) | pub fn connect(addr: impl ToSocketAddrs) -> Result<Self> {
method request (line 36) | fn request(&mut self, request: Request) -> Result<Response> {
method execute (line 43) | pub fn execute(&mut self, statement: &str) -> Result<StatementResult> {
method get_table (line 59) | pub fn get_table(&mut self, table: &str) -> Result<Table> {
method list_tables (line 67) | pub fn list_tables(&mut self) -> Result<Vec<String>> {
method status (line 75) | pub fn status(&mut self) -> Result<Status> {
method txn (line 83) | pub fn txn(&self) -> Option<&mvcc::TransactionState> {
method with_retry (line 91) | pub fn with_retry<T>(&mut self, f: impl Fn(&mut Client) -> Result<T>) ...
FILE: src/encoding/bincode.rs
constant CONFIG (line 16) | const CONFIG: bincode::config::Configuration = bincode::config::standard();
function serialize (line 19) | pub fn serialize<T: Serialize>(value: &T) -> Vec<u8> {
function deserialize (line 25) | pub fn deserialize<'de, T: Deserialize<'de>>(bytes: &'de [u8]) -> Result...
function serialize_into (line 30) | pub fn serialize_into<W: Write, T: Serialize>(mut writer: W, value: &T) ...
function deserialize_from (line 36) | pub fn deserialize_from<R: Read, T: DeserializeOwned>(mut reader: R) -> ...
function maybe_deserialize_from (line 42) | pub fn maybe_deserialize_from<R: Read, T: DeserializeOwned>(mut reader: ...
FILE: src/encoding/format.rs
type Formatter (line 16) | pub trait Formatter {
method key (line 18) | fn key(key: &[u8]) -> String;
method value (line 21) | fn value(key: &[u8], value: &[u8]) -> String;
method key_value (line 24) | fn key_value(key: &[u8], value: &[u8]) -> String {
method key_maybe_value (line 29) | fn key_maybe_value(key: &[u8], value: Option<&[u8]>) -> String {
method key (line 48) | fn key(key: &[u8]) -> String {
method value (line 52) | fn value(_key: &[u8], value: &[u8]) -> String {
method key (line 69) | fn key(key: &[u8]) -> String {
method value (line 76) | fn value(key: &[u8], value: &[u8]) -> String {
method key (line 108) | fn key(key: &[u8]) -> String {
method value (line 128) | fn value(key: &[u8], value: &[u8]) -> String {
method key (line 174) | fn key(key: &[u8]) -> String {
method value (line 193) | fn value(key: &[u8], value: &[u8]) -> String {
method key (line 231) | fn key(_key: &[u8]) -> String {
method value (line 236) | fn value(_key: &[u8], value: &[u8]) -> String {
type Raw (line 37) | pub struct Raw;
method bytes (line 41) | pub fn bytes(bytes: &[u8]) -> String {
type Raft (line 58) | pub struct Raft<F: Formatter>(PhantomData<F>);
function entry (line 62) | pub fn entry(entry: &raft::Entry) -> String {
type MVCC (line 105) | pub struct MVCC<F: Formatter>(PhantomData<F>);
type SQL (line 157) | pub struct SQL;
method values (line 161) | fn values(values: impl IntoIterator<Item = sql::types::Value>) -> Stri...
method schema (line 166) | fn schema(table: sql::types::Table) -> String {
type SQLCommand (line 228) | pub struct SQLCommand;
FILE: src/encoding/keycode.rs
function serialize (line 58) | pub fn serialize<T: Serialize>(key: &T) -> Vec<u8> {
function deserialize (line 66) | pub fn deserialize<'a, T: Deserialize<'a>>(input: &'a [u8]) -> Result<T> {
function prefix_range (line 85) | pub fn prefix_range(prefix: &[u8]) -> (Bound<Vec<u8>>, Bound<Vec<u8>>) {
type Serializer (line 97) | struct Serializer {
type Ok (line 102) | type Ok = ();
type Error (line 103) | type Error = Error;
type SerializeSeq (line 105) | type SerializeSeq = Self;
type Ok (line 297) | type Ok = ();
type Error (line 298) | type Error = Error;
method serialize_element (line 300) | fn serialize_element<T: Serialize + ?Sized>(&mut self, value: &T) -> R...
method end (line 304) | fn end(self) -> Result<()> {
type SerializeTuple (line 106) | type SerializeTuple = Self;
type Ok (line 311) | type Ok = ();
type Error (line 312) | type Error = Error;
method serialize_element (line 314) | fn serialize_element<T: Serialize + ?Sized>(&mut self, value: &T) -> R...
method end (line 318) | fn end(self) -> Result<()> {
type SerializeTupleVariant (line 107) | type SerializeTupleVariant = Self;
type Ok (line 325) | type Ok = ();
type Error (line 326) | type Error = Error;
method serialize_field (line 328) | fn serialize_field<T: Serialize + ?Sized>(&mut self, value: &T) -> Res...
method end (line 332) | fn end(self) -> Result<()> {
type SerializeTupleStruct (line 108) | type SerializeTupleStruct = Impossible<(), Error>;
type SerializeMap (line 109) | type SerializeMap = Impossible<(), Error>;
type SerializeStruct (line 110) | type SerializeStruct = Impossible<(), Error>;
type SerializeStructVariant (line 111) | type SerializeStructVariant = Impossible<(), Error>;
function serialize_bool (line 114) | fn serialize_bool(self, v: bool) -> Result<()> {
function serialize_i8 (line 119) | fn serialize_i8(self, _: i8) -> Result<()> {
function serialize_i16 (line 123) | fn serialize_i16(self, _: i16) -> Result<()> {
function serialize_i32 (line 127) | fn serialize_i32(self, _: i32) -> Result<()> {
function serialize_i64 (line 138) | fn serialize_i64(self, v: i64) -> Result<()> {
function serialize_u8 (line 145) | fn serialize_u8(self, _: u8) -> Result<()> {
function serialize_u16 (line 149) | fn serialize_u16(self, _: u16) -> Result<()> {
function serialize_u32 (line 153) | fn serialize_u32(self, _: u32) -> Result<()> {
function serialize_u64 (line 158) | fn serialize_u64(self, v: u64) -> Result<()> {
function serialize_f32 (line 163) | fn serialize_f32(self, _: f32) -> Result<()> {
function serialize_f64 (line 171) | fn serialize_f64(self, v: f64) -> Result<()> {
function serialize_char (line 181) | fn serialize_char(self, _: char) -> Result<()> {
function serialize_str (line 186) | fn serialize_str(self, v: &str) -> Result<()> {
function serialize_bytes (line 195) | fn serialize_bytes(self, v: &[u8]) -> Result<()> {
function serialize_none (line 207) | fn serialize_none(self) -> Result<()> {
function serialize_some (line 211) | fn serialize_some<T: Serialize + ?Sized>(self, _: &T) -> Result<()> {
function serialize_unit (line 215) | fn serialize_unit(self) -> Result<()> {
function serialize_unit_struct (line 219) | fn serialize_unit_struct(self, _: &'static str) -> Result<()> {
function serialize_unit_variant (line 224) | fn serialize_unit_variant(self, _: &'static str, index: u32, _: &'static...
function serialize_newtype_struct (line 229) | fn serialize_newtype_struct<T: Serialize + ?Sized>(self, _: &'static str...
function serialize_newtype_variant (line 234) | fn serialize_newtype_variant<T: Serialize + ?Sized>(
function serialize_seq (line 246) | fn serialize_seq(self, _: Option<usize>) -> Result<Self::SerializeSeq> {
function serialize_tuple (line 251) | fn serialize_tuple(self, _: usize) -> Result<Self::SerializeTuple> {
function serialize_tuple_struct (line 255) | fn serialize_tuple_struct(
function serialize_tuple_variant (line 265) | fn serialize_tuple_variant(
function serialize_map (line 276) | fn serialize_map(self, _: Option<usize>) -> Result<Self::SerializeMap> {
function serialize_struct (line 280) | fn serialize_struct(self, _: &'static str, _: usize) -> Result<Self::Ser...
function serialize_struct_variant (line 284) | fn serialize_struct_variant(
type Deserializer (line 340) | pub struct Deserializer<'de> {
function from_bytes (line 346) | pub fn from_bytes(input: &'de [u8]) -> Self {
function take_bytes (line 352) | fn take_bytes(&mut self, len: usize) -> Result<&[u8]> {
function decode_next_bytes (line 362) | fn decode_next_bytes(&mut self) -> Result<Vec<u8>> {
type Error (line 383) | type Error = Error;
function deserialize_any (line 385) | fn deserialize_any<V: Visitor<'de>>(self, _: V) -> Result<V::Value> {
function deserialize_bool (line 389) | fn deserialize_bool<V: Visitor<'de>>(self, visitor: V) -> Result<V::Valu...
function deserialize_i8 (line 397) | fn deserialize_i8<V: Visitor<'de>>(self, _: V) -> Result<V::Value> {
function deserialize_i16 (line 401) | fn deserialize_i16<V: Visitor<'de>>(self, _: V) -> Result<V::Value> {
function deserialize_i32 (line 405) | fn deserialize_i32<V: Visitor<'de>>(self, _: V) -> Result<V::Value> {
function deserialize_i64 (line 409) | fn deserialize_i64<V: Visitor<'de>>(self, visitor: V) -> Result<V::Value> {
function deserialize_u8 (line 415) | fn deserialize_u8<V: Visitor<'de>>(self, _: V) -> Result<V::Value> {
function deserialize_u16 (line 419) | fn deserialize_u16<V: Visitor<'de>>(self, _: V) -> Result<V::Value> {
function deserialize_u32 (line 423) | fn deserialize_u32<V: Visitor<'de>>(self, _: V) -> Result<V::Value> {
function deserialize_u64 (line 427) | fn deserialize_u64<V: Visitor<'de>>(self, visitor: V) -> Result<V::Value> {
function deserialize_f32 (line 431) | fn deserialize_f32<V: Visitor<'de>>(self, _: V) -> Result<V::Value> {
function deserialize_f64 (line 435) | fn deserialize_f64<V: Visitor<'de>>(self, visitor: V) -> Result<V::Value> {
function deserialize_char (line 445) | fn deserialize_char<V: Visitor<'de>>(self, _: V) -> Result<V::Value> {
function deserialize_str (line 449) | fn deserialize_str<V: Visitor<'de>>(self, visitor: V) -> Result<V::Value> {
function deserialize_string (line 454) | fn deserialize_string<V: Visitor<'de>>(self, visitor: V) -> Result<V::Va...
function deserialize_bytes (line 459) | fn deserialize_bytes<V: Visitor<'de>>(self, visitor: V) -> Result<V::Val...
function deserialize_byte_buf (line 464) | fn deserialize_byte_buf<V: Visitor<'de>>(self, visitor: V) -> Result<V::...
function deserialize_option (line 469) | fn deserialize_option<V: Visitor<'de>>(self, _: V) -> Result<V::Value> {
function deserialize_unit (line 473) | fn deserialize_unit<V: Visitor<'de>>(self, _: V) -> Result<V::Value> {
function deserialize_unit_struct (line 477) | fn deserialize_unit_struct<V: Visitor<'de>>(self, _: &'static str, _: V)...
function deserialize_newtype_struct (line 481) | fn deserialize_newtype_struct<V: Visitor<'de>>(
function deserialize_seq (line 489) | fn deserialize_seq<V: Visitor<'de>>(self, visitor: V) -> Result<V::Value> {
function deserialize_tuple (line 493) | fn deserialize_tuple<V: Visitor<'de>>(self, _: usize, visitor: V) -> Res...
function deserialize_tuple_struct (line 497) | fn deserialize_tuple_struct<V: Visitor<'de>>(
function deserialize_map (line 506) | fn deserialize_map<V: Visitor<'de>>(self, _: V) -> Result<V::Value> {
function deserialize_struct (line 510) | fn deserialize_struct<V: Visitor<'de>>(
function deserialize_enum (line 519) | fn deserialize_enum<V: Visitor<'de>>(
function deserialize_identifier (line 528) | fn deserialize_identifier<V: Visitor<'de>>(self, _: V) -> Result<V::Valu...
function deserialize_ignored_any (line 532) | fn deserialize_ignored_any<V: Visitor<'de>>(self, _: V) -> Result<V::Val...
type Error (line 539) | type Error = Error;
function next_element_seed (line 541) | fn next_element_seed<T: DeserializeSeed<'de>>(&mut self, seed: T) -> Res...
type Error (line 551) | type Error = Error;
type Variant (line 552) | type Variant = Self;
function variant_seed (line 554) | fn variant_seed<V: DeserializeSeed<'de>>(self, seed: V) -> Result<(V::Va...
type Error (line 563) | type Error = Error;
function unit_variant (line 565) | fn unit_variant(self) -> Result<()> {
function newtype_variant_seed (line 569) | fn newtype_variant_seed<T: DeserializeSeed<'de>>(self, seed: T) -> Resul...
function tuple_variant (line 573) | fn tuple_variant<V: Visitor<'de>>(self, _: usize, visitor: V) -> Result<...
function struct_variant (line 577) | fn struct_variant<V: Visitor<'de>>(self, _: &'static [&'static str], _: ...
type Key (line 595) | enum Key<'a> {
FILE: src/encoding/mod.rs
type Key (line 22) | pub trait Key<'de>: Serialize + Deserialize<'de> {
method decode (line 24) | fn decode(bytes: &'de [u8]) -> Result<Self> {
method encode (line 34) | fn encode(&self) -> Vec<u8> {
type Value (line 42) | pub trait Value: Serialize + DeserializeOwned {
method decode (line 44) | fn decode(bytes: &[u8]) -> Result<Self> {
method decode_from (line 49) | fn decode_from<R: Read>(reader: R) -> Result<Self> {
method maybe_decode_from (line 55) | fn maybe_decode_from<R: Read>(reader: R) -> Result<Option<Self>> {
method encode (line 60) | fn encode(&self) -> Vec<u8> {
method encode_into (line 65) | fn encode_into<W: Write>(&self, writer: W) -> Result<()> {
FILE: src/error.rs
type Error (line 7) | pub enum Error {
method is_deterministic (line 46) | pub fn is_deterministic(&self) -> bool {
method custom (line 89) | fn custom<T: Display>(msg: T) -> Self {
method custom (line 95) | fn custom<T: Display>(msg: T) -> Self {
method from (line 101) | fn from(err: bincode::error::DecodeError) -> Self {
method from (line 107) | fn from(err: bincode::error::EncodeError) -> Self {
method from (line 113) | fn from(err: config::ConfigError) -> Self {
method from (line 119) | fn from(err: crossbeam::channel::RecvError) -> Self {
method from (line 125) | fn from(err: crossbeam::channel::SendError<T>) -> Self {
method from (line 131) | fn from(err: crossbeam::channel::TryRecvError) -> Self {
method from (line 137) | fn from(err: crossbeam::channel::TrySendError<T>) -> Self {
method from (line 143) | fn from(err: hdrhistogram::CreationError) -> Self {
method from (line 149) | fn from(err: hdrhistogram::RecordError) -> Self {
method from (line 155) | fn from(err: log::ParseLevelError) -> Self {
method from (line 161) | fn from(err: log::SetLoggerError) -> Self {
method from (line 167) | fn from(err: rand::distr::uniform::Error) -> Self {
method from (line 173) | fn from(err: regex::Error) -> Self {
method from (line 179) | fn from(err: rustyline::error::ReadlineError) -> Self {
method from (line 185) | fn from(err: std::array::TryFromSliceError) -> Self {
method from (line 191) | fn from(err: std::io::Error) -> Self {
method from (line 197) | fn from(err: std::num::ParseFloatError) -> Self {
method from (line 203) | fn from(err: std::num::ParseIntError) -> Self {
method from (line 209) | fn from(err: std::num::TryFromIntError) -> Self {
method from (line 215) | fn from(err: std::string::FromUtf8Error) -> Self {
method from (line 221) | fn from(err: std::sync::PoisonError<T>) -> Self {
method fmt (line 28) | fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
type Result (line 80) | pub type Result<T> = std::result::Result<T, Error>;
function from (line 83) | fn from(error: Error) -> Self {
FILE: src/raft/log.rs
type Index (line 11) | pub type Index = u64;
type Entry (line 15) | pub struct Entry {
type Key (line 32) | pub enum Key {
type Log (line 92) | pub struct Log {
method new (line 120) | pub fn new(mut engine: Box<dyn storage::Engine>) -> Result<Self> {
method enable_fsync (line 150) | pub fn enable_fsync(&mut self, fsync: bool) {
method get_commit_index (line 155) | pub fn get_commit_index(&self) -> (Index, Term) {
method get_last_index (line 160) | pub fn get_last_index(&self) -> (Index, Term) {
method get_term_vote (line 165) | pub fn get_term_vote(&self) -> (Term, Option<NodeID>) {
method set_term_vote (line 172) | pub fn set_term_vote(&mut self, term: Term, vote: Option<NodeID>) -> R...
method append (line 193) | pub fn append(&mut self, command: Option<Vec<u8>>) -> Result<Index> {
method commit (line 207) | pub fn commit(&mut self, index: Index) -> Result<Index> {
method get (line 225) | pub fn get(&mut self, index: Index) -> Result<Option<Entry>> {
method has (line 230) | pub fn has(&mut self, index: Index, term: Term) -> Result<bool> {
method scan (line 243) | pub fn scan(&mut self, range: impl RangeBounds<Index>) -> Iterator<'_> {
method scan_apply (line 259) | pub fn scan_apply(&mut self, applied_index: Index) -> Iterator<'_> {
method splice (line 278) | pub fn splice(&mut self, entries: Vec<Entry>) -> Result<Index> {
method status (line 346) | pub fn status(&mut self) -> Result<storage::Status> {
type Iterator (line 352) | pub struct Iterator<'a> {
function new (line 357) | fn new(inner: Box<dyn storage::ScanIterator + 'a>) -> Self {
type Item (line 363) | type Item = Result<Entry>;
function next (line 365) | fn next(&mut self) -> Option<Self::Item> {
function test_goldenscript (line 390) | fn test_goldenscript(path: &std::path::Path) {
type TestRunner (line 395) | struct TestRunner {
method new (line 403) | fn new() -> Self {
method parse_index_term (line 417) | fn parse_index_term(s: &str) -> Result<(Index, Term), Box<dyn Error>> {
method parse_index_range (line 426) | fn parse_index_range(s: &str) -> Result<impl RangeBounds<Index>, Box<d...
method run (line 447) | fn run(&mut self, command: &goldenscript::Command) -> Result<String, B...
method end_command (line 639) | fn end_command(&mut self, _: &goldenscript::Command) -> Result<String,...
FILE: src/raft/message.rs
type Envelope (line 12) | pub struct Envelope {
type Message (line 33) | pub enum Message {
type RequestID (line 159) | pub type RequestID = uuid::Uuid;
type ReadSequence (line 162) | pub type ReadSequence = u64;
type Request (line 166) | pub enum Request {
type Response (line 181) | pub enum Response {
type Status (line 194) | pub struct Status {
FILE: src/raft/mod.rs
constant TICK_INTERVAL (line 256) | pub const TICK_INTERVAL: Duration = Duration::from_millis(100);
constant HEARTBEAT_INTERVAL (line 259) | const HEARTBEAT_INTERVAL: Ticks = 4;
constant ELECTION_TIMEOUT_RANGE (line 263) | const ELECTION_TIMEOUT_RANGE: Range<Ticks> = 10..20;
constant MAX_APPEND_ENTRIES (line 266) | const MAX_APPEND_ENTRIES: usize = 100;
FILE: src/raft/node.rs
type NodeID (line 18) | pub type NodeID = u8;
type Term (line 21) | pub type Term = u64;
type Ticks (line 24) | pub type Ticks = u8;
type Options (line 28) | pub struct Options {
method default (line 38) | fn default() -> Self {
type Node (line 59) | pub enum Node {
method new (line 73) | pub fn new(
method id (line 90) | pub fn id(&self) -> NodeID {
method term (line 99) | pub fn term(&self) -> Term {
method step (line 108) | pub fn step(self, msg: Envelope) -> Result<Self> {
method tick (line 126) | pub fn tick(self) -> Result<Self> {
method from (line 136) | fn from(node: RawNode<Candidate>) -> Self {
method from (line 142) | fn from(node: RawNode<Follower>) -> Self {
method from (line 148) | fn from(node: RawNode<Leader>) -> Self {
method dismantle (line 1261) | fn dismantle(self) -> (Log, Box<dyn State>) {
method get_applied_index (line 1265) | fn get_applied_index(&self) -> Index {
method get_commit_index (line 1269) | fn get_commit_index(&self) -> (Index, Term) {
method get_last_index (line 1273) | fn get_last_index(&self) -> (Index, Term) {
method get_term_vote (line 1277) | fn get_term_vote(&self) -> (Term, Option<NodeID>) {
method options (line 1281) | fn options(&self) -> Options {
method peers (line 1285) | fn peers(&self) -> HashSet<NodeID> {
method read (line 1289) | fn read(&self, command: Vec<u8>) -> crate::error::Result<Vec<u8>> {
method scan_log (line 1293) | fn scan_log(&mut self) -> crate::error::Result<Vec<Entry>> {
type Role (line 154) | pub trait Role {}
type RawNode (line 160) | pub struct RawNode<R: Role> {
function into_role (line 181) | fn into_role<T: Role>(self, role: T) -> RawNode<T> {
function term (line 194) | fn term(&self) -> Term {
function cluster_size (line 199) | fn cluster_size(&self) -> usize {
function quorum_size (line 204) | fn quorum_size(&self) -> usize {
function quorum_value (line 210) | fn quorum_value<T: Ord + Copy>(&self, mut values: Vec<T>) -> T {
function random_election_timeout (line 216) | fn random_election_timeout(&self) -> Ticks {
function send (line 221) | fn send(&self, to: NodeID, message: Message) -> Result<()> {
function send_via (line 227) | fn send_via(tx: &Sender<Envelope>, msg: Envelope) -> Result<()> {
function broadcast (line 233) | fn broadcast(&self, message: Message) -> Result<()> {
type Follower (line 245) | pub struct Follower {
method new (line 259) | fn new(leader: Option<NodeID>, election_timeout: Ticks) -> Self {
function new (line 268) | fn new(
function into_candidate (line 294) | fn into_candidate(mut self) -> Result<RawNode<Candidate>> {
function into_follower (line 316) | fn into_follower(mut self, term: Term, leader: Option<NodeID>) -> Result...
function step (line 341) | fn step(mut self, msg: Envelope) -> Result<Node> {
function tick (line 490) | fn tick(mut self) -> Result<Node> {
function abort_forwarded (line 500) | fn abort_forwarded(&mut self) -> Result<()> {
function maybe_apply (line 510) | fn maybe_apply(&mut self) -> Result<()> {
type Candidate (line 524) | pub struct Candidate {
method new (line 535) | fn new(election_timeout: Ticks) -> Self {
function into_follower (line 546) | fn into_follower(mut self, term: Term, leader: Option<NodeID>) -> Result...
function into_leader (line 564) | fn into_leader(self) -> Result<RawNode<Leader>> {
function step (line 586) | fn step(mut self, msg: Envelope) -> Result<Node> {
function tick (line 638) | fn tick(mut self) -> Result<Node> {
function campaign (line 649) | fn campaign(&mut self) -> Result<()> {
type Leader (line 663) | pub struct Leader {
method new (line 754) | fn new(peers: HashSet<NodeID>, last_index: Index) -> Self {
type Progress (line 683) | struct Progress {
method advance (line 703) | fn advance(&mut self, match_index: Index) -> bool {
method advance_read (line 713) | fn advance_read(&mut self, read_seq: ReadSequence) -> bool {
method regress_next (line 723) | fn regress_next(&mut self, next_index: Index) -> bool {
type Write (line 733) | struct Write {
type Read (line 741) | struct Read {
function into_follower (line 776) | fn into_follower(mut self, term: Term) -> Result<RawNode<Follower>> {
function step (line 797) | fn step(mut self, msg: Envelope) -> Result<Node> {
function tick (line 946) | fn tick(mut self) -> Result<Node> {
function heartbeat (line 956) | fn heartbeat(&mut self) -> Result<()> {
function propose (line 969) | fn propose(&mut self, command: Option<Vec<u8>>) -> Result<Index> {
function maybe_commit_and_apply (line 984) | fn maybe_commit_and_apply(&mut self) -> Result<Index> {
function maybe_read (line 1036) | fn maybe_read(&mut self) -> Result<()> {
function maybe_send_append (line 1081) | fn maybe_send_append(&mut self, peer: NodeID, mut probe: bool) -> Result...
function status (line 1131) | fn status(&mut self) -> Result<Status> {
function progress (line 1149) | fn progress(&mut self, id: NodeID) -> &mut Progress {
function test_goldenscript (line 1179) | fn test_goldenscript(path: &Path) {
function quorum_size (line 1192) | fn quorum_size(size: usize) -> usize {
function quorum_value (line 1204) | fn quorum_value(values: Vec<i8>) -> i8 {
function new_noop (line 1214) | fn new_noop(id: NodeID, peers: HashSet<NodeID>) -> Self {
type TestRunner (line 1299) | struct TestRunner {
method run (line 1323) | fn run(&mut self, command: &goldenscript::Command) -> Result<String, B...
method new (line 1497) | fn new() -> Self {
method add_node (line 1512) | fn add_node(
method add_node_with (line 1530) | fn add_node_with(
method campaign (line 1550) | fn campaign(&mut self, ids: &[NodeID], output: &mut String) -> Result<...
method cluster (line 1569) | fn cluster(
method deliver (line 1613) | fn deliver(
method heal (line 1643) | fn heal(&mut self, ids: &[NodeID], output: &mut String) -> Result<(), ...
method heartbeat (line 1655) | fn heartbeat(&mut self, ids: &[NodeID], output: &mut String) -> Result...
method log (line 1667) | fn log(&mut self, ids: &[NodeID], output: &mut String) -> Result<(), B...
method partition (line 1688) | fn partition(&mut self, ids: &[NodeID], output: &mut String) -> Result...
method receive (line 1702) | fn receive(&mut self, id: NodeID, output: &mut String) -> Result<u32, ...
method request (line 1744) | fn request(
method restart (line 1767) | fn restart(
method stabilize (line 1825) | fn stabilize(
method state (line 1850) | fn state(&mut self, ids: &[NodeID], output: &mut String) -> Result<(),...
method status (line 1868) | fn status(&self, ids: &[NodeID], output: &mut String) -> Result<(), Bo...
method transition (line 1896) | fn transition(
method parse_ids (line 1945) | fn parse_ids<A>(&self, args: &[A]) -> Result<Vec<NodeID>, Box<dyn Error>>
method parse_ids_or_all (line 1965) | fn parse_ids_or_all<A>(&self, args: &[A]) -> Result<Vec<NodeID>, Box<d...
method parse_ids_or_error (line 1977) | fn parse_ids_or_error<A>(&self, args: &[A]) -> Result<Vec<NodeID>, Box...
method format_disconnected (line 1989) | fn format_disconnected(disconnected: &HashMap<NodeID, HashSet<NodeID>>...
method format_entry (line 2065) | fn format_entry(entry: &Entry) -> String {
method format_message (line 2074) | fn format_message(msg: &Message) -> String {
method format_node (line 2135) | fn format_node(node: &Node) -> String {
method format_node_role (line 2140) | fn format_node_role(node: &Node) -> String {
method format_request (line 2153) | fn format_request(request: &Request) -> String {
method format_response (line 2161) | fn format_response(response: &crate::error::Result<Response>) -> String {
method format_strikethrough (line 2172) | fn format_strikethrough(s: &str) -> String {
FILE: src/raft/state.rs
type State (line 19) | pub trait State: Send {
method get_applied_index (line 26) | fn get_applied_index(&self) -> Index;
method apply (line 43) | fn apply(&mut self, entry: Entry) -> Result<Vec<u8>>;
method read (line 50) | fn read(&self, command: Vec<u8>) -> Result<Vec<u8>>;
method get_applied_index (line 79) | fn get_applied_index(&self) -> Index {
method apply (line 83) | fn apply(&mut self, entry: Entry) -> Result<Vec<u8>> {
method read (line 89) | fn read(&self, command: Vec<u8>) -> Result<Vec<u8>> {
method get_applied_index (line 107) | fn get_applied_index(&self) -> Index {
method apply (line 111) | fn apply(&mut self, entry: Entry) -> Result<Vec<u8>> {
method read (line 127) | fn read(&self, command: Vec<u8>) -> Result<Vec<u8>> {
method get_applied_index (line 199) | fn get_applied_index(&self) -> Index {
method apply (line 203) | fn apply(&mut self, entry: Entry) -> Result<Vec<u8>> {
method read (line 208) | fn read(&self, _: Vec<u8>) -> Result<Vec<u8>> {
type Emit (line 67) | pub struct Emit {
method new (line 73) | pub fn new(inner: Box<dyn State>, tx: Sender<Entry>) -> Box<Self> {
type KV (line 95) | pub struct KV {
method new (line 101) | pub fn new() -> Box<Self> {
type KVCommand (line 140) | pub enum KVCommand {
method fmt (line 152) | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
type KVResponse (line 163) | pub enum KVResponse {
method fmt (line 175) | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
type Noop (line 188) | pub struct Noop {
method new (line 193) | pub fn new() -> Box<Self> {
FILE: src/server.rs
constant RAFT_PEER_CHANNEL_CAPACITY (line 22) | const RAFT_PEER_CHANNEL_CAPACITY: usize = 1000;
constant RAFT_PEER_RETRY_INTERVAL (line 25) | const RAFT_PEER_RETRY_INTERVAL: Duration = Duration::from_secs(1);
type Server (line 37) | pub struct Server {
method new (line 48) | pub fn new(
method serve (line 67) | pub fn serve(self, raft_addr: impl ToSocketAddrs, sql_addr: impl ToSoc...
method raft_accept (line 114) | fn raft_accept(listener: TcpListener, raft_step_tx: Sender<raft::Envel...
method raft_receive_peer (line 138) | fn raft_receive_peer(socket: TcpStream, raft_step_tx: Sender<raft::Env...
method raft_send_peer (line 148) | fn raft_send_peer(addr: String, raft_node_rx: Receiver<raft::Envelope>) {
method raft_route (line 186) | fn raft_route(
method sql_accept (line 252) | fn sql_accept(id: raft::NodeID, listener: TcpListener, sql_engine: sql...
method sql_session (line 276) | fn sql_session(
type Request (line 314) | pub enum Request {
type Response (line 329) | pub enum Response {
type Status (line 341) | pub struct Status {
FILE: src/sql/engine/engine.rs
type Engine (line 13) | pub trait Engine<'a>: Sized {
method begin (line 19) | fn begin(&'a self) -> Result<Self::Transaction>;
method begin_read_only (line 21) | fn begin_read_only(&'a self) -> Result<Self::Transaction>;
method begin_as_of (line 23) | fn begin_as_of(&'a self, version: mvcc::Version) -> Result<Self::Trans...
method session (line 26) | fn session(&'a self) -> Session<'a, Self> {
type Transaction (line 37) | pub trait Transaction: Catalog {
method state (line 39) | fn state(&self) -> &mvcc::TransactionState;
method commit (line 42) | fn commit(self) -> Result<()>;
method rollback (line 44) | fn rollback(self) -> Result<()>;
method delete (line 47) | fn delete(&self, table: &str, ids: &[Value]) -> Result<()>;
method get (line 49) | fn get(&self, table: &str, ids: &[Value]) -> Result<Vec<Row>>;
method insert (line 51) | fn insert(&self, table: &str, rows: Vec<Row>) -> Result<()>;
method lookup_index (line 53) | fn lookup_index(&self, table: &str, column: &str, values: &[Value]) ->...
method scan (line 55) | fn scan(&self, table: &str, filter: Option<Expression>) -> Result<Rows>;
method update (line 57) | fn update(&self, table: &str, rows: BTreeMap<Value, Row>) -> Result<()>;
type Catalog (line 64) | pub trait Catalog {
method create_table (line 66) | fn create_table(&self, table: Table) -> Result<()>;
method drop_table (line 69) | fn drop_table(&self, table: &str, if_exists: bool) -> Result<bool>;
method get_table (line 71) | fn get_table(&self, table: &str) -> Result<Option<Table>>;
method list_tables (line 73) | fn list_tables(&self) -> Result<Vec<Table>>;
method must_get_table (line 76) | fn must_get_table(&self, table: &str) -> Result<Table> {
FILE: src/sql/engine/local.rs
type Key (line 22) | pub enum Key<'a> {
type KeyPrefix (line 39) | enum KeyPrefix<'a> {
type Local (line 53) | pub struct Local<E: storage::Engine + 'static> {
function new (line 60) | pub fn new(engine: E) -> Self {
function resume (line 68) | pub fn resume(&self, state: mvcc::TransactionState) -> Result<Transactio...
function get_unversioned (line 73) | pub fn get_unversioned(&self, key: &[u8]) -> Result<Option<Vec<u8>>> {
function set_unversioned (line 78) | pub fn set_unversioned(&self, key: &[u8], value: Vec<u8>) -> Result<()> {
type Transaction (line 84) | type Transaction = Transaction<E>;
function begin (line 86) | fn begin(&self) -> Result<Self::Transaction> {
function begin_read_only (line 90) | fn begin_read_only(&self) -> Result<Self::Transaction> {
function begin_as_of (line 94) | fn begin_as_of(&self, version: mvcc::Version) -> Result<Self::Transactio...
type Transaction (line 100) | pub struct Transaction<E: storage::Engine + 'static> {
function new (line 106) | fn new(txn: mvcc::Transaction<E>) -> Self {
function state (line 111) | pub fn state(&self) -> &mvcc::TransactionState {
function get_index (line 117) | fn get_index(&self, table: &str, column: &str, value: &Value) -> Result<...
function get_row (line 128) | fn get_row(&self, table: &str, id: &Value) -> Result<Option<Row>> {
function has_index (line 136) | fn has_index(&self, table: &str, column: &str) -> Result<bool> {
function set_index (line 143) | fn set_index(
function table_references (line 162) | fn table_references(&self, table: &str) -> Result<Vec<(Table, Vec<usize>...
function state (line 182) | fn state(&self) -> &mvcc::TransactionState {
function commit (line 186) | fn commit(self) -> Result<()> {
function rollback (line 190) | fn rollback(self) -> Result<()> {
function delete (line 194) | fn delete(&self, table: &str, ids: &[Value]) -> Result<()> {
function get (line 248) | fn get(&self, table: &str, ids: &[Value]) -> Result<Vec<Row>> {
function insert (line 252) | fn insert(&self, table: &str, rows: Vec<Row>) -> Result<()> {
function lookup_index (line 270) | fn lookup_index(&self, table: &str, column: &str, values: &[Value]) -> R...
function scan (line 275) | fn scan(&self, table: &str, filter: Option<Expression>) -> Result<Rows> {
function update (line 296) | fn update(&self, table: &str, rows: BTreeMap<Value, Row>) -> Result<()> {
method create_table (line 341) | fn create_table(&self, table: Table) -> Result<()> {
method drop_table (line 349) | fn drop_table(&self, table: &str, if_exists: bool) -> Result<bool> {
method get_table (line 390) | fn get_table(&self, table: &str) -> Result<Option<Table>> {
method list_tables (line 394) | fn list_tables(&self) -> Result<Vec<Table>> {
FILE: src/sql/engine/raft.rs
type Read (line 20) | pub enum Read<'a> {
type Write (line 58) | pub enum Write<'a> {
type Status (line 75) | pub struct Status {
type Raft (line 92) | pub struct Raft {
constant APPLIED_INDEX_KEY (line 100) | pub const APPLIED_INDEX_KEY: &'static [u8] = b"applied_index";
method new (line 104) | pub fn new(tx: Sender<(raft::Request, Sender<Result<raft::Response>>)>...
method new_state (line 110) | pub fn new_state<E: storage::Engine>(engine: E) -> Result<State<E>> {
method request (line 115) | fn request(&self, request: raft::Request) -> Result<raft::Response> {
method write (line 122) | fn write<V: DeserializeOwned>(&self, write: Write) -> Result<V> {
method read (line 130) | fn read<V: DeserializeOwned>(&self, read: Read) -> Result<V> {
method status (line 138) | pub fn status(&self) -> Result<Status> {
type Transaction (line 149) | type Transaction = Transaction<'a>;
method begin (line 151) | fn begin(&'a self) -> Result<Self::Transaction> {
method begin_read_only (line 155) | fn begin_read_only(&'a self) -> Result<Self::Transaction> {
method begin_as_of (line 159) | fn begin_as_of(&'a self, version: mvcc::Version) -> Result<Self::Trans...
type Transaction (line 171) | pub struct Transaction<'a> {
function begin (line 180) | fn begin(raft: &'a Raft, read_only: bool, as_of: Option<mvcc::Version>) ...
function state (line 195) | fn state(&self) -> &mvcc::TransactionState {
function commit (line 199) | fn commit(self) -> Result<()> {
function rollback (line 206) | fn rollback(self) -> Result<()> {
function delete (line 213) | fn delete(&self, table: &str, ids: &[Value]) -> Result<()> {
function get (line 221) | fn get(&self, table: &str, ids: &[Value]) -> Result<Vec<Row>> {
function insert (line 229) | fn insert(&self, table: &str, rows: Vec<Row>) -> Result<()> {
function lookup_index (line 233) | fn lookup_index(&self, table: &str, column: &str, values: &[Value]) -> R...
function scan (line 242) | fn scan(&self, table: &str, filter: Option<Expression>) -> Result<Rows> {
function update (line 251) | fn update(&self, table: &str, rows: BTreeMap<Value, Row>) -> Result<()> {
method create_table (line 257) | fn create_table(&self, schema: Table) -> Result<()> {
method drop_table (line 261) | fn drop_table(&self, table: &str, if_exists: bool) -> Result<bool> {
method get_table (line 269) | fn get_table(&self, table: &str) -> Result<Option<Table>> {
method list_tables (line 273) | fn list_tables(&self) -> Result<Vec<Table>> {
type State (line 286) | pub struct State<E: storage::Engine + 'static> {
function new (line 296) | pub fn new(engine: E) -> Result<Self> {
function write (line 311) | fn write(&self, command: Write) -> Result<Vec<u8>> {
function get_applied_index (line 342) | fn get_applied_index(&self) -> raft::Index {
function apply (line 346) | fn apply(&mut self, entry: raft::Entry) -> Result<Vec<u8>> {
function read (line 369) | fn read(&self, command: Vec<u8>) -> Result<Vec<u8>> {
FILE: src/sql/execution/aggregator.rs
type Aggregator (line 13) | pub struct Aggregator {
method new (line 24) | pub fn new(group_by: Vec<Expression>, aggregates: Vec<Aggregate>) -> S...
method add (line 29) | pub fn add(&mut self, row: &Row) -> Result<()> {
method add_rows (line 51) | pub fn add_rows(&mut self, rows: Rows) -> Result<()> {
method into_rows (line 59) | pub fn into_rows(self) -> Rows {
type Accumulator (line 85) | enum Accumulator {
method new (line 95) | fn new(aggregate: &Aggregate) -> Self {
method add (line 106) | fn add(&mut self, value: Value) -> Result<()> {
method value (line 127) | fn value(self) -> Result<Value> {
FILE: src/sql/execution/executor.rs
type Executor (line 46) | pub struct Executor<'a, T: Transaction> {
function new (line 53) | pub fn new(txn: &'a T) -> Self {
function execute (line 58) | pub fn execute(&mut self, plan: Plan) -> Result<ExecutionResult> {
function execute_node (line 104) | fn execute_node(&mut self, node: Node) -> Result<Rows> {
function delete (line 216) | fn delete(&self, table: &str, primary_key: usize, source: Rows) -> Resul...
function insert (line 230) | fn insert(
function update (line 277) | fn update(
function order (line 299) | fn order(source: Rows, order: Vec<(Expression, Direction)>) -> Result<Ro...
type ExecutionResult (line 332) | pub enum ExecutionResult {
FILE: src/sql/execution/join.rs
type NestedLoopJoiner (line 21) | pub struct NestedLoopJoiner {
method new (line 41) | pub fn new(
method try_next (line 54) | fn try_next(&mut self) -> Result<Option<Row>> {
type Item (line 96) | type Item = Result<Row>;
method next (line 98) | fn next(&mut self) -> Option<Self::Item> {
type HashJoiner (line 112) | pub struct HashJoiner {
method new (line 129) | pub fn new(
method try_next (line 153) | fn try_next(&mut self) -> Result<Option<Row>> {
type Item (line 186) | type Item = Result<Row>;
method next (line 188) | fn next(&mut self) -> Option<Self::Item> {
FILE: src/sql/execution/session.rs
type Session (line 16) | pub struct Session<'a, E: Engine<'a>> {
function new (line 25) | pub fn new(engine: &'a E) -> Self {
function execute (line 30) | pub fn execute(&mut self, statement: &str) -> Result<StatementResult> {
function with_txn (line 90) | pub fn with_txn<F, T>(&mut self, read_only: bool, f: F) -> Result<T>
function status (line 117) | pub fn status(&self) -> Result<Status> {
method drop (line 124) | fn drop(&mut self) {
type StatementResult (line 134) | pub enum StatementResult {
type Error (line 152) | type Error = Error;
method try_from (line 154) | fn try_from(result: ExecutionResult) -> Result<Self> {
type Error (line 171) | type Error = Error;
method try_from (line 173) | fn try_from(result: StatementResult) -> Result<Self> {
type Error (line 183) | type Error = Error;
method try_from (line 185) | fn try_from(result: StatementResult) -> Result<Self> {
type Error (line 193) | type Error = Error;
method try_from (line 195) | fn try_from(result: StatementResult) -> Result<Self> {
type Error (line 203) | type Error = Error;
function try_from (line 205) | fn try_from(result: StatementResult) -> Result<Self> {
type Error (line 212) | type Error = Error;
function try_from (line 214) | fn try_from(result: StatementResult) -> Result<Self> {
type Error (line 221) | type Error = Error;
function try_from (line 223) | fn try_from(result: StatementResult) -> Result<Self> {
type Error (line 230) | type Error = Error;
method try_from (line 232) | fn try_from(result: StatementResult) -> Result<Self> {
FILE: src/sql/mod.rs
function test_goldenscript (line 108) | fn test_goldenscript(path: &Path) {
function test_goldenscript_expr (line 125) | fn test_goldenscript_expr(path: &Path) {
type SQLRunner (line 130) | struct SQLRunner<'a> {
type TestEngine (line 136) | type TestEngine =
function new (line 140) | fn new(engine: &'a TestEngine, op_rx: Receiver<testengine::Operation>) -...
function run (line 146) | fn run(&mut self, command: &goldenscript::Command) -> Result<String, Box...
function end_command (line 273) | fn end_command(&mut self, _: &goldenscript::Command) -> Result<String, B...
type ExpressionRunner (line 281) | struct ExpressionRunner;
method run (line 286) | fn run(&mut self, command: &goldenscript::Command) -> Result<String, B...
type Catalog (line 283) | type Catalog<'a> = <Local<storage::Memory> as Engine<'a>>::Transaction;
FILE: src/sql/parser/ast.rs
type Statement (line 12) | pub enum Statement {
type From (line 85) | pub enum From {
type Column (line 108) | pub struct Column {
type JoinType (line 121) | pub enum JoinType {
method is_outer (line 131) | pub fn is_outer(&self) -> bool {
type Direction (line 141) | pub enum Direction {
type Expression (line 149) | pub enum Expression {
method walk (line 239) | pub fn walk(&self, visitor: &mut impl FnMut(&Expression) -> bool) -> b...
method contains (line 277) | pub fn contains(&self, visitor: &impl Fn(&Expression) -> bool) -> bool {
method collect (line 283) | pub fn collect(&self, visitor: &impl Fn(&Expression) -> bool, exprs: &...
method from (line 324) | fn from(literal: Literal) -> Self {
method from (line 330) | fn from(op: Operator) -> Self {
type Literal (line 164) | pub enum Literal {
method eq (line 177) | fn eq(&self, other: &Self) -> bool {
method hash (line 192) | fn hash<H: Hasher>(&self, state: &mut H) {
type Operator (line 210) | pub enum Operator {
function from (line 336) | fn from(value: Operator) -> Self {
FILE: src/sql/parser/lexer.rs
type Token (line 17) | pub enum Token {
method from (line 81) | fn from(keyword: Keyword) -> Self {
method fmt (line 50) | fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
type Keyword (line 88) | pub enum Keyword {
type Error (line 159) | type Error = &'static str;
method try_from (line 161) | fn try_from(value: &str) -> std::result::Result<Self, Self::Error> {
method fmt (line 238) | fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
type Lexer (line 315) | pub struct Lexer<'a> {
type Item (line 321) | type Item = Result<Token>;
method next (line 323) | fn next(&mut self) -> Option<Result<Token>> {
function new (line 335) | pub fn new(input: &'a str) -> Lexer<'a> {
function next_if (line 340) | fn next_if(&mut self, predicate: impl Fn(char) -> bool) -> Option<char> {
function next_if_map (line 347) | fn next_if_map<T>(&mut self, map: impl Fn(char) -> Option<T>) -> Option<...
function next_is (line 354) | fn next_is(&mut self, c: char) -> bool {
function scan (line 359) | fn scan(&mut self) -> Result<Option<Token>> {
function scan_ident_or_keyword (line 377) | fn scan_ident_or_keyword(&mut self) -> Option<Token> {
function scan_ident_quoted (line 391) | fn scan_ident_quoted(&mut self) -> Result<Option<Token>> {
function scan_number (line 409) | fn scan_number(&mut self) -> Option<Token> {
function scan_string (line 436) | fn scan_string(&mut self) -> Result<Option<Token>> {
function scan_symbol (line 454) | fn scan_symbol(&mut self) -> Option<Token> {
function skip_whitespace (line 488) | fn skip_whitespace(&mut self) {
function is_ident (line 494) | pub fn is_ident(ident: &str) -> bool {
FILE: src/sql/parser/parser.rs
type Parser (line 17) | pub struct Parser<'a> {
function parse (line 24) | pub fn parse(statement: &str) -> Result<ast::Statement> {
function parse_expr (line 37) | pub fn parse_expr(expr: &str) -> Result<ast::Expression> {
function new (line 47) | fn new(input: &str) -> Parser<'_> {
function next (line 52) | fn next(&mut self) -> Result<Token> {
function next_ident (line 57) | fn next_ident(&mut self) -> Result<String> {
function next_if (line 65) | fn next_if(&mut self, predicate: impl Fn(&Token) -> bool) -> Option<Toke...
function next_if_map (line 72) | fn next_if_map<T>(&mut self, f: impl Fn(&Token) -> Option<T>) -> Option<...
function next_if_keyword (line 77) | fn next_if_keyword(&mut self) -> Option<Keyword> {
function next_is (line 85) | fn next_is(&mut self, token: Token) -> bool {
function expect (line 90) | fn expect(&mut self, expect: Token) -> Result<()> {
function skip (line 100) | fn skip(&mut self, token: Token) {
function peek (line 105) | fn peek(&mut self) -> Result<Option<&Token>> {
function parse_statement (line 110) | fn parse_statement(&mut self) -> Result<ast::Statement> {
function parse_begin (line 133) | fn parse_begin(&mut self) -> Result<ast::Statement> {
function parse_commit (line 160) | fn parse_commit(&mut self) -> Result<ast::Statement> {
function parse_rollback (line 166) | fn parse_rollback(&mut self) -> Result<ast::Statement> {
function parse_explain (line 172) | fn parse_explain(&mut self) -> Result<ast::Statement> {
function parse_create_table (line 181) | fn parse_create_table(&mut self) -> Result<ast::Statement> {
function parse_create_table_column (line 198) | fn parse_create_table_column(&mut self) -> Result<ast::Column> {
function parse_drop_table (line 247) | fn parse_drop_table(&mut self) -> Result<ast::Statement> {
function parse_delete (line 260) | fn parse_delete(&mut self) -> Result<ast::Statement> {
function parse_insert (line 268) | fn parse_insert(&mut self) -> Result<ast::Statement> {
function parse_update (line 308) | fn parse_update(&mut self) -> Result<ast::Statement> {
function parse_select (line 331) | fn parse_select(&mut self) -> Result<ast::Statement> {
function parse_select_clause (line 345) | fn parse_select_clause(&mut self) -> Result<Vec<(ast::Expression, Option...
function parse_from_clause (line 368) | fn parse_from_clause(&mut self) -> Result<Vec<ast::From>> {
function parse_from_table (line 394) | fn parse_from_table(&mut self) -> Result<ast::From> {
function parse_from_join (line 404) | fn parse_from_join(&mut self) -> Result<Option<ast::JoinType>> {
function parse_where_clause (line 430) | fn parse_where_clause(&mut self) -> Result<Option<ast::Expression>> {
function parse_group_by_clause (line 438) | fn parse_group_by_clause(&mut self) -> Result<Vec<ast::Expression>> {
function parse_having_clause (line 454) | fn parse_having_clause(&mut self) -> Result<Option<ast::Expression>> {
function parse_order_by_clause (line 462) | fn parse_order_by_clause(&mut self) -> Result<Vec<(ast::Expression, ast:...
function parse_limit_clause (line 486) | fn parse_limit_clause(&mut self) -> Result<Option<ast::Expression>> {
function parse_offset_clause (line 494) | fn parse_offset_clause(&mut self) -> Result<Option<ast::Expression>> {
function parse_expression (line 603) | fn parse_expression(&mut self) -> Result<ast::Expression> {
function parse_expression_at (line 608) | fn parse_expression_at(&mut self, min_precedence: Precedence) -> Result<...
function parse_expression_atom (line 652) | fn parse_expression_atom(&mut self) -> Result<ast::Expression> {
function parse_prefix_operator_at (line 700) | fn parse_prefix_operator_at(&mut self, min_precedence: Precedence) -> Op...
function parse_infix_operator_at (line 714) | fn parse_infix_operator_at(&mut self, min_precedence: Precedence) -> Opt...
function parse_postfix_operator_at (line 741) | fn parse_postfix_operator_at(
type Precedence (line 777) | type Precedence = u8;
type Output (line 786) | type Output = Self;
method add (line 788) | fn add(self, rhs: Associativity) -> Self {
type Associativity (line 780) | enum Associativity {
type PrefixOperator (line 799) | enum PrefixOperator {
method precedence (line 807) | fn precedence(&self) -> Precedence {
method associativity (line 816) | fn associativity(&self) -> Associativity {
method into_expression (line 821) | fn into_expression(self, rhs: ast::Expression) -> ast::Expression {
type InfixOperator (line 832) | enum InfixOperator {
method precedence (line 855) | fn precedence(&self) -> Precedence {
method associativity (line 872) | fn associativity(&self) -> Associativity {
method into_expression (line 880) | fn into_expression(self, lhs: ast::Expression, rhs: ast::Expression) -...
type PostfixOperator (line 903) | enum PostfixOperator {
method precedence (line 911) | fn precedence(&self) -> Precedence {
method into_expression (line 919) | fn into_expression(self, lhs: ast::Expression) -> ast::Expression {
FILE: src/sql/planner/optimizer.rs
type Optimizer (line 22) | pub trait Optimizer: Debug + Send + Sync {
method optimize (line 24) | fn optimize(&self, node: Node) -> Result<Node>;
method optimize (line 33) | fn optimize(&self, node: Node) -> Result<Node> {
method optimize (line 98) | fn optimize(&self, node: Node) -> Result<Node> {
method optimize (line 255) | fn optimize(&self, node: Node) -> Result<Node> {
method optimize (line 310) | fn optimize(&self, node: Node) -> Result<Node> {
method optimize (line 357) | fn optimize(&self, node: Node) -> Result<Node> {
type ConstantFolding (line 30) | pub struct ConstantFolding;
method fold (line 43) | pub fn fold(mut expr: Expression) -> Result<Expression> {
type FilterPushdown (line 95) | pub struct FilterPushdown;
method push_filters (line 106) | fn push_filters(mut node: Node) -> Node {
method push_into (line 114) | fn push_into(expr: Expression, target: &mut Node) -> Option<Expression> {
method maybe_push_filter (line 140) | fn maybe_push_filter(node: Node) -> Node {
method maybe_push_join (line 157) | fn maybe_push_join(node: Node) -> Node {
type IndexLookup (line 252) | pub struct IndexLookup;
method index_lookup (line 265) | fn index_lookup(mut node: Node) -> Node {
type HashJoin (line 307) | pub struct HashJoin;
method hash_join (line 317) | pub fn hash_join(node: Node) -> Node {
type ShortCircuit (line 354) | pub struct ShortCircuit;
method short_circuit (line 366) | fn short_circuit(mut node: Node) -> Node {
FILE: src/sql/planner/plan.rs
type Plan (line 44) | pub enum Plan {
method build (line 77) | pub fn build(statement: ast::Statement, catalog: &impl Catalog) -> Res...
method execute (line 82) | pub fn execute(self, txn: &impl Transaction) -> Result<ExecutionResult> {
method optimize (line 88) | pub fn optimize(self) -> Result<Self> {
type Node (line 108) | pub enum Node {
method columns (line 179) | pub fn columns(&self) -> usize {
method column_label (line 213) | pub fn column_label(&self, index: usize) -> Label {
method transform (line 271) | pub fn transform(
method transform_expressions (line 320) | pub fn transform_expressions(
method format (line 469) | pub fn format(
type Aggregate (line 376) | pub enum Aggregate {
method format (line 385) | fn format(&self, node: &Node) -> String {
method expr (line 396) | pub fn expr(&self) -> &Expression {
type Direction (line 409) | pub enum Direction {
method from (line 424) | fn from(dir: ast::Direction) -> Self {
method fmt (line 415) | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
method fmt (line 434) | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
method fmt (line 460) | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
function invert_remap (line 649) | pub fn invert_remap(targets: &[Option<usize>]) -> Vec<Option<usize>> {
FILE: src/sql/planner/planner.rs
type Planner (line 18) | pub struct Planner<'a, C: Catalog> {
function new (line 24) | pub fn new(catalog: &'a C) -> Self {
function build (line 29) | pub fn build(&mut self, statement: ast::Statement) -> Result<Plan> {
function build_create_table (line 50) | fn build_create_table(&self, name: String, columns: Vec<ast::Column>) ->...
function build_drop_table (line 81) | fn build_drop_table(&self, name: String, if_exists: bool) -> Result<Plan> {
function build_delete (line 86) | fn build_delete(&self, table: String, r#where: Option<ast::Expression>) ...
function build_insert (line 98) | fn build_insert(
function build_update (line 128) | fn build_update(
function build_select (line 159) | fn build_select(
function build_from_clause (line 283) | fn build_from_clause(&self, from: Vec<ast::From>, scope: &mut Scope) -> ...
function build_from (line 306) | fn build_from(&self, from: ast::From, parent_scope: &mut Scope) -> Resul...
function build_aggregate (line 365) | fn build_aggregate(
function build_aggregate_function (line 391) | fn build_aggregate_function(expr: ast::Expression, scope: &Scope) -> Res...
function is_aggregate_function (line 417) | fn is_aggregate_function(expr: &ast::Expression) -> bool {
function collect_aggregates (line 425) | fn collect_aggregates(
function build_select_hidden (line 452) | fn build_select_hidden(
function build_expression (line 495) | pub fn build_expression(expr: ast::Expression, scope: &Scope) -> Result<...
function build_constant_value (line 572) | fn build_constant_value(expr: ast::Expression) -> Result<Value> {
type Scope (line 589) | pub struct Scope {
method new (line 614) | pub fn new() -> Self {
method from_table (line 619) | fn from_table(table: &Table) -> Result<Self> {
method spawn (line 626) | pub fn spawn(&self) -> Self {
method add_table (line 634) | fn add_table(&mut self, table: &Table, alias: Option<&str>) -> Result<...
method add_column (line 648) | fn add_column(&mut self, label: Label) -> usize {
method lookup_column (line 661) | fn lookup_column(&self, table: Option<&str>, name: &str) -> Result<usi...
method add_aggregate (line 692) | fn add_aggregate(&mut self, expr: &ast::Expression, parent: &Scope) ->...
method lookup_aggregate (line 712) | fn lookup_aggregate(&self, expr: &ast::Expression) -> Option<usize> {
method add_passthrough (line 718) | fn add_passthrough(&mut self, parent: &Scope, parent_index: usize, hid...
method merge (line 732) | fn merge(&mut self, scope: Scope) -> Result<()> {
method project (line 756) | fn project(&self, expressions: &[(ast::Expression, Option<String>)]) -...
method remap (line 781) | fn remap(&self, targets: &[Option<usize>]) -> Self {
method remove_hidden (line 791) | fn remove_hidden(&mut self) -> Option<HashSet<usize>> {
method remap_hidden (line 813) | fn remap_hidden(&mut self) -> Option<Vec<Option<usize>>> {
FILE: src/sql/types/expression.rs
type Expression (line 19) | pub enum Expression {
method display (line 69) | pub fn display<'a>(&'a self, node: &'a Node) -> ExpressionDisplay<'a> {
method evaluate (line 75) | pub fn evaluate(&self, row: Option<&Row>) -> Result<Value> {
method walk (line 212) | pub fn walk(&self, visitor: &mut impl FnMut(&Expression) -> bool) -> b...
method contains (line 244) | pub fn contains(&self, visitor: &impl Fn(&Expression) -> bool) -> bool {
method transform (line 250) | pub fn transform(
method into_cnf (line 292) | pub fn into_cnf(self) -> Self {
method into_cnf_vec (line 314) | pub fn into_cnf_vec(self) -> Vec<Self> {
method into_nnf (line 331) | pub fn into_nnf(self) -> Self {
method and_vec (line 354) | pub fn and_vec(exprs: Vec<Expression>) -> Option<Self> {
method is_column_lookup (line 365) | pub fn is_column_lookup(&self) -> Option<usize> {
method into_column_values (line 393) | pub fn into_column_values(self, index: usize) -> Vec<Value> {
method replace_column (line 424) | pub fn replace_column(self, from: usize, to: usize) -> Self {
method shift_column (line 433) | pub fn shift_column(self, diff: isize) -> Self {
method from (line 538) | fn from(value: Value) -> Self {
method fmt (line 446) | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
type ExpressionDisplay (line 453) | pub struct ExpressionDisplay<'a> {
method fmt (line 460) | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
function new (line 514) | pub fn new(expr: &'a Expression, node: &'a Node, parent_precedence: u8) ...
function precedence (line 519) | fn precedence(expr: &Expression) -> u8 {
function from (line 544) | fn from(value: Value) -> Self {
FILE: src/sql/types/schema.rs
type Table (line 17) | pub struct Table {
method validate (line 100) | pub fn validate(&self, catalog: &impl Catalog) -> Result<()> {
method validate_row (line 181) | pub fn validate_row(&self, row: &Row, update: bool, txn: &impl Transac...
type Column (line 31) | pub struct Column {
method fmt (line 57) | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
FILE: src/sql/types/value.rs
type DataType (line 18) | pub enum DataType {
method fmt (line 30) | fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
type Value (line 42) | pub enum Value {
method datatype (line 166) | pub fn datatype(&self) -> Option<DataType> {
method is_undefined (line 177) | pub fn is_undefined(&self) -> bool {
method checked_add (line 186) | pub fn checked_add(&self, other: &Self) -> Result<Self> {
method checked_div (line 204) | pub fn checked_div(&self, other: &Self) -> Result<Self> {
method checked_mul (line 220) | pub fn checked_mul(&self, other: &Self) -> Result<Self> {
method checked_pow (line 238) | pub fn checked_pow(&self, other: &Self) -> Result<Self> {
method checked_rem (line 264) | pub fn checked_rem(&self, other: &Self) -> Result<Self> {
method checked_sub (line 280) | pub fn checked_sub(&self, other: &Self) -> Result<Self> {
method from (line 299) | fn from(v: bool) -> Self {
method from (line 305) | fn from(v: f64) -> Self {
method from (line 311) | fn from(v: i64) -> Self {
method from (line 317) | fn from(v: String) -> Self {
method from (line 323) | fn from(v: &str) -> Self {
method fmt (line 69) | fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
function serialize_f64 (line 83) | fn serialize_f64<S: Serializer>(value: &f64, serializer: S) -> StdResult...
method eq (line 93) | fn eq(&self, other: &Self) -> bool {
method hash (line 111) | fn hash<H: Hasher>(&self, hasher: &mut H) {
method cmp (line 135) | fn cmp(&self, other: &Self) -> Ordering {
method partial_cmp (line 159) | fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
type Error (line 329) | type Error = Error;
function try_from (line 331) | fn try_from(value: Value) -> Result<Self> {
type Error (line 340) | type Error = Error;
function try_from (line 342) | fn try_from(value: Value) -> Result<Self> {
type Error (line 351) | type Error = Error;
function try_from (line 353) | fn try_from(value: Value) -> Result<Self> {
type Error (line 362) | type Error = Error;
method try_from (line 364) | fn try_from(value: Value) -> Result<Self> {
function from (line 373) | fn from(v: &'a Value) -> Self {
type Row (line 379) | pub type Row = Vec<Value>;
type Rows (line 382) | pub type Rows = Box<dyn RowIterator>;
type RowIterator (line 388) | pub trait RowIterator: Iterator<Item = Result<Row>> + DynClone {}
type Label (line 396) | pub enum Label {
method as_header (line 417) | pub fn as_header(&self) -> &str {
method from (line 437) | fn from(name: Option<String>) -> Self {
method fmt (line 406) | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
function from (line 427) | fn from(label: Label) -> Self {
FILE: src/storage/bitcask.rs
type BitCask (line 50) | pub struct BitCask {
method new (line 75) | pub fn new(path: PathBuf) -> Result<Self> {
method new_maybe_compact (line 84) | pub fn new_maybe_compact(
method compact (line 173) | pub fn compact(&mut self) -> Result<()> {
type KeyDir (line 58) | type KeyDir = BTreeMap<Vec<u8>, ValueLocation>;
type ValueLocation (line 62) | struct ValueLocation {
method end (line 68) | fn end(&self) -> u64 {
type ScanIterator (line 119) | type ScanIterator<'a> = ScanIterator<'a>;
method delete (line 121) | fn delete(&mut self, key: &[u8]) -> Result<()> {
method flush (line 127) | fn flush(&mut self) -> Result<()> {
method get (line 136) | fn get(&mut self, key: &[u8]) -> Result<Option<Vec<u8>>> {
method scan (line 143) | fn scan(&mut self, range: impl RangeBounds<Vec<u8>>) -> Self::ScanIterat...
method scan_dyn (line 147) | fn scan_dyn(
method set (line 154) | fn set(&mut self, key: &[u8], value: Vec<u8>) -> Result<()> {
method status (line 160) | fn status(&mut self) -> Result<Status> {
method drop (line 199) | fn drop(&mut self) {
type ScanIterator (line 206) | pub struct ScanIterator<'a> {
function map (line 212) | fn map(&mut self, item: (&Vec<u8>, &ValueLocation)) -> <Self as Iterator...
type Item (line 219) | type Item = Result<(Vec<u8>, Vec<u8>)>;
method next (line 221) | fn next(&mut self) -> Option<Self::Item> {
method next_back (line 227) | fn next_back(&mut self) -> Option<Self::Item> {
type Log (line 239) | struct Log {
method new (line 250) | fn new(path: PathBuf) -> Result<Self> {
method build_keydir (line 269) | fn build_keydir(&mut self) -> Result<KeyDir> {
method read_value (line 334) | fn read_value(&mut self, location: ValueLocation) -> Result<Vec<u8>> {
method write_entry (line 344) | fn write_entry(&mut self, key: &[u8], value: Option<&[u8]>) -> Result<...
function test_goldenscript (line 387) | fn test_goldenscript(path: &std::path::Path) {
function lock (line 394) | fn lock() -> Result<()> {
function recovery (line 410) | fn recovery() -> Result<()> {
function point_ops_sizes (line 460) | fn point_ops_sizes() -> Result<()> {
type BitCaskRunner (line 480) | struct BitCaskRunner {
method run (line 486) | fn run(&mut self, command: &goldenscript::Command) -> StdResult<String...
method new (line 531) | fn new() -> Self {
method dump (line 539) | fn dump(&mut self, output: &mut String) -> StdResult<(), Box<dyn StdEr...
FILE: src/storage/engine.rs
type Engine (line 22) | pub trait Engine: Send {
method delete (line 29) | fn delete(&mut self, key: &[u8]) -> Result<()>;
method flush (line 32) | fn flush(&mut self) -> Result<()>;
method get (line 35) | fn get(&mut self, key: &[u8]) -> Result<Option<Vec<u8>>>;
method scan (line 38) | fn scan(&mut self, range: impl RangeBounds<Vec<u8>>) -> Self::ScanIter...
method scan_dyn (line 43) | fn scan_dyn(&mut self, range: (Bound<Vec<u8>>, Bound<Vec<u8>>)) -> Box...
method scan_prefix (line 46) | fn scan_prefix(&mut self, prefix: &[u8]) -> Self::ScanIterator<'_>
method set (line 54) | fn set(&mut self, key: &[u8], value: Vec<u8>) -> Result<()>;
method status (line 57) | fn status(&mut self) -> Result<Status>;
type ScanIterator (line 258) | type ScanIterator<'a>
method flush (line 263) | fn flush(&mut self) -> Result<()> {
method delete (line 269) | fn delete(&mut self, key: &[u8]) -> Result<()> {
method get (line 275) | fn get(&mut self, key: &[u8]) -> Result<Option<Vec<u8>>> {
method scan (line 279) | fn scan(&mut self, range: impl RangeBounds<Vec<u8>>) -> Self::ScanIter...
method scan_dyn (line 283) | fn scan_dyn(
method set (line 290) | fn set(&mut self, key: &[u8], value: Vec<u8>) -> Result<()> {
method status (line 296) | fn status(&mut self) -> Result<Status> {
type ScanIterator (line 316) | type ScanIterator<'a>
method delete (line 323) | fn delete(&mut self, key: &[u8]) -> Result<()> {
method flush (line 328) | fn flush(&mut self) -> Result<()> {
method get (line 333) | fn get(&mut self, key: &[u8]) -> Result<Option<Vec<u8>>> {
method scan (line 340) | fn scan(&mut self, range: impl RangeBounds<Vec<u8>>) -> Self::ScanIter...
method scan_dyn (line 349) | fn scan_dyn(
method set (line 358) | fn set(&mut self, key: &[u8], value: Vec<u8>) -> Result<()> {
method status (line 363) | fn status(&mut self) -> Result<Status> {
type ScanIterator (line 61) | pub trait ScanIterator: DoubleEndedIterator<Item = Result<(Vec<u8>, Vec<...
type Status (line 68) | pub struct Status {
method garbage_disk_size (line 83) | pub fn garbage_disk_size(&self) -> u64 {
method garbage_disk_percent (line 88) | pub fn garbage_disk_percent(&self) -> f64 {
type Runner (line 114) | pub struct Runner<E: Engine> {
function new (line 119) | pub fn new(engine: E) -> Self {
function run (line 125) | fn run(&mut self, command: &goldenscript::Command) -> StdResult<String, ...
function decode_binary (line 204) | pub fn decode_binary(s: &str) -> Vec<u8> {
function parse_key_range (line 218) | pub fn parse_key_range(s: &str) -> StdResult<impl RangeBounds<Vec<u8>>, ...
type Emit (line 237) | pub struct Emit<E: Engine> {
type Operation (line 245) | pub enum Operation {
function new (line 252) | pub fn new(inner: E, tx: Sender<Operation>) -> Self {
type Mirror (line 304) | pub struct Mirror<A: Engine, B: Engine> {
function new (line 310) | pub fn new(a: A, b: B) -> Self {
type MirrorIterator (line 373) | pub struct MirrorIterator<'a, A: Engine + 'a, B: Engine + 'a> {
type Item (line 379) | type Item = Result<(Vec<u8>, Vec<u8>)>;
method next (line 381) | fn next(&mut self) -> Option<Self::Item> {
method next_back (line 390) | fn next_back(&mut self) -> Option<Self::Item> {
FILE: src/storage/memory.rs
type Memory (line 11) | pub struct Memory(BTreeMap<Vec<u8>, Vec<u8>>);
method new (line 15) | pub fn new() -> Self {
type ScanIterator (line 21) | type ScanIterator<'a> = ScanIterator<'a>;
method delete (line 23) | fn delete(&mut self, key: &[u8]) -> Result<()> {
method flush (line 28) | fn flush(&mut self) -> Result<()> {
method get (line 32) | fn get(&mut self, key: &[u8]) -> Result<Option<Vec<u8>>> {
method scan (line 36) | fn scan(&mut self, range: impl RangeBounds<Vec<u8>>) -> Self::ScanIterat...
method scan_dyn (line 40) | fn scan_dyn(
method set (line 47) | fn set(&mut self, key: &[u8], value: Vec<u8>) -> Result<()> {
method status (line 52) | fn status(&mut self) -> Result<Status> {
type ScanIterator (line 63) | pub struct ScanIterator<'a>(Range<'a, Vec<u8>, Vec<u8>>);
type Item (line 66) | type Item = Result<(Vec<u8>, Vec<u8>)>;
method next (line 68) | fn next(&mut self) -> Option<Self::Item> {
method next_back (line 74) | fn next_back(&mut self) -> Option<Self::Item> {
function test_goldenscript (line 95) | fn test_goldenscript(path: &Path) {
FILE: src/storage/mvcc.rs
type Version (line 158) | pub type Version = u64;
type Key (line 167) | pub enum Key<'a> {
type KeyPrefix (line 205) | enum KeyPrefix<'a> {
type MVCC (line 229) | pub struct MVCC<E: Engine> {
function new (line 235) | pub fn new(engine: E) -> Self {
function begin (line 240) | pub fn begin(&self) -> Result<Transaction<E>> {
function begin_read_only (line 245) | pub fn begin_read_only(&self) -> Result<Transaction<E>> {
function begin_as_of (line 250) | pub fn begin_as_of(&self, version: Version) -> Result<Transaction<E>> {
function resume (line 255) | pub fn resume(&self, state: TransactionState) -> Result<Transaction<E>> {
function get_unversioned (line 260) | pub fn get_unversioned(&self, key: &[u8]) -> Result<Option<Vec<u8>>> {
function set_unversioned (line 265) | pub fn set_unversioned(&self, key: &[u8], value: Vec<u8>) -> Result<()> {
function status (line 270) | pub fn status(&self) -> Result<Status> {
type Status (line 283) | pub struct Status {
type Transaction (line 295) | pub struct Transaction<E: Engine> {
type TransactionState (line 315) | pub struct TransactionState {
method is_visible (line 344) | fn is_visible(&self, version: Version) -> bool {
function from (line 356) | fn from(txn: TransactionState) -> Self {
function from (line 362) | fn from(txn: &'a TransactionState) -> Self {
function begin (line 371) | fn begin(engine: Arc<Mutex<E>>) -> Result<Self> {
function begin_read_only (line 397) | fn begin_read_only(engine: Arc<Mutex<E>>, as_of: Option<Version>) -> Res...
function resume (line 428) | fn resume(engine: Arc<Mutex<E>>, s: TransactionState) -> Result<Self> {
function scan_active (line 438) | fn scan_active(session: &mut MutexGuard<E>) -> Result<BTreeSet<Version>> {
function version (line 451) | pub fn version(&self) -> Version {
function read_only (line 456) | pub fn read_only(&self) -> bool {
function state (line 462) | pub fn state(&self) -> &TransactionState {
function commit (line 472) | pub fn commit(self) -> Result<()> {
function rollback (line 490) | pub fn rollback(self) -> Result<()> {
function delete (line 515) | pub fn delete(&self, key: &[u8]) -> Result<()> {
function set (line 520) | pub fn set(&self, key: &[u8], value: Vec<u8>) -> Result<()> {
function write_version (line 528) | fn write_version(&self, key: &[u8], value: Option<Vec<u8>>) -> Result<()> {
function get (line 565) | pub fn get(&self, key: &[u8]) -> Result<Option<Vec<u8>>> {
function scan (line 585) | pub fn scan(&self, range: impl RangeBounds<Vec<u8>>) -> ScanIterator<E> {
function scan_prefix (line 600) | pub fn scan_prefix(&self, prefix: &[u8]) -> ScanIterator<E> {
type ScanIterator (line 620) | pub struct ScanIterator<E: Engine> {
method clone (line 635) | fn clone(&self) -> Self {
constant BUFFER_SIZE (line 648) | const BUFFER_SIZE: usize = if cfg!(test) { 2 } else { 32 };
function new (line 651) | fn new(
function fill_buffer (line 661) | fn fill_buffer(&mut self) -> Result<()> {
type Item (line 702) | type Item = Result<(Vec<u8>, Vec<u8>)>;
method next (line 704) | fn next(&mut self) -> Option<Self::Item> {
type VersionIterator (line 716) | struct VersionIterator<'a, I: engine::ScanIterator> {
function new (line 725) | fn new(txn: &'a TransactionState, inner: I) -> Self {
function try_next (line 730) | fn try_next(&mut self) -> Result<Option<(Vec<u8>, Version, Vec<u8>)>> {
type Item (line 745) | type Item = Result<(Vec<u8>, Version, Vec<u8>)>;
method next (line 747) | fn next(&mut self) -> Option<Self::Item> {
function test_goldenscript (line 774) | fn test_goldenscript(path: &Path) {
function key_prefix (line 785) | fn key_prefix(prefix: KeyPrefix, key: Key) {
type MVCCRunner (line 792) | pub struct MVCCRunner {
method new (line 802) | fn new() -> Self {
method get_txn (line 815) | fn get_txn(
method txn_name (line 824) | fn txn_name(prefix: &Option<String>) -> Result<&str, Box<dyn Error>> {
method no_txn (line 829) | fn no_txn(command: &goldenscript::Command) -> Result<(), Box<dyn Error...
method run (line 838) | fn run(&mut self, command: &goldenscript::Command) -> Result<String, B...
method end_command (line 1067) | fn end_command(&mut self, _: &goldenscript::Command) -> Result<String,...
type TestEngine (line 799) | type TestEngine = Emit<Mirror<BitCask, Memory>>;
FILE: tests/testcluster.rs
constant TIMEOUT (line 13) | const TIMEOUT: Duration = Duration::from_secs(5);
constant SQL_BASE_PORT (line 16) | const SQL_BASE_PORT: u16 = 19600;
constant RAFT_BASE_PORT (line 19) | const RAFT_BASE_PORT: u16 = 19700;
type TestCluster (line 29) | pub struct TestCluster {
method run (line 39) | pub fn run(nodes: u8) -> Result<Self, Box<dyn Error>> {
method connect (line 73) | pub fn connect(&self) -> Result<Client, Box<dyn Error>> {
type NodePorts (line 35) | type NodePorts = BTreeMap<NodeID, (u16, u16)>;
type TestServer (line 80) | pub struct TestServer {
method run (line 88) | fn run(id: NodeID, dir: &Path, ports: &NodePorts) -> Result<Self, Box<...
method build_config (line 112) | fn build_config(id: NodeID, dir: &Path, ports: &NodePorts) -> Result<S...
method assert_alive (line 131) | fn assert_alive(&mut self) {
method connect (line 138) | fn connect(&self) -> Result<Client, Box<dyn Error>> {
method drop (line 145) | fn drop(&mut self) {
FILE: tests/tests.rs
function test_goldenscript (line 27) | fn test_goldenscript(path: &Path) {
type Runner (line 40) | struct Runner {
method new (line 46) | fn new() -> Self {
method get_client (line 51) | fn get_client(&mut self, prefix: &Option<String>) -> Result<&mut Clien...
method client_name (line 64) | fn client_name(prefix: &Option<String>) -> &str {
method run (line 71) | fn run(&mut self, command: &goldenscript::Command) -> Result<String, B...
Condensed preview — 284 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (1,580K chars).
[
{
"path": ".github/workflows/ci.yml",
"chars": 704,
"preview": "name: CI\non: [push, pull_request, workflow_dispatch]\npermissions:\n contents: read\n\njobs:\n test:\n name: Test\n run"
},
{
"path": ".gitignore",
"chars": 84,
"preview": "/cluster/toydb*/data\n/data\n/docs/crate/target\n/target\n.DS_Store\n.vscode/\n**/*.rs.bk\n"
},
{
"path": "Cargo.toml",
"chars": 1060,
"preview": "[package]\nname = \"toydb\"\nversion = \"1.0.0\"\ndescription = \"A simple distributed SQL database, built for education\"\nauthor"
},
{
"path": "LICENSE",
"chars": 11356,
"preview": " Apache License\n Version 2.0, January 2004\n "
},
{
"path": "README.md",
"chars": 8443,
"preview": "# <a><img src=\"./docs/architecture/images/toydb.svg\" height=\"40\" valign=\"top\" /></a> toyDB\n\nDistributed SQL database in "
},
{
"path": "cluster/run.sh",
"chars": 939,
"preview": "#!/usr/bin/env bash\n#\n# This script builds and runs a 5-node toyDB cluster listening on ports\n# 9601-9605. Config and da"
},
{
"path": "cluster/toydb1/toydb.yaml",
"chars": 177,
"preview": "id: 1\ndata_dir: toydb1/data\nlisten_sql: localhost:9601\nlisten_raft: localhost:9701\npeers:\n '2': localhost:9702\n '3': l"
},
{
"path": "cluster/toydb2/toydb.yaml",
"chars": 177,
"preview": "id: 2\ndata_dir: toydb2/data\nlisten_sql: localhost:9602\nlisten_raft: localhost:9702\npeers:\n '1': localhost:9701\n '3': l"
},
{
"path": "cluster/toydb3/toydb.yaml",
"chars": 177,
"preview": "id: 3\ndata_dir: toydb3/data\nlisten_sql: localhost:9603\nlisten_raft: localhost:9703\npeers:\n '1': localhost:9701\n '2': l"
},
{
"path": "cluster/toydb4/toydb.yaml",
"chars": 177,
"preview": "id: 4\ndata_dir: toydb4/data\nlisten_sql: localhost:9604\nlisten_raft: localhost:9704\npeers:\n '1': localhost:9701\n '2': l"
},
{
"path": "cluster/toydb5/toydb.yaml",
"chars": 177,
"preview": "id: 5\ndata_dir: toydb5/data\nlisten_sql: localhost:9605\nlisten_raft: localhost:9705\npeers:\n '1': localhost:9701\n '2': l"
},
{
"path": "config/toydb.yaml",
"chars": 1112,
"preview": "# The node ID (must be unique in the cluster), and map of peer IDs and Raft\n# addresses (empty for single node).\nid: 1\np"
},
{
"path": "docs/architecture/README.md",
"chars": 27,
"preview": "See [`index.md`](index.md)."
},
{
"path": "docs/architecture/client.md",
"chars": 2876,
"preview": "# Client\n\nThe toyDB client is in the [`client`](https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a817"
},
{
"path": "docs/architecture/encoding.md",
"chars": 7565,
"preview": "# Key/Value Encoding\n\nThe key/value store uses binary `Vec<u8>` keys and values, so we need an encoding scheme to \ntrans"
},
{
"path": "docs/architecture/index.md",
"chars": 3203,
"preview": "# toyDB Architecture\n\ntoyDB is a simple distributed SQL database, intended to illustrate how such systems are built. The"
},
{
"path": "docs/architecture/mvcc.md",
"chars": 9611,
"preview": "# MVCC Transactions\n\nTransactions are groups of reads and writes (e.g. to different keys) that are submitted together as"
},
{
"path": "docs/architecture/overview.md",
"chars": 2677,
"preview": "# Overview\n\ntoyDB consists of a cluster of nodes that execute [SQL](https://en.wikipedia.org/wiki/SQL)\ntransactions agai"
},
{
"path": "docs/architecture/raft.md",
"chars": 25854,
"preview": "# Raft Consensus\n\n[Raft](https://raft.github.io) is a distributed consensus protocol which replicates data across a\nclus"
},
{
"path": "docs/architecture/server.md",
"chars": 5939,
"preview": "# Server\n\nNow that we've gone over the individual components, we'll tie them all together in the toyDB\nserver `toydb::Se"
},
{
"path": "docs/architecture/sql-data.md",
"chars": 6220,
"preview": "# SQL Data Model\n\nThe SQL data model represents user data in tables and rows. It is made up of data types and schemas,\ni"
},
{
"path": "docs/architecture/sql-execution.md",
"chars": 6018,
"preview": "# SQL Execution\n\nNow that the planner and optimizer have done all the hard work of figuring out how to execute a\nquery, "
},
{
"path": "docs/architecture/sql-optimizer.md",
"chars": 14117,
"preview": "# SQL Optimization\n\n[Query optimization](https://en.wikipedia.org/wiki/Query_optimization) attempts to improve query\nper"
},
{
"path": "docs/architecture/sql-parser.md",
"chars": 9237,
"preview": "# SQL Parsing\n\nWe finally arrive at SQL. The SQL parser is the first stage in processing SQL queries and\nstatements, loc"
},
{
"path": "docs/architecture/sql-planner.md",
"chars": 11715,
"preview": "# SQL Planning\n\nThe SQL planner in the [`sql::planner`](https://github.com/erikgrinaker/toydb/tree/c64012e29c5712d6fe028"
},
{
"path": "docs/architecture/sql-raft.md",
"chars": 5163,
"preview": "# SQL Raft Replication\n\ntoyDB uses Raft to replicate SQL storage across a cluster of nodes (see the Raft section for\ndet"
},
{
"path": "docs/architecture/sql-storage.md",
"chars": 9491,
"preview": "# SQL Storage\n\nThe SQL storage engine, in the [`sql::engine`](https://github.com/erikgrinaker/toydb/tree/213e5c02b09f1a3"
},
{
"path": "docs/architecture/sql.md",
"chars": 1107,
"preview": "# SQL Engine\n\nThe SQL engine provides support for the SQL query language, and is the main database interface. It\nuses a "
},
{
"path": "docs/architecture/storage.md",
"chars": 5429,
"preview": "# Storage Engine\n\ntoyDB uses an embedded [key/value store](https://en.wikipedia.org/wiki/Key–value_database) for data\nst"
},
{
"path": "docs/architecture.md",
"chars": 58,
"preview": "Moved to [`architecture/index.md`](architecture/index.md)."
},
{
"path": "docs/crate/Cargo.toml",
"chars": 306,
"preview": "[package]\nname = \"toydb\"\nversion = \"1.0.1\"\ndescription = \"A simple distributed SQL database, built for education\"\nauthor"
},
{
"path": "docs/crate/README.md",
"chars": 687,
"preview": "# toyDB\n\ntoyDB is a distributed SQL database in Rust, built from scratch as an educational project. Main\nfeatures:\n\n* Ra"
},
{
"path": "docs/crate/src/lib.rs",
"chars": 186,
"preview": "//! This crate is just a simple README.md placeholder. toydb is not intended to be used as a\n//! library, and is not dis"
},
{
"path": "docs/examples.md",
"chars": 17047,
"preview": "# SQL Examples\n\nThe following examples demonstrate some of toyDB's SQL features. For more details, see the\n[SQL referenc"
},
{
"path": "docs/references.md",
"chars": 3301,
"preview": "# References\n\nThis is the main research material I used while building toyDB. It is a subset of my\n[reading list](https:"
},
{
"path": "docs/sql.md",
"chars": 17226,
"preview": "# SQL Reference\n\n## Data Types\n\nThe following data types are supported:\n\n* `BOOLEAN` (`BOOL`): logical truth values, i.e"
},
{
"path": "docs/tools/update-links.py",
"chars": 1918,
"preview": "#!/usr/bin/env python3\n#\n# Updates GitHub code links to the latest commit SHA.\n\nimport os, re, sys, argparse\nimport requ"
},
{
"path": "rust-toolchain",
"chars": 6,
"preview": "1.93.1"
},
{
"path": "rustfmt.toml",
"chars": 29,
"preview": "use_small_heuristics = \"Max\"\n"
},
{
"path": "src/bin/toydb.rs",
"chars": 5069,
"preview": "//! The toyDB server. Takes configuration from a config file (default\n//! config/toydb.yaml) or corresponding TOYDB_ env"
},
{
"path": "src/bin/toydump.rs",
"chars": 1543,
"preview": "//! toydump is a debug tool that prints a toyDB BitCask database in\n//! human-readable form. It can print both the SQL d"
},
{
"path": "src/bin/toysql.rs",
"chars": 11032,
"preview": "//! toySQL is a command-line client for toyDB. It connects to a toyDB node\n//! (default localhost:9601) and executes SQL"
},
{
"path": "src/bin/workload.rs",
"chars": 18577,
"preview": "//! Runs toyDB workload benchmarks. By default, it assumes a running 5-node\n//! cluster as launched via cluster/run.sh, "
},
{
"path": "src/client.rs",
"chars": 4563,
"preview": "use std::io::{BufReader, BufWriter, Write as _};\nuse std::net::{TcpStream, ToSocketAddrs};\nuse std::time::Duration;\n\nuse"
},
{
"path": "src/encoding/bincode.rs",
"chars": 2070,
"preview": "//! Bincode is used to encode values, both in key/value stores and the toyDB\n//! network protocol. It is a Rust-specific"
},
{
"path": "src/encoding/format.rs",
"chars": 9850,
"preview": "//! Formats raw keys and values, recursively where necessary. Handles both both\n//! Raft, MVCC, SQL, and raw binary data"
},
{
"path": "src/encoding/keycode.rs",
"chars": 24555,
"preview": "//! Keycode is a lexicographical order-preserving binary encoding for use with\n//! keys in key/value stores. It is desig"
},
{
"path": "src/encoding/mod.rs",
"chars": 2558,
"preview": "//! Binary data encodings.\n//!\n//! * keycode: used for keys in the key/value store.\n//! * bincode: used for values in th"
},
{
"path": "src/error.rs",
"chars": 6921,
"preview": "use std::fmt::Display;\n\nuse serde::{Deserialize, Serialize};\n\n/// toyDB errors.\n#[derive(Clone, Debug, PartialEq, Serial"
},
{
"path": "src/lib.rs",
"chars": 331,
"preview": "#![warn(clippy::all)]\n#![allow(clippy::large_enum_variant)]\n#![allow(clippy::module_inception)]\n#![allow(clippy::type_co"
},
{
"path": "src/raft/log.rs",
"chars": 27981,
"preview": "use std::ops::{Bound, RangeBounds};\n\nuse serde::{Deserialize, Serialize};\n\nuse super::{NodeID, Term};\nuse crate::encodin"
},
{
"path": "src/raft/message.rs",
"chars": 8402,
"preview": "use std::collections::BTreeMap;\n\nuse serde::{Deserialize, Serialize};\n\nuse super::{Entry, Index, NodeID, Term};\nuse crat"
},
{
"path": "src/raft/mod.rs",
"chars": 14390,
"preview": "//! Implements the Raft distributed consensus protocol.\n//!\n//! For details, see Diego Ongaro's original writings:\n//!\n/"
},
{
"path": "src/raft/node.rs",
"chars": 92618,
"preview": "use std::cmp::{max, min};\nuse std::collections::{HashMap, HashSet, VecDeque};\nuse std::ops::Range;\n\nuse crossbeam::chann"
},
{
"path": "src/raft/state.rs",
"chars": 7504,
"preview": "use super::{Entry, Index};\nuse crate::error::Result;\n\n/// A Raft-managed state machine. Raft itself does not care what t"
},
{
"path": "src/raft/testscripts/log/append",
"chars": 1721,
"preview": "# Appending an entry with term 0 fails.\n!append foo\n---\nPanic: can't append entry in term 0\n\n# Appending to an empty log"
},
{
"path": "src/raft/testscripts/log/commit",
"chars": 1920,
"preview": "# Committing fails on an empty engine.\n!commit 1\n---\nPanic: commit index 1 does not exist\n\n# Add some entries.\nset_term "
},
{
"path": "src/raft/testscripts/log/get",
"chars": 350,
"preview": "# get returns None on an empty engine.\nget 1\n---\nNone\n\n# Append a few entries.\nset_term 1\nappend\nappend foo\nset_term 2\na"
},
{
"path": "src/raft/testscripts/log/has",
"chars": 452,
"preview": "# has returns false on an empty engine.\nhas 1@1\n---\nfalse\n\n# Append a few entries.\nset_term 1\nappend\nappend foo\nset_term"
},
{
"path": "src/raft/testscripts/log/init",
"chars": 331,
"preview": "# Tests that the log correctly initializes cached state when opened.\n\nset_term 1\n---\nok\n\nappend foo\nset_term 2 7\nappend "
},
{
"path": "src/raft/testscripts/log/scan",
"chars": 835,
"preview": "# scan works on an empty engine, even when given indexes.\nscan\nscan 3..7\n---\nok\n\n# Append a few entries.\nset_term 1\nappe"
},
{
"path": "src/raft/testscripts/log/scan_apply",
"chars": 846,
"preview": "# scan_apply works on an empty engine, even when given an applied index.\nscan_apply 0\nscan_apply 3\n---\nok\n\n# Append a fe"
},
{
"path": "src/raft/testscripts/log/splice",
"chars": 5173,
"preview": "# Splicing at index 0 should fail.\n!splice 0@1=foo\n---\nPanic: spliced entry has index or term 0\n\n# Splicing without a te"
},
{
"path": "src/raft/testscripts/log/status",
"chars": 556,
"preview": "# Status on empty engine works.\nstatus engine=true\n---\nterm=0 last=0@0 commit=0@0 vote=None engine=Status {\n name: \"b"
},
{
"path": "src/raft/testscripts/log/term",
"chars": 1177,
"preview": "# get_term works on empty engine.\nget_term\n---\nterm=0 vote=None\n\n# Storing a 0 term errors.\n!set_term 0\n---\nPanic: can't"
},
{
"path": "src/raft/testscripts/node/append",
"chars": 1125,
"preview": "# Can append single entries in steady state.\n\ncluster nodes=3 leader=1\n---\nn1@1 leader last=1@1 commit=1@1 applied=1 pro"
},
{
"path": "src/raft/testscripts/node/append_base_missing",
"chars": 1995,
"preview": "# Appends with a base beyond the node's last log entry should result in a\n# rejection at the index following the last en"
},
{
"path": "src/raft/testscripts/node/append_base_missing_all",
"chars": 1985,
"preview": "# Appends to a node with an empty log should result in a rejection of index 1,\n# allowing the leader to send the entire "
},
{
"path": "src/raft/testscripts/node/append_commit_quorum",
"chars": 7606,
"preview": "# Append results in a leader-side commit once a quorum is reached for the\n# relevant entries.\n\ncluster nodes=6 leader=1\n"
},
{
"path": "src/raft/testscripts/node/append_initial",
"chars": 2360,
"preview": "# An initial append at base 0 can have a single or multiple entries.\n\ncluster nodes=3\n---\nn1@0 follower() last=0@0 commi"
},
{
"path": "src/raft/testscripts/node/append_max_entries",
"chars": 1722,
"preview": "# Large appends are limited to MAX_APPEND_ENTRIES, and each successful append\n# triggers the next append batch.\n\ncluster"
},
{
"path": "src/raft/testscripts/node/append_pipeline",
"chars": 2379,
"preview": "# Multiple appends are pipelined before acks are received, without\n# retransmitting the unacked entries.\n\ncluster nodes="
},
{
"path": "src/raft/testscripts/node/append_probe_divergent_first",
"chars": 6783,
"preview": "# Appends to a previous leader and follower with a divergent tail all\n# the way back to the first entry works.\n\ncluster "
},
{
"path": "src/raft/testscripts/node/append_probe_divergent_long",
"chars": 7811,
"preview": "# Appends to a previous leader and follower with a long divergent tail requires\n# the leader to repeatedly probe until i"
},
{
"path": "src/raft/testscripts/node/append_probe_divergent_short",
"chars": 5734,
"preview": "# Appends to a previous leader and follower with a shorter divergent tail skips\n# the missing entries before probing.\n\nc"
},
{
"path": "src/raft/testscripts/node/append_probe_divergent_single",
"chars": 3429,
"preview": "# An append replaces a conflict at the tail for a single term.\n\ncluster nodes=5 leader=1\n---\nn1@1 leader last=1@1 commit"
},
{
"path": "src/raft/testscripts/node/append_response_beyond_last_index_panics",
"chars": 661,
"preview": "# A successful AppendResponse with last index beyond leader's last log\n# should panic.\n\ncluster nodes=3 leader=1\n---\nn1@"
},
{
"path": "src/raft/testscripts/node/append_response_stale_reject",
"chars": 1880,
"preview": "# A successful AppendResponse with a reject_index below the match index\n# should be ignored.\n\ncluster nodes=3 leader=1\n-"
},
{
"path": "src/raft/testscripts/node/election",
"chars": 1922,
"preview": "# A node campaigns and wins leadership once the election timeout passes. Uses\n# ticks directly to also test tick handlin"
},
{
"path": "src/raft/testscripts/node/election_candidate_behind_leader",
"chars": 4267,
"preview": "# A candidate that lags behind the leader can still win the election\n# as long as it isn't behind the quorum.\n\ncluster n"
},
{
"path": "src/raft/testscripts/node/election_candidate_behind_quorum",
"chars": 2310,
"preview": "# A candidate that lags behind the quorum can't win an election.\n\ncluster nodes=5 leader=1\n---\nn1@1 leader last=1@1 comm"
},
{
"path": "src/raft/testscripts/node/election_contested",
"chars": 2731,
"preview": "# A leader can be elected even when there are multiple candidates.\n\ncluster nodes=5 election_timeout=2\n---\nn1@0 follower"
},
{
"path": "src/raft/testscripts/node/election_tie",
"chars": 2089,
"preview": "# No leader can be elected with an election tie.\n\ncluster nodes=3 election_timeout=2\n---\nn1@0 follower() last=0@0 commit"
},
{
"path": "src/raft/testscripts/node/election_tie_even",
"chars": 2695,
"preview": "# No leader can be elected with an election tie between an even number of nodes.\n\ncluster nodes=4 election_timeout=2\n---"
},
{
"path": "src/raft/testscripts/node/heartbeat_commits_follower",
"chars": 1561,
"preview": "# A heartbeat will commit and apply an entry on a follower.\n\ncluster nodes=3 leader=1\n---\nn1@1 leader last=1@1 commit=1@"
},
{
"path": "src/raft/testscripts/node/heartbeat_converts_candidate",
"chars": 1615,
"preview": "# A heartbeat from a leader should convert a candidate in the same term to a\n# follower.\n\ncluster nodes=3\n---\nn1@0 follo"
},
{
"path": "src/raft/testscripts/node/heartbeat_converts_follower",
"chars": 1626,
"preview": "# A heartbeat from a leader should convert a follower of a different leader in a\n# past term to a follower.\n\ncluster nod"
},
{
"path": "src/raft/testscripts/node/heartbeat_converts_follower_leaderless",
"chars": 1205,
"preview": "# A heartbeat from a leader should convert a leaderless follower.\n\ncluster nodes=3\n---\nn1@0 follower() last=0@0 commit=0"
},
{
"path": "src/raft/testscripts/node/heartbeat_converts_leader",
"chars": 1613,
"preview": "# A heartbeat from a leader should convert a leader in a past term to a\n# follower.\n\ncluster nodes=3 leader=3\n---\nn1@1 f"
},
{
"path": "src/raft/testscripts/node/heartbeat_lost_append_duplicate",
"chars": 2462,
"preview": "# Duplicate heartbeats and responses with a lost append will\n# trigger duplicate resends, but it will eventually resolve"
},
{
"path": "src/raft/testscripts/node/heartbeat_lost_append_multiple",
"chars": 2824,
"preview": "# A heartbeat response triggers a probe and resend of lost appends.\n\ncluster nodes=3 leader=1\n---\nn1@1 leader last=1@1 c"
},
{
"path": "src/raft/testscripts/node/heartbeat_lost_append_single",
"chars": 1657,
"preview": "# A heartbeat response triggers a resend of a lost append.\n\ncluster nodes=3 leader=1\n---\nn1@1 leader last=1@1 commit=1@1"
},
{
"path": "src/raft/testscripts/node/heartbeat_lost_read",
"chars": 1509,
"preview": "# Heartbeats will recover from a lost read message.\n\ncluster nodes=5 leader=1\n---\nn1@1 leader last=1@1 commit=1@1 applie"
},
{
"path": "src/raft/testscripts/node/heartbeat_match_commits",
"chars": 1962,
"preview": "# A heartbeat response can advance a follower match index and commit+apply.\n\ncluster nodes=3 leader=1\n---\nn1@1 leader la"
},
{
"path": "src/raft/testscripts/node/heartbeat_multiple_leaders_panic",
"chars": 732,
"preview": "# A heartbeat will panic if there are multiple leaders in a term.\n\ncluster nodes=3 leader=1\n---\nn1@1 leader last=1@1 com"
},
{
"path": "src/raft/testscripts/node/heartbeat_old_commit_index",
"chars": 737,
"preview": "# A heartbeat with an old commit index is ignored by a follower.\n\ncluster nodes=3 leader=1\n---\nn1@1 leader last=1@1 comm"
},
{
"path": "src/raft/testscripts/node/heartbeat_old_last_index",
"chars": 733,
"preview": "# A heartbeat with an old last index is matched by a follower.\n\ncluster nodes=3 leader=1\n---\nn1@1 leader last=1@1 commit"
},
{
"path": "src/raft/testscripts/node/heartbeat_probe_divergent",
"chars": 5188,
"preview": "# A heartbeat while the leader is probing a follower with a long divergent tail\n# doesn't disrupt the probing, and won't"
},
{
"path": "src/raft/testscripts/node/old_campaign_rejected",
"chars": 1719,
"preview": "# Old campaign messages (in the same term) are ignored by leaders and followers\n# once a leader is elected.\n\ncluster nod"
},
{
"path": "src/raft/testscripts/node/old_campaign_response_ignored",
"chars": 3321,
"preview": "# Old campaign responses (in the same term) are ignored by leaders and followers\n# once a leader is elected.\n\ncluster no"
},
{
"path": "src/raft/testscripts/node/old_heartbeat_ignored",
"chars": 1018,
"preview": "# A heartbeat from an old leader should be ignored.\n\n# Make n3 leader.\ncluster nodes=3 leader=3\n---\nn1@1 follower(n3) la"
},
{
"path": "src/raft/testscripts/node/request_candidate_abort",
"chars": 751,
"preview": "# Client read/write requests fail on candidates.\n\ncluster nodes=3\n---\nn1@0 follower() last=0@0 commit=0@0 applied=0\nn2@0"
},
{
"path": "src/raft/testscripts/node/request_follower",
"chars": 1603,
"preview": "# Client read/write requests are proxied by followers.\n\ncluster nodes=3 leader=1\n---\nn1@1 leader last=1@1 commit=1@1 app"
},
{
"path": "src/raft/testscripts/node/request_follower_campaign_abort",
"chars": 1043,
"preview": "# A follower aborts in-flight requests when it steps down.\n\ncluster nodes=3 leader=1\n---\nn1@1 leader last=1@1 commit=1@1"
},
{
"path": "src/raft/testscripts/node/request_follower_disconnect_stall",
"chars": 1187,
"preview": "# Client read/write requests stall if the follower is disconnected from the\n# leader when the request is submitted. They"
},
{
"path": "src/raft/testscripts/node/request_follower_leaderless_abort",
"chars": 640,
"preview": "# Client read/write requests fail on leaderless followers.\n\ncluster nodes=3\n---\nn1@0 follower() last=0@0 commit=0@0 appl"
},
{
"path": "src/raft/testscripts/node/request_leader",
"chars": 1290,
"preview": "# Client read/write requests succeed on leaders.\n\ncluster nodes=3 leader=1\n---\nn1@1 leader last=1@1 commit=1@1 applied=1"
},
{
"path": "src/raft/testscripts/node/request_leader_campaign_abort",
"chars": 1061,
"preview": "# A leader aborts in-flight requests when it steps down.\n\ncluster nodes=3 leader=1\n---\nn1@1 leader last=1@1 commit=1@1 a"
},
{
"path": "src/raft/testscripts/node/request_leader_change_linearizability",
"chars": 3125,
"preview": "# A new leader that's behind on commit/apply shouldn't serve stale reads.\n\ncluster nodes=3 leader=1\n---\nn1@1 leader last"
},
{
"path": "src/raft/testscripts/node/request_leader_disconnect",
"chars": 1547,
"preview": "# Client read/write requests succeed if the leader is disconnected from the\n# quorum when the request is submitted but i"
},
{
"path": "src/raft/testscripts/node/request_leader_read_quorum",
"chars": 948,
"preview": "# Client read requests are only processed once a quorum confirms the read sequence.\n\ncluster nodes=5 leader=1\n---\nn1@1 l"
},
{
"path": "src/raft/testscripts/node/request_leader_read_quorum_sequence",
"chars": 2265,
"preview": "# Client read requests are only served once a quorum confirm the read sequence\n# number, including higher sequence numbe"
},
{
"path": "src/raft/testscripts/node/request_leader_single",
"chars": 770,
"preview": "# Client read/write requests succeed on a lone leader.\n\ncluster nodes=1\n---\nn1@1 leader last=1@1 commit=1@1 applied=1 pr"
},
{
"path": "src/raft/testscripts/node/request_status",
"chars": 2140,
"preview": "# Status requests return the cluster status.\n\ncluster nodes=3 leader=1\n---\nn1@1 leader last=1@1 commit=1@1 applied=1 pro"
},
{
"path": "src/raft/testscripts/node/request_status_single",
"chars": 857,
"preview": "# Status requests return the cluster status on a single node.\n\ncluster nodes=1\n---\nn1@1 leader last=1@1 commit=1@1 appli"
},
{
"path": "src/raft/testscripts/node/restart",
"chars": 3080,
"preview": "# Restarting a cluster that's fully caught up retains the existing state and\n# allows trivially electing a new leader.\n\n"
},
{
"path": "src/raft/testscripts/node/restart_apply",
"chars": 880,
"preview": "# Restarting a node and wiping its state machine will reapply the state.\n\ncluster nodes=3 leader=1\n---\nn1@1 leader last="
},
{
"path": "src/raft/testscripts/node/restart_commit_recover",
"chars": 2152,
"preview": "# Restarting the cluster and wiping the commit indexes allows\n# a new leader to recover the commit index.\n\ncluster nodes"
},
{
"path": "src/raft/testscripts/node/restart_term_vote",
"chars": 1052,
"preview": "# The term/vote is retained across a restart.\n\ncluster nodes=3\n---\nn1@0 follower() last=0@0 commit=0@0 applied=0\nn2@0 fo"
},
{
"path": "src/raft/testscripts/node/tick_candidate",
"chars": 731,
"preview": "# Ticking a candidate will eventually hold a new election in a later term.\n\ncluster nodes=3 heartbeat_interval=1 electio"
},
{
"path": "src/raft/testscripts/node/tick_follower",
"chars": 1080,
"preview": "# Ticking a follower will transition it to candidate if it hasn't\n# heard from the leader in a while.\n\ncluster nodes=3 l"
},
{
"path": "src/raft/testscripts/node/tick_follower_leaderless",
"chars": 613,
"preview": "# Ticking a leaderless follower will eventually transition it to candidate.\n\ncluster nodes=3 heartbeat_interval=1 electi"
},
{
"path": "src/raft/testscripts/node/tick_leader",
"chars": 982,
"preview": "# Ticking a leader should cause it to emit heartbeats, even when it doesn't\n# hear back from any followers.\n\ncluster nod"
},
{
"path": "src/server.rs",
"chars": 13318,
"preview": "use std::collections::HashMap;\nuse std::io::{BufReader, BufWriter, Write as _};\nuse std::net::{TcpListener, TcpStream, T"
},
{
"path": "src/sql/engine/engine.rs",
"chars": 3732,
"preview": "use std::collections::{BTreeMap, BTreeSet};\n\nuse crate::errinput;\nuse crate::error::Result;\nuse crate::sql::execution::S"
},
{
"path": "src/sql/engine/local.rs",
"chars": 15487,
"preview": "use std::borrow::Cow;\nuse std::collections::{BTreeMap, BTreeSet};\nuse std::slice;\n\nuse itertools::Itertools as _;\nuse se"
},
{
"path": "src/sql/engine/mod.rs",
"chars": 440,
"preview": "//! The SQL engine provides SQL data storage and access, as well as session and\n//! transaction management. The `Local` "
},
{
"path": "src/sql/engine/raft.rs",
"chars": 15188,
"preview": "use std::borrow::Cow;\nuse std::collections::{BTreeMap, BTreeSet};\n\nuse crossbeam::channel::Sender;\nuse serde::de::Deseri"
},
{
"path": "src/sql/execution/aggregator.rs",
"chars": 5036,
"preview": "use std::collections::BTreeMap;\n\nuse itertools::Itertools as _;\n\nuse crate::error::Result;\nuse crate::sql::planner::Aggr"
},
{
"path": "src/sql/execution/executor.rs",
"chars": 13555,
"preview": "use std::cmp::Ordering;\nuse std::collections::{BTreeMap, HashMap};\n\nuse itertools::{Itertools as _, izip};\n\nuse super::a"
},
{
"path": "src/sql/execution/join.rs",
"chars": 6755,
"preview": "use std::collections::HashMap;\nuse std::iter::Peekable;\n\nuse crate::errinput;\nuse crate::error::Result;\nuse crate::sql::"
},
{
"path": "src/sql/execution/mod.rs",
"chars": 182,
"preview": "//! Executes statements and plans.\n\nmod aggregator;\nmod executor;\nmod join;\nmod session;\n\npub use executor::{ExecutionRe"
},
{
"path": "src/sql/execution/session.rs",
"chars": 8626,
"preview": "use itertools::Itertools as _;\nuse log::error;\nuse serde::{Deserialize, Serialize};\n\nuse crate::error::{Error, Result};\n"
},
{
"path": "src/sql/mod.rs",
"chars": 14309,
"preview": "//! Implements a SQL execution engine. A SQL statement flows through the engine\n//! as follows:\n//!\n//! 1. The `toySQL` "
},
{
"path": "src/sql/parser/ast.rs",
"chars": 10951,
"preview": "use std::collections::BTreeMap;\nuse std::hash::{Hash, Hasher};\n\nuse crate::sql::types::DataType;\n\n/// SQL statements are"
},
{
"path": "src/sql/parser/lexer.rs",
"chars": 16075,
"preview": "use std::fmt::Display;\nuse std::iter::Peekable;\nuse std::str::Chars;\n\nuse crate::errinput;\nuse crate::error::Result;\n\n//"
},
{
"path": "src/sql/parser/mod.rs",
"chars": 179,
"preview": "//! Parses raw SQL strings into a structured Abstract Syntax Tree.\n\npub mod ast;\nmod lexer;\nmod parser;\n\npub use lexer::"
},
{
"path": "src/sql/parser/parser.rs",
"chars": 35586,
"preview": "use std::iter::Peekable;\nuse std::ops::Add;\n\nuse super::{Keyword, Lexer, Token, ast};\nuse crate::errinput;\nuse crate::er"
},
{
"path": "src/sql/planner/mod.rs",
"chars": 307,
"preview": "//! The planner builds and optimizes an execution plan based on a SQL\n//! statement's Abstract Syntax Tree (AST) generat"
},
{
"path": "src/sql/planner/optimizer.rs",
"chars": 17699,
"preview": "use std::collections::HashMap;\nuse std::fmt::Debug;\nuse std::sync::LazyLock;\n\nuse super::Node;\nuse crate::error::Result;"
},
{
"path": "src/sql/planner/plan.rs",
"chars": 26956,
"preview": "use std::collections::HashMap;\nuse std::fmt::Display;\n\nuse itertools::Itertools as _;\nuse serde::{Deserialize, Serialize"
},
{
"path": "src/sql/planner/planner.rs",
"chars": 36516,
"preview": "use std::collections::{BTreeMap, HashMap, HashSet};\n\nuse itertools::{Either, Itertools as _};\n\nuse super::plan::{Aggrega"
},
{
"path": "src/sql/testscripts/expressions/cnf",
"chars": 844,
"preview": "# Tests conversion of logical expressions into canonical normal form.\n\n# Noop for non-boolean expressions.\n[cnf]> 1 + 2\n"
},
{
"path": "src/sql/testscripts/expressions/func",
"chars": 771,
"preview": "# Tests function calls.\n\n# Function names are case-insensitive.\n> sqrt(1)\n> SQRT(1)\n---\n1.0\n1.0\n\n# A space is allowed ar"
},
{
"path": "src/sql/testscripts/expressions/func_sqrt",
"chars": 799,
"preview": "# Tests sqrt().\n\n# Integers work, and return floats.\n[expr]> sqrt(2)\n[expr]> sqrt(100)\n---\n1.4142135623730951 ← SquareRo"
},
{
"path": "src/sql/testscripts/expressions/literals",
"chars": 1708,
"preview": "# Tests parsing and evaluation of literals and constants.\n\n# Boolean and float constants.\ntrue\nfalse\nnull\ninfinity\nnan\n-"
},
{
"path": "src/sql/testscripts/expressions/op_compare_equal",
"chars": 837,
"preview": "# Tests the = equality operator.\n\n# Booleans.\n> TRUE = TRUE\n> TRUE = FALSE\n> FALSE = TRUE\n---\nTRUE\nFALSE\nFALSE\n\n# Intege"
},
{
"path": "src/sql/testscripts/expressions/op_compare_greater",
"chars": 1346,
"preview": "# Tests the > greater than operator.\n\n# Booleans.\n> TRUE > FALSE\n> FALSE > TRUE\n> TRUE > TRUE\n> FALSE > FALSE\n---\nTRUE\nF"
},
{
"path": "src/sql/testscripts/expressions/op_compare_greater_equal",
"chars": 1348,
"preview": "# Tests the >= greater than operator.\n\n# This is implemented as > OR =, just verify this for a few basic cases.\n\n[expr]>"
},
{
"path": "src/sql/testscripts/expressions/op_compare_is_nan",
"chars": 383,
"preview": "# Tests the IS NAN equality operator.\n\n> 0.0 IS NAN\n> NAN IS NAN\n> NULL IS NAN\n---\nFALSE\nTRUE\nNULL\n\n!> FALSE IS NAN\n!> 0"
},
{
"path": "src/sql/testscripts/expressions/op_compare_is_null",
"chars": 186,
"preview": "# Tests the IS NULL equality operator.\n\n> FALSE IS NULL\n> 0 IS NULL\n> 0.0 IS NULL\n> '' IS NULL\n> 'null' IS NULL\n> NAN IS"
},
{
"path": "src/sql/testscripts/expressions/op_compare_lesser",
"chars": 1345,
"preview": "# Tests the < less than operator.\n\n# Booleans.\n> FALSE < TRUE\n> TRUE < FALSE\n> TRUE < TRUE\n> FALSE < FALSE\n---\nTRUE\nFALS"
},
{
"path": "src/sql/testscripts/expressions/op_compare_lesser_equal",
"chars": 1327,
"preview": "# Tests the <= less than or equal operator.\n\n# This is implemented as < OR =, just verify this for a few basic cases.\n\n["
},
{
"path": "src/sql/testscripts/expressions/op_compare_not_equal",
"chars": 772,
"preview": "# Tests the != inequality operator.\n\n# != is a combination of NOT and =, just verify that for a few basic cases.\n\n[expr]"
},
{
"path": "src/sql/testscripts/expressions/op_logic_and",
"chars": 862,
"preview": "# Tests the AND logical operator.\n\n# Basic truth table.\n> TRUE AND TRUE\n> TRUE AND FALSE\n> FALSE AND TRUE\n> FALSE AND FA"
},
{
"path": "src/sql/testscripts/expressions/op_logic_not",
"chars": 255,
"preview": "# Tests the NOT logical operator.\n\n> NOT TRUE\n> NOT FALSE\n> NOT NULL\n---\nFALSE\nTRUE\nNULL\n\n# Non-booleans.\n!> NOT 1\n!> NO"
},
{
"path": "src/sql/testscripts/expressions/op_logic_or",
"chars": 830,
"preview": "# Tests the OR logical operator.\n\n# Basic truth table.\n> TRUE OR TRUE\n> TRUE OR FALSE\n> FALSE OR TRUE\n> FALSE OR FALSE\n-"
},
{
"path": "src/sql/testscripts/expressions/op_math_add",
"chars": 1301,
"preview": "# Tests the + addition operator.\n\n# Simple integer addition.\n[expr]> 1 + 2\n[expr]> 1 + -3\n[expr]> 1 + -2 + 3\n---\n3 ← Add"
},
{
"path": "src/sql/testscripts/expressions/op_math_divide",
"chars": 1460,
"preview": "# Tests the / division operator.\n\n# Integers.\n[expr]> 9 / 3\n[expr]> 8 / 3\n[expr]> 8 / -3\n---\n3 ← Divide(Constant(Integer"
},
{
"path": "src/sql/testscripts/expressions/op_math_exponentiate",
"chars": 1521,
"preview": "# Tests the ^ exponentiation operator.\n\n# Integers.\n[expr]> 2 ^ 3\n[expr]> 2 ^ 0\n[expr]> 0 ^ 2\n[expr]> 9 ^ -3\n---\n8 ← Exp"
},
{
"path": "src/sql/testscripts/expressions/op_math_factorial",
"chars": 964,
"preview": "# Tests the ! factorial suffix operator.\n\n# Integer works.\n[expr]> 3!\n---\n6 ← Factorial(Constant(Integer(3)))\n\n# But flo"
},
{
"path": "src/sql/testscripts/expressions/op_math_identity",
"chars": 499,
"preview": "# Tests the + identity prefix operator.\n\n# Integer and float works.\n[expr]> +1\n[expr]> +3.14\n---\n1 ← Identity(Constant(I"
},
{
"path": "src/sql/testscripts/expressions/op_math_multiply",
"chars": 1172,
"preview": "# Tests the * multiplication operator.\n\n# Integers.\n[expr]> 2 * 3\n[expr]> 2 * -3\n---\n6 ← Multiply(Constant(Integer(2)), "
},
{
"path": "src/sql/testscripts/expressions/op_math_negate",
"chars": 536,
"preview": "# Tests the - negation prefix operator.\n\n# Integer and float works.\n[expr]> -1\n[expr]> -3.14\n---\n-1 ← Negate(Constant(In"
},
{
"path": "src/sql/testscripts/expressions/op_math_remainder",
"chars": 1180,
"preview": "# Tests the % remainder operator.\n#\n# Note that remainder is not the same as modulo: the former has the sign of the\n# di"
},
{
"path": "src/sql/testscripts/expressions/op_math_subtract",
"chars": 1386,
"preview": "# Tests the - subtraction operator.\n\n# Simple integer subtraction.\n[expr]> 2 - 1\n[expr]> 2 - 3\n[expr]> 1 - -3 - 2\n---\n1 "
},
{
"path": "src/sql/testscripts/expressions/op_precedence",
"chars": 6584,
"preview": "# Tests operator precedence. Test each precedence level against the operators\n# beside and immediately below it, in orde"
},
{
"path": "src/sql/testscripts/expressions/op_string_like",
"chars": 1981,
"preview": "# Tests the LIKE string pattern matching operator.\n\n# Multi-character matches.\n> 'abcde' LIKE 'a%e'\n> 'abcde' LIKE 'abc%"
},
{
"path": "src/sql/testscripts/optimizers/constant_folder",
"chars": 2970,
"preview": "# Tests the constant folding optimizer.\n\n> CREATE TABLE test (id INT PRIMARY KEY, value STRING)\n> INSERT INTO test VALUE"
},
{
"path": "src/sql/testscripts/optimizers/filter_pushdown",
"chars": 6294,
"preview": "# Tests filter pushdown.\n\n> CREATE TABLE test (id INT PRIMARY KEY, value STRING)\n> INSERT INTO test VALUES (1, 'a'), (2,"
},
{
"path": "src/sql/testscripts/optimizers/hash_join",
"chars": 2120,
"preview": "# Tests the switch to hash joins where appropriate.\n\n> CREATE TABLE test (id INT PRIMARY KEY, value STRING)\n> INSERT INT"
},
{
"path": "src/sql/testscripts/optimizers/index_lookup",
"chars": 7630,
"preview": "# Tests the index_lookup optimizer.\n\n> CREATE TABLE test (id INT PRIMARY KEY, value STRING INDEX, \"float\" FLOAT INDEX)\n>"
},
{
"path": "src/sql/testscripts/optimizers/short_circuit",
"chars": 4368,
"preview": "# Tests the short circuiting optimizer.\n\n> CREATE TABLE test (id INT PRIMARY KEY, value STRING)\n> INSERT INTO test VALUE"
},
{
"path": "src/sql/testscripts/queries/aggregate",
"chars": 6372,
"preview": "# Tests aggregate functions.\n\n> CREATE TABLE test ( \\\n id INT PRIMARY KEY, \\\n \"bool\" BOOLEAN, \\\n \"int\" INTEGER,"
},
{
"path": "src/sql/testscripts/queries/clauses",
"chars": 2143,
"preview": "# Tests the ordering of SELECT clauses.\n\n> CREATE TABLE test ( \\\n id INT PRIMARY KEY, \\\n \"bool\" BOOLEAN, \\\n \"fl"
},
{
"path": "src/sql/testscripts/queries/group_by",
"chars": 6372,
"preview": "# Tests GROUP BY clauses. See \"aggregate\" for aggregate function tests.\n\n> CREATE TABLE test ( \\\n id INT PRIMARY KEY,"
},
{
"path": "src/sql/testscripts/queries/having",
"chars": 3857,
"preview": "# Tests HAVING clauses. See \"aggregate\" and \"group_by\" for related tests.\n\n> CREATE TABLE test ( \\\n id INT PRIMARY KE"
},
{
"path": "src/sql/testscripts/queries/join_cross",
"chars": 43164,
"preview": "# Tests cross joins.\n\n# Set up a movies dataset.\n> CREATE TABLE countries ( \\\n id STRING PRIMARY KEY, \\\n name STRI"
},
{
"path": "src/sql/testscripts/queries/join_inner",
"chars": 13080,
"preview": "# Tests inner joins.\n\n# Set up a movies dataset.\n> CREATE TABLE countries ( \\\n id STRING PRIMARY KEY, \\\n name STRI"
},
{
"path": "src/sql/testscripts/queries/join_outer",
"chars": 8208,
"preview": "# Tests left/right outer joins.\n\n# Set up a movies dataset.\n> CREATE TABLE countries ( \\\n id STRING PRIMARY KEY, \\\n "
},
{
"path": "src/sql/testscripts/queries/limit",
"chars": 1587,
"preview": "# Tests LIMIT clauses.\n\n> CREATE TABLE test (id INT PRIMARY KEY, value STRING)\n> INSERT INTO test VALUES (1, 'a'), (2, '"
},
{
"path": "src/sql/testscripts/queries/offset",
"chars": 1648,
"preview": "# Tests OFFSET clauses.\n\n> CREATE TABLE test (id INT PRIMARY KEY, value STRING)\n> INSERT INTO test VALUES (1, 'a'), (2, "
},
{
"path": "src/sql/testscripts/queries/order",
"chars": 10901,
"preview": "# Tests ORDER BY clauses.\n\n# Create a table with representative values of all types.\n> CREATE TABLE test ( \\\n id INT "
},
{
"path": "src/sql/testscripts/queries/select",
"chars": 5861,
"preview": "# Tests the SELECT part of queries.\n\n# Create a basic test table, and a secondary table for join column lookups.\n> CREAT"
},
{
"path": "src/sql/testscripts/queries/where_",
"chars": 2751,
"preview": "# Tests basic WHERE clauses.\n\n> CREATE TABLE test (id INT PRIMARY KEY, value STRING)\n> INSERT INTO test VALUES (1, 'a'),"
},
{
"path": "src/sql/testscripts/queries/where_index",
"chars": 3912,
"preview": "# Tests WHERE index lookups.\n\n# Create a table with representative values of all types.\n> CREATE TABLE test ( \\\n id I"
},
{
"path": "src/sql/testscripts/queries/where_primary_key",
"chars": 2667,
"preview": "# Tests WHERE index lookups.\n\n# Boolean lookups.\n> CREATE TABLE \"bool\" (id BOOL PRIMARY KEY)\n> INSERT INTO \"bool\" VALUES"
},
{
"path": "src/sql/testscripts/schema/create_table",
"chars": 1930,
"preview": "# Tests basic CREATE TABLE functionality.\n\n# The result contains the table name. The table is written to storage. Also\n#"
},
{
"path": "src/sql/testscripts/schema/create_table_datatypes",
"chars": 1016,
"preview": "# Tests CREATE TABLE datatypes.\n\n# Create columns with all datatypes.\n> CREATE TABLE datatypes ( \\\n id INTEGER PRIMAR"
},
{
"path": "src/sql/testscripts/schema/create_table_default",
"chars": 1791,
"preview": "# Tests column defaults.\n\n# All datatypes.\n> CREATE TABLE datatypes ( \\\n id INT PRIMARY KEY, \\\n \"bool\" BOOLEAN DEF"
},
{
"path": "src/sql/testscripts/schema/create_table_index",
"chars": 1430,
"preview": "# Creating a table with an index only results in a single schema entry (no\n# separate index).\n[ops]> CREATE TABLE indexe"
},
{
"path": "src/sql/testscripts/schema/create_table_names",
"chars": 1899,
"preview": "# Tests CREATE TABLE table and column name validation.\n\n# A couple of valid names.\n> CREATE TABLE a_123 (a_123 INTEGER P"
},
{
"path": "src/sql/testscripts/schema/create_table_null",
"chars": 861,
"preview": "# Tests column nullability.\n\n# All datatypes can be nullable. Their default value is NULL.\n> CREATE TABLE datatypes ( \\\n"
},
{
"path": "src/sql/testscripts/schema/create_table_primary_key",
"chars": 1033,
"preview": "# Tests primary keys.\n\n# There must be exactly one primary key.\n!> CREATE TABLE \"primary\" (id INTEGER)\n!> CREATE TABLE \""
},
{
"path": "src/sql/testscripts/schema/create_table_reference",
"chars": 4102,
"preview": "# Tests foreign key references during CREATE TABLE.\n\n# Create two reference tables, with int/string primary keys.\n> CREA"
},
{
"path": "src/sql/testscripts/schema/create_table_transaction",
"chars": 2166,
"preview": "# Tests that CREATE TABLE is transactional.\n\n> BEGIN\n[ops]> CREATE TABLE name (id INT PRIMARY KEY, value STRING)\n---\nset"
},
{
"path": "src/sql/testscripts/schema/create_table_unique",
"chars": 1676,
"preview": "# Creating a table with a unique index only results in a single schema entry (no\n# separate index).\n[ops]> CREATE TABLE "
},
{
"path": "src/sql/testscripts/schema/drop_table",
"chars": 3829,
"preview": "# Basic DROP TABLE tests.\n\n> CREATE TABLE name (id INT PRIMARY KEY, value STRING NOT NULL)\n> INSERT INTO name VALUES (1,"
},
{
"path": "src/sql/testscripts/schema/drop_table_index",
"chars": 18181,
"preview": "# Tests that DROP TABLE cleans up secondary indexes of all kinds.\n\n> CREATE TABLE \"ref\" (id INT PRIMARY KEY, value STRIN"
},
{
"path": "src/sql/testscripts/schema/drop_table_ref",
"chars": 626,
"preview": "# Tests DROP TABLE with references.\n\n# Create a reference table and foreign key table.\n> CREATE TABLE \"ref\" (id INT PRIM"
}
]
// ... and 84 more files (download for full content)
About this extraction
This page contains the full source code of the erikgrinaker/toydb GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 284 files (1.4 MB), approximately 475.6k tokens, and a symbol index with 962 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.