Repository: elastic/elasticsearch-formal-models
Branch: master
Commit: ca30663506a7
Files: 28
Total size: 503.0 KB
Directory structure:
gitextract_nf10lb83/
├── .gitignore
├── LICENSE
├── README.md
├── ReplicaEngine/
│ └── tla/
│ ├── ReplicaEngine.tla
│ └── ReplicaEngine.toolbox/
│ ├── .project
│ ├── .settings/
│ │ └── org.lamport.tla.toolbox.prefs
│ └── ReplicaEngine___model.launch
├── Storage/
│ └── tla/
│ ├── Storage.tla
│ └── Storage.toolbox/
│ └── Storage___model.launch
├── ZenWithTerms/
│ └── tla/
│ ├── ZenWithTerms.tla
│ └── ZenWithTerms.toolbox/
│ ├── .project
│ ├── .settings/
│ │ └── org.lamport.tla.toolbox.prefs
│ └── ZenWithTerms___model.launch
├── cluster/
│ ├── isabelle/
│ │ ├── Implementation.thy
│ │ ├── Monadic.thy
│ │ ├── OneSlot.thy
│ │ ├── Preliminaries.thy
│ │ ├── ROOT
│ │ ├── Zen.thy
│ │ └── document/
│ │ └── root.tex
│ └── tla/
│ ├── consensus.tla
│ └── consensus.toolbox/
│ ├── .project
│ ├── .settings/
│ │ └── org.lamport.tla.toolbox.prefs
│ └── consensus___model.launch
└── data/
└── tla/
├── replication.tla
└── replication.toolbox/
├── .project
├── .settings/
│ └── org.lamport.tla.toolbox.prefs
└── replication___model.launch
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitignore
================================================
**/.DS_Store
**/tla/*.toolbox/model
**/tla/*.toolbox/*aux
**/tla/*.toolbox/*.log
**/tla/*.toolbox/*.pdf
**/tla/*.toolbox/*.tex
**/tla/*.toolbox/*___model_SnapShot*.launch
**/tla/*.toolbox/**/*.tla
**/tla/*.toolbox/**/*.out
**/tla/*.toolbox/**/MC.cfg
**/tla/*.pdf
**/tla/*.old
**/*~
cluster/isabelle/output
================================================
FILE: LICENSE
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: README.md
================================================
# Formal models of core Elasticsearch algorithms
This repository contains formal models of core [Elasticsearch](https://github.com/elastic/elasticsearch) algorithms and is directly related to implementation efforts around [data replication](https://github.com/elastic/elasticsearch/issues/10708) and [cluster coordination](https://github.com/elastic/elasticsearch/issues/32006). The models in this repository might represent past, current and future designs of Elasticsearch and can differ from their implementations in substantial ways. The formal models mainly serve to illustrate some of the high-level concepts and help to validate resiliency-related aspects.
## Models
### Cluster coordination model
The cluster coordination TLA+ model ensures the consistency of cluster state updates and represents the core [cluster coordination](https://github.com/elastic/elasticsearch/issues/32006) and metadata replication algorithm implemented in Elasticsearch 7.0. It consists of two files:
- [TLA+ specification](ZenWithTerms/tla/ZenWithTerms.tla) which has a [direct one-to-one implementation in Elasticsearch](https://github.com/elastic/elasticsearch/blob/master/server/src/main/java/org/elasticsearch/cluster/coordination/CoordinationState.java)
- [TLC model checking configuration](ZenWithTerms/tla/ZenWithTerms.toolbox/ZenWithTerms___model.launch)
### Data replication model
The data replication TLA+ model describes the Elasticsearch [sequence number](https://github.com/elastic/elasticsearch/issues/10708) based data replication approach, implemented since Elasticsearch 6.0, which consists of two files:
- [TLA+ specification](data/tla/replication.tla)
- [TLC model checking configuration](data/tla/replication.toolbox/replication___model.launch)
### Replica engine
A TLA+ model of how the
[engine](https://github.com/elastic/elasticsearch/blob/00fd73acc4a2991f96438f8c1948016c5b9eefb2/server/src/main/java/org/elasticsearch/index/engine/InternalEngine.java)
handles replication requests.
- [TLA+ specification](ReplicaEngine/tla/ReplicaEngine.tla)
- [TLC model checking configuration](ReplicaEngine/tla/ReplicaEngine.toolbox/ReplicaEngine___model.launch)
### Alternative cluster coordination model
The alternative cluster coordination TLA+ model consists of two files:
- [TLA+ specification](cluster/tla/consensus.tla)
- [TLC model checking configuration](cluster/tla/consensus.toolbox/consensus___model.launch)
The alternative cluster consensus Isabelle model consists of the following theories:
- [Basic definitions](cluster/isabelle/Preliminaries.thy)
- [An implementation in functional style](cluster/isabelle/Implementation.thy)
- [An implementation in monadic style, along with a proof it's equivalent to the previous](cluster/isabelle/Monadic.thy)
- [The proof that each slot is consistent, based on Lamport's Synod algorithm](cluster/isabelle/OneSlot.thy)
- [The proof that the implementation ensures consistency](cluster/isabelle/Zen.thy)
## How to edit/run TLA+:
- Install the [TLA Toolbox](http://research.microsoft.com/en-us/um/people/lamport/tla/toolbox.html)
- If on Mac OS, [move the downloaded app to the Applications folder first](https://groups.google.com/forum/#!topic/tlaplus/bL04c6BiYxo)
- Read some [documentation](http://research.microsoft.com/en-us/um/people/lamport/tla/book.html)
How to run the model checker in headless mode:
- Download [tla2tools.jar](http://research.microsoft.com/en-us/um/people/lamport/tla/tools.html)
- Run the model checker once in TLA+ Toolbox on desktop (can be aborted once started). This generates a `model/` folder inside the spec's `.toolbox` directory (e.g. `ZenWithTerms/tla/ZenWithTerms.toolbox/model/`) that contains all model files that are required to run the model checker in headless mode.
- Copy the above folder and `tla2tools.jar` to the server running in headless mode.
- `cd` to the folder and run `java -Xmx30G -cp ../tla2tools.jar tlc2.TLC MC -deadlock -workers 12`. The setting `-Xmx30G` denotes the amount of memory to allocate to the model checker and `-workers 12` the number of worker threads (should be equal to the number of cores on the machine). The `-deadlock` flag disables deadlock checking, so TLC explores the full reachable state space instead of stopping at states with no outgoing transitions.
================================================
FILE: ReplicaEngine/tla/ReplicaEngine.tla
================================================
-------------------------- MODULE ReplicaEngine --------------------------
EXTENDS Naturals, FiniteSets, Sequences, TLC
(* Actions on the Lucene index *)
CONSTANTS Lucene_addDocuments, Lucene_updateDocuments, Lucene_deleteDocuments
CONSTANTS ADD, RETRY_ADD, UPDATE, DELETE, NULL
CONSTANTS DocContent
CONSTANTS DocAutoIdTimestamp
CONSTANTS DuplicationLimit
(* We model the activity of a single document, since distinct documents
(according to their IDs) are independent. Also each indexing operation
occurs under a lock for that document ID, so there is not much concurrency
to consider. *)
(* The set of individual requests that can occur on the document *)
(* Request(request_count) is the set of every individual request that can
   target the modelled document, with seqnos drawn from 1..request_count. *)
Request(request_count)
(* ADD: An optimised append-only write can only occur as the first operation
on the document ID in seqno order. Any subsequent attempts to ADD the
document have the retry flag set and modelled as a RETRY_ADD. Other operations
on the document are also possible. *)
== [type : {ADD}, seqno : {1}, content : DocContent, autoIdTimeStamp : {DocAutoIdTimestamp}]
(* RETRY_ADD: A retry of an append-only write, i.e. one that does involve an
internally-generated document ID. Unlike a fresh ADD, a retry may be
assigned any seqno. *)
\cup [type : {RETRY_ADD}, seqno : 1..request_count, content : DocContent, autoIdTimeStamp : {DocAutoIdTimestamp}]
(* UPDATE: A write that does not involve an internally-generated document ID. *)
\cup [type : {UPDATE}, seqno : 1..request_count, content : DocContent]
(* DELETE: a deletion of the document; carries no content. *)
\cup [type : {DELETE}, seqno : 1..request_count]
(* The set of sets of requests, which have distinct seqnos. Each element of
   RequestSet(request_count) is one candidate workload for the model:
   exactly request_count requests covering the seqnos 1..request_count. *)
RequestSet(request_count)
== { rs \in SUBSET Request(request_count):
/\ Cardinality(rs) = request_count
/\ Cardinality({r.seqno : r \in rs}) = request_count \* seqnos pairwise distinct, hence exactly 1..request_count
/\ (* Also ADDs and RETRY_ADDs should have the same content *)
Cardinality({r.content: r \in { r \in rs: r.type \in {ADD, RETRY_ADD}}}) <= 1
}
(* Apply a set of operations to a document in seqno order.
   requests       - set of request records with pairwise-distinct seqnos
   nextSeqno      - next seqno to apply (all lower seqnos already applied)
   currentContent - document content accumulated so far (NULL if absent)
   Fails the model with "Bad sequence" if the seqnos contain a gap. *)
RECURSIVE ApplyOps(_, _, _)
ApplyOps(requests, nextSeqno, currentContent)
== IF \A r \in requests: r.seqno < nextSeqno
THEN currentContent \* every request has been applied
ELSE LET r == CHOOSE r \in requests: r.seqno = nextSeqno
\* The re-check guards against CHOOSE yielding an arbitrary value when
\* no request actually has seqno = nextSeqno (i.e. a gap in the seqnos).
IN IF r \in requests /\ r.seqno = nextSeqno
THEN ApplyOps(requests, nextSeqno + 1,
CASE r.type = DELETE -> NULL
[] r.type = ADD -> r.content
[] r.type = RETRY_ADD -> r.content
[] r.type = UPDATE -> r.content)
ELSE Assert(FALSE, "Bad sequence")
(* Calculate the final doc by applying all the requests in order. This is the
   content the replica must converge to (NULL if the last operation was a DELETE). *)
FinalDoc(requests) == ApplyOps(requests, 1, NULL)
(* Apply each of the operations in the Lucene buffer, rejecting an
addDocuments when there is already a document present as this
would lead to duplication.
   buffer  - sequence of buffered Lucene operations, applied head-first
   origDoc - the currently-refreshed document, or NULL if absent *)
RECURSIVE ApplyBufferedOperations(_, _)
ApplyBufferedOperations(buffer, origDoc)
== IF buffer = <<>>
THEN origDoc
ELSE LET nextOp == Head(buffer)
IN ApplyBufferedOperations(Tail(buffer),
CASE nextOp.type = Lucene_deleteDocuments -> NULL
[] \/ nextOp.type = Lucene_updateDocuments
\/ /\ nextOp.type = Lucene_addDocuments
/\ origDoc = NULL -> [content |-> nextOp.content, seqno |-> nextOp.seqno]
[] OTHER -> Assert(FALSE, "Error: Lucene_addDocuments when origDoc /= NULL"))
Max(a,b) == IF a <= b THEN b ELSE a
(* --algorithm basic
variables
request_count \in 1..4, \* number of distinct seqnos in the workload
replication_requests \in RequestSet(request_count), \* requests still to be consumed
expected_doc = FinalDoc(replication_requests), \* the document the replica must converge to
versionMap_needsSafeAccess = FALSE, \* TRUE when writes must consult the version map
versionMap_isUnsafe = FALSE, \* TRUE when the version map may be missing entries
versionMap_entry = NULL, \* version map entry for the document, or NULL
(* Other concurrent activity can flag that the version map needs to be safely accessed *)
process SafeAccessEnablerProcess = "SafeAccessEnabler"
begin
SafeAccessEnablerLoop:
while pc["Consumer"] /= "Done" do
\* Toggle the flag: comparing with FALSE negates the current value.
versionMap_needsSafeAccess := (versionMap_needsSafeAccess = FALSE);
(* Technically the only way this can go back to FALSE is via a refresh, but
we should not need this fact, so model both kinds of change. *)
end while;
end process;
(* Other concurrent activity can make the version map become unsafe, if safe access mode is disabled *)
process UnsafePutterProcess = "UnsafePutter"
begin
UnsafePutterLoop:
while pc["Consumer"] /= "Done" do
\* Only fires while safe-access mode is off; an unsafe put may then
\* leave the version map missing entries until the next refresh.
await versionMap_needsSafeAccess = FALSE;
versionMap_isUnsafe := TRUE;
end while;
end process;
(* Other concurrent activity can increase the maxUnsafeAutoIdTimestamp *)
process MaxUnsafeAutoIdTimestampIncreaserProcess = "MaxUnsafeAutoIdTimestampIncreaser"
begin
MaxUnsafeAutoIdTimestampIncreaserLoop:
while pc["Consumer"] /= "Done" do
\* Pick any strictly-larger timestamp near the document's own timestamp;
\* the await makes the variable monotonically increasing.
with newTimestamp \in {DocAutoIdTimestamp - 1, DocAutoIdTimestamp, DocAutoIdTimestamp + 1} do
await maxUnsafeAutoIdTimestamp < newTimestamp;
maxUnsafeAutoIdTimestamp := newTimestamp;
end with;
end while;
end process;
(* Lucene refreshes can happen at any time *)
process LuceneProcess = "ReplicaLucene"
variables
lucene_document = NULL, \* the last-refreshed (searchable) document, or NULL
lucene_buffer = <<>>, \* operations indexed but not yet refreshed
begin
LuceneLoop:
while pc["Consumer"] /= "Done" \/ lucene_buffer /= <<>> do
\* A refresh: fold the buffered operations into the searchable document,
\* then reset the version map safety flags.
lucene_document := ApplyBufferedOperations(lucene_buffer, lucene_document);
lucene_buffer := <<>>;
(* TODO Model the inner structure of the version map so this refresh can be
broken into the individual steps that occur concurrently with ongoing indexing. *)
versionMap_isUnsafe := FALSE;
versionMap_needsSafeAccess := FALSE;
if versionMap_entry /= NULL
then
if versionMap_entry.type = UPDATE
then
\* Indexed documents are dropped from the version map once refreshed.
versionMap_entry := NULL;
else
\* Delete tombstones survive the refresh, marked as flushed.
assert versionMap_entry.type = DELETE;
versionMap_entry := [ versionMap_entry EXCEPT !.flushed = TRUE ];
end if;
end if;
end while;
end process;
(* Flushed deletes expire after a time and are cleaned up *)
process DeleteCollectorProcess = "DeleteCollector"
begin
DeleteCollectorLoop:
while pc["Consumer"] /= "Done" do
\* A delete tombstone may only be pruned once it has been flushed by a
\* refresh AND its seqno is covered by the local checkpoint (PR #28790).
await /\ versionMap_entry /= NULL
/\ versionMap_entry.type = DELETE
/\ versionMap_entry.seqno <= localCheckPoint \* PR #28790
/\ versionMap_entry.flushed = TRUE;
versionMap_entry := NULL;
end while;
end process;
(* Local checkpoint advances as each operation is marked as completed *)
process LocalCheckpointTrackerProcess = "LocalCheckpointTracker"
variables
localCheckPoint = 0, \* highest n such that all of 1..n are completed
completedSeqnos = {} \* seqnos whose processing has finished
begin
LocalCheckpointTrackerLoop:
while pc["Consumer"] /= "Done" do
\* Advance one step at a time, only over a contiguous prefix of seqnos.
await localCheckPoint + 1 \in completedSeqnos;
localCheckPoint := localCheckPoint + 1;
end while;
end process
(* Other activity (e.g. non-append-only operations elsewhere) may push up the
   max seqno of non-append-only operations at any time, bounded here so the
   state space stays finite. *)
process UnsafeSeqnoIncreaserProcess = "UnsafeSeqnoIncreaserProcess"
variables
maxSeqNoOfNonAppendOnlyOperations = 0,
begin
UnsafeSeqnoIncreaserProcessLoop:
while pc["Consumer"] /= "Done" /\ maxSeqNoOfNonAppendOnlyOperations < request_count + 1 do
maxSeqNoOfNonAppendOnlyOperations := maxSeqNoOfNonAppendOnlyOperations + 1;
end while;
end process
(* The process that consumes replication requests for a particular document ID, which
are processed in series because of the lock in the version map. *)
process ConsumerProcess = "Consumer"
variables
duplicationCount = 0,
maxUnsafeAutoIdTimestamp \in {0, DocAutoIdTimestamp - 1, DocAutoIdTimestamp, DocAutoIdTimestamp + 1},
req, plan,
deleteFromLucene, currentlyDeleted,
currentNotFoundOrDeleted, useLuceneUpdateDocument, indexIntoLucene,
begin
ConsumerLoop:
while replication_requests /= {} do
\* Pick the next request; requests may be redelivered (duplicated),
\* bounded in total by DuplicationLimit to keep the state space finite.
with replication_request \in replication_requests do
if replication_request.type = ADD
then
(* Never see two ADDs - if duplicated, one of them is a RETRY_ADD *)
either
(* Process ADD without duplication *)
replication_requests := replication_requests \ {replication_request};
req := replication_request;
or
await duplicationCount < DuplicationLimit;
duplicationCount := duplicationCount + 1;
(* Process ADD and leave a duplicate RETRY_ADD for later *)
replication_requests := (replication_requests \ {replication_request})
\cup {[replication_request EXCEPT !.type = RETRY_ADD]};
req := replication_request;
or
await duplicationCount < DuplicationLimit;
duplicationCount := duplicationCount + 1;
(* Process duplicate RETRY_ADD and leave the original ADD *)
req := [replication_request EXCEPT !.type = RETRY_ADD];
end either;
else
req := replication_request;
either
(* Process the request but keep it around, modelling a redelivery *)
await duplicationCount < DuplicationLimit;
duplicationCount := duplicationCount + 1;
or
replication_requests := replication_requests \ {replication_request};
end either;
end if;
end with;
if req.type = DELETE
then
versionMap_needsSafeAccess := TRUE;
(* planDeletionAsNonPrimary *)
maxSeqNoOfNonAppendOnlyOperations := Max(maxSeqNoOfNonAppendOnlyOperations, req.seqno);
if req.seqno <= localCheckPoint
then
(* OP_STALE_OR_EQUAL *)
plan := "processButSkipLucene";
deleteFromLucene := FALSE;
currentlyDeleted := FALSE;
else
if versionMap_isUnsafe
then
(* Perform a Lucene refresh *)
AwaitRefreshOnDelete: \* Label here to allow for other concurrent activity
await lucene_buffer = <<>>;
versionMap_needsSafeAccess := TRUE;
end if;
compareDeleteOpToLuceneDocBasedOnSeqNo: \* Label needed because of AwaitRefreshOnDelete label
if versionMap_entry /= NULL
then
(* Doc is in version map *)
if req.seqno > versionMap_entry.seqno
then
(* OP_NEWER *)
plan := "processNormally";
deleteFromLucene := TRUE;
currentlyDeleted := FALSE;
else
(* OP_STALE_OR_EQUAL *)
plan := "processButSkipLucene";
deleteFromLucene := FALSE;
currentlyDeleted := FALSE;
end if;
else
(* Doc is not in version map - check Lucene *)
if lucene_document = NULL
then
(* LUCENE_DOC_NOT_FOUND *)
plan := "processNormallyExceptNotFound";
deleteFromLucene := TRUE;
currentlyDeleted := TRUE;
else
if req.seqno > lucene_document.seqno
then
(* OP_NEWER *)
plan := "processNormally";
deleteFromLucene := TRUE;
currentlyDeleted := FALSE;
else
(* OP_STALE_OR_EQUAL *)
plan := "processButSkipLucene";
deleteFromLucene := FALSE;
currentlyDeleted := FALSE;
end if;
end if;
end if;
end if;
ExecuteDeletePlan: \* Label needed because of AwaitRefreshOnDelete label
if deleteFromLucene
then
if currentlyDeleted = FALSE
then
lucene_buffer := Append(lucene_buffer, [ type |-> Lucene_deleteDocuments ]);
end if;
\* Record a delete tombstone; 'flushed' becomes TRUE at the next refresh.
versionMap_entry := [ type |-> DELETE, seqno |-> req.seqno, flushed |-> FALSE ];
end if;
completedSeqnos := completedSeqnos \cup {req.seqno};
else
(* planIndexingAsNonPrimary *)
(* A RETRY_ADD has canOptimiseAddDocument = TRUE and
mayHaveBeenIndexedBefore = TRUE so is planned normally,
but also updates maxUnsafeAutoIdTimestamp within
mayHaveBeenIndexedBefore() *)
if req.type = RETRY_ADD
then
maxUnsafeAutoIdTimestamp := Max(maxUnsafeAutoIdTimestamp, req.autoIdTimeStamp);
end if;
(* An ADD can be optimized if mayHaveBeenIndexedBefore = FALSE
which is calculated by comparing timestamps. *)
if /\ req.type = ADD
/\ maxUnsafeAutoIdTimestamp < req.autoIdTimeStamp
/\ maxSeqNoOfNonAppendOnlyOperations < req.seqno \* PR #28787
then
plan := "optimisedAppendOnly";
currentNotFoundOrDeleted := TRUE;
useLuceneUpdateDocument := FALSE;
indexIntoLucene := TRUE;
else
if req.type \notin {ADD, RETRY_ADD}
then
maxSeqNoOfNonAppendOnlyOperations := Max(maxSeqNoOfNonAppendOnlyOperations, req.seqno);
end if;
(* All other operations are planned normally *)
versionMap_needsSafeAccess := TRUE;
if req.seqno <= localCheckPoint
then
(* OP_STALE_OR_EQUAL *)
plan := "processButSkipLucene";
currentNotFoundOrDeleted := FALSE;
useLuceneUpdateDocument := FALSE;
indexIntoLucene := FALSE;
else
if versionMap_isUnsafe
then
(* Perform a Lucene refresh *)
AwaitRefreshOnIndex: \* Label here to allow for other concurrent activity
await lucene_buffer = <<>>;
versionMap_needsSafeAccess := TRUE;
end if;
compareIndexOpToLuceneDocBasedOnSeqNo: \* Label needed because of AwaitRefreshOnIndex label
if req.seqno <= localCheckPoint \* PR #29276
then \* PR #29276
(* OP_STALE_OR_EQUAL *) \* PR #29276
plan := "processButSkipLucene"; \* PR #29276
currentNotFoundOrDeleted := FALSE; \* PR #29276
useLuceneUpdateDocument := FALSE; \* PR #29276
indexIntoLucene := FALSE; \* PR #29276
elsif versionMap_entry /= NULL
then
(* Doc is in version map *)
if req.seqno > versionMap_entry.seqno
then
(* OP_NEWER *)
plan := "processNormally";
currentNotFoundOrDeleted := FALSE;
useLuceneUpdateDocument := TRUE;
indexIntoLucene := TRUE;
else
(* OP_STALE_OR_EQUAL *)
plan := "processButSkipLucene";
currentNotFoundOrDeleted := FALSE;
useLuceneUpdateDocument := FALSE;
indexIntoLucene := FALSE;
end if;
else
(* Doc is not in version map - check Lucene *)
if lucene_document = NULL
then
(* LUCENE_DOC_NOT_FOUND *)
plan := "processNormallyExceptNotFound";
currentNotFoundOrDeleted := TRUE;
useLuceneUpdateDocument := FALSE;
indexIntoLucene := TRUE;
else
if req.seqno > lucene_document.seqno
then
(* OP_NEWER *)
plan := "processNormally";
currentNotFoundOrDeleted := FALSE;
useLuceneUpdateDocument := TRUE;
indexIntoLucene := TRUE;
else
(* OP_STALE_OR_EQUAL *)
plan := "processButSkipLucene";
currentNotFoundOrDeleted := FALSE;
useLuceneUpdateDocument := FALSE;
indexIntoLucene := FALSE;
end if;
end if;
end if;
end if;
end if;
(* planIndexingAsNonPrimary finished - now time to execute the plan *)
ExecuteIndexPlan: \* Label needed because of AwaitRefreshOnIndex label
if indexIntoLucene
then
lucene_buffer := Append(lucene_buffer,
[ type |-> IF useLuceneUpdateDocument THEN Lucene_updateDocuments ELSE Lucene_addDocuments
, seqno |-> req.seqno
, content |-> req.content
]);
if versionMap_needsSafeAccess
then
versionMap_entry := [ type |-> UPDATE, seqno |-> req.seqno ];
else
\* Safe access not required: skip the version map, just mark it unsafe.
versionMap_isUnsafe := TRUE;
if /\ versionMap_entry /= NULL
/\ versionMap_entry.type = DELETE
/\ versionMap_entry.seqno < req.seqno
then
versionMap_entry := NULL; \* Desync bug #3 (no PR number yet)
end if;
end if;
end if;
completedSeqnos := completedSeqnos \cup {req.seqno}
end if;
end while;
end process
end algorithm
*)
\* BEGIN TRANSLATION
CONSTANT defaultInitValue
VARIABLES request_count, replication_requests, expected_doc,
versionMap_needsSafeAccess, versionMap_isUnsafe, versionMap_entry,
pc, lucene_document, lucene_buffer, localCheckPoint,
completedSeqnos, maxSeqNoOfNonAppendOnlyOperations,
duplicationCount, maxUnsafeAutoIdTimestamp, req, plan,
deleteFromLucene, currentlyDeleted, currentNotFoundOrDeleted,
useLuceneUpdateDocument, indexIntoLucene
\* NOTE(review): from here to the END TRANSLATION marker the definitions are
\* generated by the PlusCal translator from the algorithm above; fix the
\* algorithm and re-translate rather than editing these by hand.  Leading
\* alignment appears lost in extraction -- TLA+ conjunction lists are
\* column-sensitive, so re-translate before model checking.
\* `vars` is the tuple of every state variable, used for stuttering ([Next]_vars).
vars == << request_count, replication_requests, expected_doc,
versionMap_needsSafeAccess, versionMap_isUnsafe, versionMap_entry,
pc, lucene_document, lucene_buffer, localCheckPoint,
completedSeqnos, maxSeqNoOfNonAppendOnlyOperations,
duplicationCount, maxUnsafeAutoIdTimestamp, req, plan,
deleteFromLucene, currentlyDeleted, currentNotFoundOrDeleted,
useLuceneUpdateDocument, indexIntoLucene >>
\* Identifiers of the eight concurrent processes of the algorithm.
ProcSet == {"SafeAccessEnabler"} \cup {"UnsafePutter"} \cup {"MaxUnsafeAutoIdTimestampIncreaser"} \cup {"ReplicaLucene"} \cup {"DeleteCollector"} \cup {"LocalCheckpointTracker"} \cup {"UnsafeSeqnoIncreaserProcess"} \cup {"Consumer"}
\* Initial predicate: 1..4 replication requests are chosen non-deterministically;
\* expected_doc is derived via FinalDoc (defined earlier in the file, not visible
\* here).  Every process starts at its loop label; per-request locals start at
\* the translator's defaultInitValue.
Init == (* Global variables *)
/\ request_count \in 1..4
/\ replication_requests \in RequestSet(request_count)
/\ expected_doc = FinalDoc(replication_requests)
/\ versionMap_needsSafeAccess = FALSE
/\ versionMap_isUnsafe = FALSE
/\ versionMap_entry = NULL
(* Process LuceneProcess *)
/\ lucene_document = NULL
/\ lucene_buffer = <<>>
(* Process LocalCheckpointTrackerProcess *)
/\ localCheckPoint = 0
/\ completedSeqnos = {}
(* Process UnsafeSeqnoIncreaserProcess *)
/\ maxSeqNoOfNonAppendOnlyOperations = 0
(* Process ConsumerProcess *)
/\ duplicationCount = 0
/\ maxUnsafeAutoIdTimestamp \in {0, DocAutoIdTimestamp - 1, DocAutoIdTimestamp, DocAutoIdTimestamp + 1}
/\ req = defaultInitValue
/\ plan = defaultInitValue
/\ deleteFromLucene = defaultInitValue
/\ currentlyDeleted = defaultInitValue
/\ currentNotFoundOrDeleted = defaultInitValue
/\ useLuceneUpdateDocument = defaultInitValue
/\ indexIntoLucene = defaultInitValue
/\ pc = [self \in ProcSet |-> CASE self = "SafeAccessEnabler" -> "SafeAccessEnablerLoop"
[] self = "UnsafePutter" -> "UnsafePutterLoop"
[] self = "MaxUnsafeAutoIdTimestampIncreaser" -> "MaxUnsafeAutoIdTimestampIncreaserLoop"
[] self = "ReplicaLucene" -> "LuceneLoop"
[] self = "DeleteCollector" -> "DeleteCollectorLoop"
[] self = "LocalCheckpointTracker" -> "LocalCheckpointTrackerLoop"
[] self = "UnsafeSeqnoIncreaserProcess" -> "UnsafeSeqnoIncreaserProcessLoop"
[] self = "Consumer" -> "ConsumerLoop"]
\* SafeAccessEnabler: toggles versionMap_needsSafeAccess back and forth while
\* the Consumer is still running, modelling concurrent switches of the
\* version-map safe-access mode.
SafeAccessEnablerLoop == /\ pc["SafeAccessEnabler"] = "SafeAccessEnablerLoop"
/\ IF pc["Consumer"] /= "Done"
THEN /\ versionMap_needsSafeAccess' = (versionMap_needsSafeAccess = FALSE)
/\ pc' = [pc EXCEPT !["SafeAccessEnabler"] = "SafeAccessEnablerLoop"]
ELSE /\ pc' = [pc EXCEPT !["SafeAccessEnabler"] = "Done"]
/\ UNCHANGED versionMap_needsSafeAccess
/\ UNCHANGED << request_count, replication_requests,
expected_doc, versionMap_isUnsafe,
versionMap_entry, lucene_document,
lucene_buffer, localCheckPoint,
completedSeqnos,
maxSeqNoOfNonAppendOnlyOperations,
duplicationCount,
maxUnsafeAutoIdTimestamp, req, plan,
deleteFromLucene, currentlyDeleted,
currentNotFoundOrDeleted,
useLuceneUpdateDocument,
indexIntoLucene >>
SafeAccessEnablerProcess == SafeAccessEnablerLoop
\* UnsafePutter: whenever safe access is off, may mark the version map unsafe
\* (an unsafe put happened concurrently).
UnsafePutterLoop == /\ pc["UnsafePutter"] = "UnsafePutterLoop"
/\ IF pc["Consumer"] /= "Done"
THEN /\ versionMap_needsSafeAccess = FALSE
/\ versionMap_isUnsafe' = TRUE
/\ pc' = [pc EXCEPT !["UnsafePutter"] = "UnsafePutterLoop"]
ELSE /\ pc' = [pc EXCEPT !["UnsafePutter"] = "Done"]
/\ UNCHANGED versionMap_isUnsafe
/\ UNCHANGED << request_count, replication_requests,
expected_doc, versionMap_needsSafeAccess,
versionMap_entry, lucene_document,
lucene_buffer, localCheckPoint,
completedSeqnos,
maxSeqNoOfNonAppendOnlyOperations,
duplicationCount, maxUnsafeAutoIdTimestamp,
req, plan, deleteFromLucene,
currentlyDeleted, currentNotFoundOrDeleted,
useLuceneUpdateDocument, indexIntoLucene >>
UnsafePutterProcess == UnsafePutterLoop
\* MaxUnsafeAutoIdTimestampIncreaser: concurrently bumps the unsafe auto-id
\* timestamp to any strictly larger value near DocAutoIdTimestamp.
MaxUnsafeAutoIdTimestampIncreaserLoop == /\ pc["MaxUnsafeAutoIdTimestampIncreaser"] = "MaxUnsafeAutoIdTimestampIncreaserLoop"
/\ IF pc["Consumer"] /= "Done"
THEN /\ \E newTimestamp \in {DocAutoIdTimestamp - 1, DocAutoIdTimestamp, DocAutoIdTimestamp + 1}:
/\ maxUnsafeAutoIdTimestamp < newTimestamp
/\ maxUnsafeAutoIdTimestamp' = newTimestamp
/\ pc' = [pc EXCEPT !["MaxUnsafeAutoIdTimestampIncreaser"] = "MaxUnsafeAutoIdTimestampIncreaserLoop"]
ELSE /\ pc' = [pc EXCEPT !["MaxUnsafeAutoIdTimestampIncreaser"] = "Done"]
/\ UNCHANGED maxUnsafeAutoIdTimestamp
/\ UNCHANGED << request_count,
replication_requests,
expected_doc,
versionMap_needsSafeAccess,
versionMap_isUnsafe,
versionMap_entry,
lucene_document,
lucene_buffer,
localCheckPoint,
completedSeqnos,
maxSeqNoOfNonAppendOnlyOperations,
duplicationCount, req,
plan,
deleteFromLucene,
currentlyDeleted,
currentNotFoundOrDeleted,
useLuceneUpdateDocument,
indexIntoLucene >>
MaxUnsafeAutoIdTimestampIncreaserProcess == MaxUnsafeAutoIdTimestampIncreaserLoop
\* Lucene (refresh): applies all buffered operations to the document, clears
\* the buffer and the unsafe flags, and flushes/clears the version-map entry
\* (UPDATE entries are dropped, DELETE entries are marked flushed).
LuceneLoop == /\ pc["ReplicaLucene"] = "LuceneLoop"
/\ IF pc["Consumer"] /= "Done" \/ lucene_buffer /= <<>>
THEN /\ lucene_document' = ApplyBufferedOperations(lucene_buffer, lucene_document)
/\ lucene_buffer' = <<>>
/\ versionMap_isUnsafe' = FALSE
/\ versionMap_needsSafeAccess' = FALSE
/\ IF versionMap_entry /= NULL
THEN /\ IF versionMap_entry.type = UPDATE
THEN /\ versionMap_entry' = NULL
ELSE /\ Assert(versionMap_entry.type = DELETE,
"Failure of assertion at line 147, column 17.")
/\ versionMap_entry' = [ versionMap_entry EXCEPT !.flushed = TRUE ]
ELSE /\ TRUE
/\ UNCHANGED versionMap_entry
/\ pc' = [pc EXCEPT !["ReplicaLucene"] = "LuceneLoop"]
ELSE /\ pc' = [pc EXCEPT !["ReplicaLucene"] = "Done"]
/\ UNCHANGED << versionMap_needsSafeAccess,
versionMap_isUnsafe, versionMap_entry,
lucene_document, lucene_buffer >>
/\ UNCHANGED << request_count, replication_requests,
expected_doc, localCheckPoint, completedSeqnos,
maxSeqNoOfNonAppendOnlyOperations,
duplicationCount, maxUnsafeAutoIdTimestamp, req,
plan, deleteFromLucene, currentlyDeleted,
currentNotFoundOrDeleted,
useLuceneUpdateDocument, indexIntoLucene >>
LuceneProcess == LuceneLoop
\* DeleteCollector: garbage-collects a flushed DELETE version-map entry once
\* its seqno is covered by the local checkpoint.
DeleteCollectorLoop == /\ pc["DeleteCollector"] = "DeleteCollectorLoop"
/\ IF pc["Consumer"] /= "Done"
THEN /\ /\ versionMap_entry /= NULL
/\ versionMap_entry.type = DELETE
/\ versionMap_entry.seqno <= localCheckPoint
/\ versionMap_entry.flushed = TRUE
/\ versionMap_entry' = NULL
/\ pc' = [pc EXCEPT !["DeleteCollector"] = "DeleteCollectorLoop"]
ELSE /\ pc' = [pc EXCEPT !["DeleteCollector"] = "Done"]
/\ UNCHANGED versionMap_entry
/\ UNCHANGED << request_count, replication_requests,
expected_doc,
versionMap_needsSafeAccess,
versionMap_isUnsafe, lucene_document,
lucene_buffer, localCheckPoint,
completedSeqnos,
maxSeqNoOfNonAppendOnlyOperations,
duplicationCount,
maxUnsafeAutoIdTimestamp, req, plan,
deleteFromLucene, currentlyDeleted,
currentNotFoundOrDeleted,
useLuceneUpdateDocument,
indexIntoLucene >>
DeleteCollectorProcess == DeleteCollectorLoop
\* LocalCheckpointTracker: advances the checkpoint one seqno at a time, only
\* when the next seqno has been completed.
LocalCheckpointTrackerLoop == /\ pc["LocalCheckpointTracker"] = "LocalCheckpointTrackerLoop"
/\ IF pc["Consumer"] /= "Done"
THEN /\ localCheckPoint + 1 \in completedSeqnos
/\ localCheckPoint' = localCheckPoint + 1
/\ pc' = [pc EXCEPT !["LocalCheckpointTracker"] = "LocalCheckpointTrackerLoop"]
ELSE /\ pc' = [pc EXCEPT !["LocalCheckpointTracker"] = "Done"]
/\ UNCHANGED localCheckPoint
/\ UNCHANGED << request_count,
replication_requests,
expected_doc,
versionMap_needsSafeAccess,
versionMap_isUnsafe,
versionMap_entry,
lucene_document, lucene_buffer,
completedSeqnos,
maxSeqNoOfNonAppendOnlyOperations,
duplicationCount,
maxUnsafeAutoIdTimestamp, req,
plan, deleteFromLucene,
currentlyDeleted,
currentNotFoundOrDeleted,
useLuceneUpdateDocument,
indexIntoLucene >>
LocalCheckpointTrackerProcess == LocalCheckpointTrackerLoop
\* UnsafeSeqnoIncreaser: concurrently bumps maxSeqNoOfNonAppendOnlyOperations,
\* bounded by request_count + 1 to keep the state space finite.
UnsafeSeqnoIncreaserProcessLoop == /\ pc["UnsafeSeqnoIncreaserProcess"] = "UnsafeSeqnoIncreaserProcessLoop"
/\ IF pc["Consumer"] /= "Done" /\ maxSeqNoOfNonAppendOnlyOperations < request_count + 1
THEN /\ maxSeqNoOfNonAppendOnlyOperations' = maxSeqNoOfNonAppendOnlyOperations + 1
/\ pc' = [pc EXCEPT !["UnsafeSeqnoIncreaserProcess"] = "UnsafeSeqnoIncreaserProcessLoop"]
ELSE /\ pc' = [pc EXCEPT !["UnsafeSeqnoIncreaserProcess"] = "Done"]
/\ UNCHANGED maxSeqNoOfNonAppendOnlyOperations
/\ UNCHANGED << request_count,
replication_requests,
expected_doc,
versionMap_needsSafeAccess,
versionMap_isUnsafe,
versionMap_entry,
lucene_document,
lucene_buffer,
localCheckPoint,
completedSeqnos,
duplicationCount,
maxUnsafeAutoIdTimestamp,
req, plan, deleteFromLucene,
currentlyDeleted,
currentNotFoundOrDeleted,
useLuceneUpdateDocument,
indexIntoLucene >>
UnsafeSeqnoIncreaserProcess == UnsafeSeqnoIncreaserProcessLoop
\* Consumer: picks a replication request; ADD requests may additionally be
\* duplicated (up to DuplicationLimit) as RETRY_ADD to model redelivery.
\* It then plans how to apply the operation: DELETEs below the checkpoint are
\* skipped, ADDs may take the optimised append-only path, everything else is
\* compared against the version map / Lucene (possibly after a refresh).
ConsumerLoop == /\ pc["Consumer"] = "ConsumerLoop"
/\ IF replication_requests /= {}
THEN /\ \E replication_request \in replication_requests:
IF replication_request.type = ADD
THEN /\ \/ /\ replication_requests' = replication_requests \ {replication_request}
/\ req' = replication_request
/\ UNCHANGED duplicationCount
\/ /\ duplicationCount < DuplicationLimit
/\ duplicationCount' = duplicationCount + 1
/\ replication_requests' = (replication_requests \ {replication_request})
\cup {[replication_request EXCEPT !.type = RETRY_ADD]}
/\ req' = replication_request
\/ /\ duplicationCount < DuplicationLimit
/\ duplicationCount' = duplicationCount + 1
/\ req' = [replication_request EXCEPT !.type = RETRY_ADD]
/\ UNCHANGED replication_requests
ELSE /\ req' = replication_request
/\ \/ /\ duplicationCount < DuplicationLimit
/\ duplicationCount' = duplicationCount + 1
/\ UNCHANGED replication_requests
\/ /\ replication_requests' = replication_requests \ {replication_request}
/\ UNCHANGED duplicationCount
/\ IF req'.type = DELETE
THEN /\ versionMap_needsSafeAccess' = TRUE
/\ maxSeqNoOfNonAppendOnlyOperations' = Max(maxSeqNoOfNonAppendOnlyOperations, req'.seqno)
/\ IF req'.seqno <= localCheckPoint
THEN /\ plan' = "processButSkipLucene"
/\ deleteFromLucene' = FALSE
/\ currentlyDeleted' = FALSE
/\ pc' = [pc EXCEPT !["Consumer"] = "ExecuteDeletePlan"]
ELSE /\ IF versionMap_isUnsafe
THEN /\ pc' = [pc EXCEPT !["Consumer"] = "AwaitRefreshOnDelete"]
ELSE /\ pc' = [pc EXCEPT !["Consumer"] = "compareDeleteOpToLuceneDocBasedOnSeqNo"]
/\ UNCHANGED << plan,
deleteFromLucene,
currentlyDeleted >>
/\ UNCHANGED << maxUnsafeAutoIdTimestamp,
currentNotFoundOrDeleted,
useLuceneUpdateDocument,
indexIntoLucene >>
ELSE /\ IF req'.type = RETRY_ADD
THEN /\ maxUnsafeAutoIdTimestamp' = Max(maxUnsafeAutoIdTimestamp, req'.autoIdTimeStamp)
ELSE /\ TRUE
/\ UNCHANGED maxUnsafeAutoIdTimestamp
/\ IF /\ req'.type = ADD
/\ maxUnsafeAutoIdTimestamp' < req'.autoIdTimeStamp
/\ maxSeqNoOfNonAppendOnlyOperations < req'.seqno
THEN /\ plan' = "optimisedAppendOnly"
/\ currentNotFoundOrDeleted' = TRUE
/\ useLuceneUpdateDocument' = FALSE
/\ indexIntoLucene' = TRUE
/\ pc' = [pc EXCEPT !["Consumer"] = "ExecuteIndexPlan"]
/\ UNCHANGED << versionMap_needsSafeAccess,
maxSeqNoOfNonAppendOnlyOperations >>
ELSE /\ IF req'.type \notin {ADD, RETRY_ADD}
THEN /\ maxSeqNoOfNonAppendOnlyOperations' = Max(maxSeqNoOfNonAppendOnlyOperations, req'.seqno)
ELSE /\ TRUE
/\ UNCHANGED maxSeqNoOfNonAppendOnlyOperations
/\ versionMap_needsSafeAccess' = TRUE
/\ IF req'.seqno <= localCheckPoint
THEN /\ plan' = "processButSkipLucene"
/\ currentNotFoundOrDeleted' = FALSE
/\ useLuceneUpdateDocument' = FALSE
/\ indexIntoLucene' = FALSE
/\ pc' = [pc EXCEPT !["Consumer"] = "ExecuteIndexPlan"]
ELSE /\ IF versionMap_isUnsafe
THEN /\ pc' = [pc EXCEPT !["Consumer"] = "AwaitRefreshOnIndex"]
ELSE /\ pc' = [pc EXCEPT !["Consumer"] = "compareIndexOpToLuceneDocBasedOnSeqNo"]
/\ UNCHANGED << plan,
currentNotFoundOrDeleted,
useLuceneUpdateDocument,
indexIntoLucene >>
/\ UNCHANGED << deleteFromLucene,
currentlyDeleted >>
ELSE /\ pc' = [pc EXCEPT !["Consumer"] = "Done"]
/\ UNCHANGED << replication_requests,
versionMap_needsSafeAccess,
maxSeqNoOfNonAppendOnlyOperations,
duplicationCount,
maxUnsafeAutoIdTimestamp, req, plan,
deleteFromLucene, currentlyDeleted,
currentNotFoundOrDeleted,
useLuceneUpdateDocument,
indexIntoLucene >>
/\ UNCHANGED << request_count, expected_doc,
versionMap_isUnsafe, versionMap_entry,
lucene_document, lucene_buffer,
localCheckPoint, completedSeqnos >>
\* ExecuteDeletePlan: carries out the delete plan chosen earlier -- possibly
\* buffering a Lucene delete and recording a DELETE version-map entry -- and
\* always marks the request's seqno completed.
ExecuteDeletePlan == /\ pc["Consumer"] = "ExecuteDeletePlan"
/\ IF deleteFromLucene
THEN /\ IF currentlyDeleted = FALSE
THEN /\ lucene_buffer' = Append(lucene_buffer, [ type |-> Lucene_deleteDocuments ])
ELSE /\ TRUE
/\ UNCHANGED lucene_buffer
/\ versionMap_entry' = [ type |-> DELETE, seqno |-> req.seqno, flushed |-> FALSE ]
ELSE /\ TRUE
/\ UNCHANGED << versionMap_entry,
lucene_buffer >>
/\ completedSeqnos' = (completedSeqnos \cup {req.seqno})
/\ pc' = [pc EXCEPT !["Consumer"] = "ConsumerLoop"]
/\ UNCHANGED << request_count, replication_requests,
expected_doc, versionMap_needsSafeAccess,
versionMap_isUnsafe, lucene_document,
localCheckPoint,
maxSeqNoOfNonAppendOnlyOperations,
duplicationCount,
maxUnsafeAutoIdTimestamp, req, plan,
deleteFromLucene, currentlyDeleted,
currentNotFoundOrDeleted,
useLuceneUpdateDocument, indexIntoLucene >>
\* ExecuteIndexPlan: carries out the index plan -- buffering an add/update for
\* Lucene; under safe access it records an UPDATE version-map entry, otherwise
\* it marks the map unsafe and drops a stale DELETE entry.  Always marks the
\* seqno completed.
ExecuteIndexPlan == /\ pc["Consumer"] = "ExecuteIndexPlan"
/\ IF indexIntoLucene
THEN /\ lucene_buffer' = Append(lucene_buffer,
[ type |-> IF useLuceneUpdateDocument THEN Lucene_updateDocuments ELSE Lucene_addDocuments
, seqno |-> req.seqno
, content |-> req.content
])
/\ IF versionMap_needsSafeAccess
THEN /\ versionMap_entry' = [ type |-> UPDATE, seqno |-> req.seqno ]
/\ UNCHANGED versionMap_isUnsafe
ELSE /\ versionMap_isUnsafe' = TRUE
/\ IF /\ versionMap_entry /= NULL
/\ versionMap_entry.type = DELETE
/\ versionMap_entry.seqno < req.seqno
THEN /\ versionMap_entry' = NULL
ELSE /\ TRUE
/\ UNCHANGED versionMap_entry
ELSE /\ TRUE
/\ UNCHANGED << versionMap_isUnsafe,
versionMap_entry, lucene_buffer >>
/\ completedSeqnos' = (completedSeqnos \cup {req.seqno})
/\ pc' = [pc EXCEPT !["Consumer"] = "ConsumerLoop"]
/\ UNCHANGED << request_count, replication_requests,
expected_doc, versionMap_needsSafeAccess,
lucene_document, localCheckPoint,
maxSeqNoOfNonAppendOnlyOperations,
duplicationCount, maxUnsafeAutoIdTimestamp,
req, plan, deleteFromLucene,
currentlyDeleted, currentNotFoundOrDeleted,
useLuceneUpdateDocument, indexIntoLucene >>
\* Plans a delete by comparing the request's seqno to the version-map entry
\* (preferred) or, failing that, the current Lucene document.
compareDeleteOpToLuceneDocBasedOnSeqNo == /\ pc["Consumer"] = "compareDeleteOpToLuceneDocBasedOnSeqNo"
/\ IF versionMap_entry /= NULL
THEN /\ IF req.seqno > versionMap_entry.seqno
THEN /\ plan' = "processNormally"
/\ deleteFromLucene' = TRUE
/\ currentlyDeleted' = FALSE
ELSE /\ plan' = "processButSkipLucene"
/\ deleteFromLucene' = FALSE
/\ currentlyDeleted' = FALSE
ELSE /\ IF lucene_document = NULL
THEN /\ plan' = "processNormallyExceptNotFound"
/\ deleteFromLucene' = TRUE
/\ currentlyDeleted' = TRUE
ELSE /\ IF req.seqno > lucene_document.seqno
THEN /\ plan' = "processNormally"
/\ deleteFromLucene' = TRUE
/\ currentlyDeleted' = FALSE
ELSE /\ plan' = "processButSkipLucene"
/\ deleteFromLucene' = FALSE
/\ currentlyDeleted' = FALSE
/\ pc' = [pc EXCEPT !["Consumer"] = "ExecuteDeletePlan"]
/\ UNCHANGED << request_count,
replication_requests,
expected_doc,
versionMap_needsSafeAccess,
versionMap_isUnsafe,
versionMap_entry,
lucene_document,
lucene_buffer,
localCheckPoint,
completedSeqnos,
maxSeqNoOfNonAppendOnlyOperations,
duplicationCount,
maxUnsafeAutoIdTimestamp,
req,
currentNotFoundOrDeleted,
useLuceneUpdateDocument,
indexIntoLucene >>
\* Waits for a Lucene refresh (empty buffer) before planning a delete, then
\* forces safe access on.
AwaitRefreshOnDelete == /\ pc["Consumer"] = "AwaitRefreshOnDelete"
/\ lucene_buffer = <<>>
/\ versionMap_needsSafeAccess' = TRUE
/\ pc' = [pc EXCEPT !["Consumer"] = "compareDeleteOpToLuceneDocBasedOnSeqNo"]
/\ UNCHANGED << request_count, replication_requests,
expected_doc, versionMap_isUnsafe,
versionMap_entry, lucene_document,
lucene_buffer, localCheckPoint,
completedSeqnos,
maxSeqNoOfNonAppendOnlyOperations,
duplicationCount,
maxUnsafeAutoIdTimestamp, req, plan,
deleteFromLucene, currentlyDeleted,
currentNotFoundOrDeleted,
useLuceneUpdateDocument,
indexIntoLucene >>
\* Plans an index operation: skip when below the checkpoint, otherwise compare
\* against the version-map entry or the Lucene document, as for deletes.
compareIndexOpToLuceneDocBasedOnSeqNo == /\ pc["Consumer"] = "compareIndexOpToLuceneDocBasedOnSeqNo"
/\ IF req.seqno <= localCheckPoint
THEN /\ plan' = "processButSkipLucene"
/\ currentNotFoundOrDeleted' = FALSE
/\ useLuceneUpdateDocument' = FALSE
/\ indexIntoLucene' = FALSE
ELSE /\ IF versionMap_entry /= NULL
THEN /\ IF req.seqno > versionMap_entry.seqno
THEN /\ plan' = "processNormally"
/\ currentNotFoundOrDeleted' = FALSE
/\ useLuceneUpdateDocument' = TRUE
/\ indexIntoLucene' = TRUE
ELSE /\ plan' = "processButSkipLucene"
/\ currentNotFoundOrDeleted' = FALSE
/\ useLuceneUpdateDocument' = FALSE
/\ indexIntoLucene' = FALSE
ELSE /\ IF lucene_document = NULL
THEN /\ plan' = "processNormallyExceptNotFound"
/\ currentNotFoundOrDeleted' = TRUE
/\ useLuceneUpdateDocument' = FALSE
/\ indexIntoLucene' = TRUE
ELSE /\ IF req.seqno > lucene_document.seqno
THEN /\ plan' = "processNormally"
/\ currentNotFoundOrDeleted' = FALSE
/\ useLuceneUpdateDocument' = TRUE
/\ indexIntoLucene' = TRUE
ELSE /\ plan' = "processButSkipLucene"
/\ currentNotFoundOrDeleted' = FALSE
/\ useLuceneUpdateDocument' = FALSE
/\ indexIntoLucene' = FALSE
/\ pc' = [pc EXCEPT !["Consumer"] = "ExecuteIndexPlan"]
/\ UNCHANGED << request_count,
replication_requests,
expected_doc,
versionMap_needsSafeAccess,
versionMap_isUnsafe,
versionMap_entry,
lucene_document,
lucene_buffer,
localCheckPoint,
completedSeqnos,
maxSeqNoOfNonAppendOnlyOperations,
duplicationCount,
maxUnsafeAutoIdTimestamp,
req, deleteFromLucene,
currentlyDeleted >>
\* Waits for a Lucene refresh before planning an index op, then forces safe
\* access on.
AwaitRefreshOnIndex == /\ pc["Consumer"] = "AwaitRefreshOnIndex"
/\ lucene_buffer = <<>>
/\ versionMap_needsSafeAccess' = TRUE
/\ pc' = [pc EXCEPT !["Consumer"] = "compareIndexOpToLuceneDocBasedOnSeqNo"]
/\ UNCHANGED << request_count, replication_requests,
expected_doc, versionMap_isUnsafe,
versionMap_entry, lucene_document,
lucene_buffer, localCheckPoint,
completedSeqnos,
maxSeqNoOfNonAppendOnlyOperations,
duplicationCount,
maxUnsafeAutoIdTimestamp, req, plan,
deleteFromLucene, currentlyDeleted,
currentNotFoundOrDeleted,
useLuceneUpdateDocument,
indexIntoLucene >>
\* Disjunction of all Consumer labels.
ConsumerProcess == ConsumerLoop \/ ExecuteDeletePlan \/ ExecuteIndexPlan
\/ compareDeleteOpToLuceneDocBasedOnSeqNo
\/ AwaitRefreshOnDelete
\/ compareIndexOpToLuceneDocBasedOnSeqNo
\/ AwaitRefreshOnIndex
\* Next-state relation: any process may step; the final disjunct allows
\* stuttering after global termination to avoid a deadlock report.
Next == SafeAccessEnablerProcess \/ UnsafePutterProcess
\/ MaxUnsafeAutoIdTimestampIncreaserProcess \/ LuceneProcess
\/ DeleteCollectorProcess \/ LocalCheckpointTrackerProcess
\/ UnsafeSeqnoIncreaserProcess \/ ConsumerProcess
\/ (* Disjunct to prevent deadlock on termination *)
((\A self \in ProcSet: pc[self] = "Done") /\ UNCHANGED vars)
Spec == Init /\ [][Next]_vars
Termination == <>(\A self \in ProcSet: pc[self] = "Done")
\* END TRANSLATION
\* Terminated: every process has reached its final label.
Terminated == \A self \in ProcSet: pc[self] = "Done"
\* Correctness: once terminated, the Lucene document must match the expected
\* outcome of the replication requests (NULL means "no document present").
Invariant == Terminated => /\ expected_doc = NULL => lucene_document = NULL
/\ expected_doc /= NULL => lucene_document.content = expected_doc
=============================================================================
================================================
FILE: ReplicaEngine/tla/ReplicaEngine.toolbox/.project
================================================
ReplicaEnginetoolbox.builder.TLAParserBuildertoolbox.builder.PCalAlgorithmSearchingBuildertoolbox.natures.TLANatureReplicaEngine.tla1PARENT-1-PROJECT_LOC/ReplicaEngine.tla
================================================
FILE: ReplicaEngine/tla/ReplicaEngine.toolbox/.settings/org.lamport.tla.toolbox.prefs
================================================
ProjectRootFile=PARENT-1-PROJECT_LOC/ReplicaEngine.tla
eclipse.preferences.version=1
================================================
FILE: ReplicaEngine/tla/ReplicaEngine.toolbox/ReplicaEngine___model.launch
================================================
================================================
FILE: Storage/tla/Storage.tla
================================================
------------------------------ MODULE Storage ------------------------------
EXTENDS Integers, FiniteSets, TLC
CONSTANTS
MaxNewMeta, \* maximum generation of newMeta to limit the state space
MetaDataContent \* content that is written to the metadata file
VARIABLES
metadata, \* metaData[i] = MetaDataContent if metadata of generation i is present
manifest, \* manifest[j] is generation of metadata j-th manifest is referencing
newMeta, \* generation of newly created metadata file
newManifest, \* generation of newly created manifest file
state, \* current state, describes what to do next
possibleStates \* set of generations of metadata that limits what can be read from disk
--------------------------------------------------------------------------
(*************************************************************************)
(* First we define some helper functions to work with files abstraction. *)
(* Files is a function from file generation to some content. *)
(*************************************************************************)
(*************************************************************************)
(* CurrentGeneration returns the maximum file generation. If there are *)
(* no files then -1 is returned. *)
(*************************************************************************)
CurrentGeneration(files) ==
    IF DOMAIN files = {}
    THEN -1
    ELSE CHOOSE maxGen \in DOMAIN files :
             \A gen \in DOMAIN files : gen \leq maxGen
(*************************************************************************)
(* DeleteFile removes file with generation delGen. *)
(*************************************************************************)
DeleteFile(files, delGen) == [gen \in DOMAIN files \ {delGen} |-> files[gen]] \* restrict the domain, dropping delGen
(*************************************************************************)
(* DeleteFilesExcept removes all files except keepGen. *)
(*************************************************************************)
DeleteFilesExcept(files, keepGen) == (keepGen :> files[keepGen]) \* single-entry function: only keepGen survives
(*************************************************************************)
(* WriteFile creates new file with specified generation and content. *)
(*************************************************************************)
WriteFile(files, gen, content) == (gen :> content) @@ files \* @@ prefers its left operand, so gen's new content wins
--------------------------------------------------------------------------
(*************************************************************************)
(* Now we define functions to emulate write and cleanup of the metadata. *)
(*************************************************************************)
\* Successful metadata write: the file is on disk, proceed to the manifest.
WriteMetaOk(gen) ==
/\ metadata' = WriteFile(metadata, gen, MetaDataContent)
/\ state' = "writeManifest"
\* Clean failure: nothing written, retry the metadata write.
WriteMetaFail(gen) ==
/\ metadata' = metadata
/\ state' = "writeMeta"
\* Dirty failure: the file may or may not have appeared; clean it up first.
WriteMetaDirty(gen) ==
/\ \/ metadata' = WriteFile(metadata, gen, MetaDataContent)
\/ metadata' = metadata
/\ state' = "deleteNewMeta"
\* Removes the freshly written metadata file after a dirty write; the delete
\* itself may also fail to take effect, after which we retry writing metadata.
DeleteNewMeta ==
    /\ \/ metadata' = DeleteFile(metadata, newMeta)
       \/ metadata' = metadata \* the delete may not take effect
    /\ state' = "writeMeta"
    \* NOTE(review): tuple reconstructed -- extraction collapsed the original
    \* `UNCHANGED <<...>>` to `<>`; it lists every variable not assigned here.
    /\ UNCHANGED <<manifest, newMeta, newManifest, possibleStates>>
\* Removes all metadata files except the newly committed one; the cleanup may
\* fail, leaving old files behind (which must stay harmless).
DeleteOldMeta ==
    /\ \/ metadata' = DeleteFilesExcept(metadata, newMeta)
       \/ metadata' = metadata \* the cleanup may not take effect
    /\ state' = "writeMeta"
    \* NOTE(review): tuple reconstructed from the garbled `UNCHANGED <>`.
    /\ UNCHANGED <<manifest, newMeta, newManifest, possibleStates>>
\* Writes a metadata file at the next generation; the write can succeed, fail
\* cleanly, or fail dirtily (file state unknown).
WriteMeta ==
    LET gen == CurrentGeneration(metadata) + 1 IN
    /\ newMeta' = gen
    /\ \/ WriteMetaOk(gen)
       \/ WriteMetaFail(gen)
       \/ WriteMetaDirty(gen)
    \* NOTE(review): tuple reconstructed from the garbled `UNCHANGED <>`;
    \* metadata and state are assigned by the chosen sub-action above.
    /\ UNCHANGED <<manifest, newManifest, possibleStates>>
--------------------------------------------------------------------------
(*************************************************************************)
(* Now we define functions to emulate write and cleanup of the manifest *)
(* file. *)
(*************************************************************************)
\* Successful manifest write: it now references newMeta, which becomes the
\* only metadata generation readable from disk.
WriteManifestOk(gen) ==
/\ manifest' = WriteFile(manifest, gen, newMeta)
/\ state' = "deleteOldManifest"
/\ possibleStates' = {newMeta}
\* Clean failure: nothing written, go clean up the new metadata file.
WriteManifestFail(gen) ==
/\ manifest' = manifest
/\ state' = "deleteNewManifest"
/\ possibleStates' = possibleStates
\* Dirty failure: the manifest may or may not reference newMeta, so newMeta
\* joins the set of states that might be read back.
WriteManifestDirty(gen) ==
/\ \/ manifest' = WriteFile(manifest, gen, newMeta)
\/ manifest' = manifest
/\ state' = "deleteNewManifest"
/\ possibleStates' = possibleStates \union {newMeta}
\* Writes a manifest file at the next generation; the write can succeed, fail
\* cleanly, or fail dirtily.
WriteManifest ==
    LET gen == CurrentGeneration(manifest) + 1 IN
    /\ newManifest' = gen
    /\ \/ WriteManifestOk(gen)
       \/ WriteManifestFail(gen)
       \/ WriteManifestDirty(gen)
    \* NOTE(review): tuple reconstructed from the garbled `UNCHANGED <>`;
    \* manifest, state and possibleStates are assigned by the sub-action.
    /\ UNCHANGED <<metadata, newMeta>>
\* Removes all manifest files except the newly committed one; may fail.
DeleteOldManifest ==
    /\ \/ manifest' = DeleteFilesExcept(manifest, newManifest)
       \/ manifest' = manifest \* the cleanup may not take effect
    /\ state' = "deleteOldMeta"
    \* NOTE(review): tuple reconstructed from the garbled `UNCHANGED <>`.
    /\ UNCHANGED <<metadata, newMeta, newManifest, possibleStates>>
--------------------------------------------------------------------------
(*************************************************************************)
(* Below are 3 versions of the same function, that is called when *)
(* manifest write was dirty. The buggy one was initially implemented and *)
(* was caught by https://github.com/elastic/elasticsearch/issues/39077. *)
(* Pick one and use in Next function. *)
(* https://github.com/elastic/elasticsearch/pull/40519 implements *)
(* DeleteNewManifestEasy. *)
(*************************************************************************)
\* Buggy variant: proceeds to delete the new metadata even when the manifest
\* delete may have failed (see issue #39077 referenced above).
DeleteNewManifestBuggy ==
    /\ \/ manifest' = DeleteFile(manifest, newManifest)
       \/ manifest' = manifest \* the delete may not take effect
    /\ state' = "deleteNewMeta"
    \* NOTE(review): tuple reconstructed from the garbled `UNCHANGED <>`.
    /\ UNCHANGED <<metadata, newMeta, newManifest, possibleStates>>
\* Safe variant (implemented by PR #40519): never deletes the new metadata
\* after a dirty manifest write, going straight back to writing metadata.
DeleteNewManifestEasy ==
    /\ \/ manifest' = DeleteFile(manifest, newManifest)
       \/ manifest' = manifest \* the delete may not take effect
    /\ state' = "writeMeta"
    \* NOTE(review): tuple reconstructed from the garbled `UNCHANGED <>`.
    /\ UNCHANGED <<metadata, newMeta, newManifest, possibleStates>>
\* Stricter variant: only deletes the new metadata when the manifest delete is
\* known to have succeeded; otherwise retries writing metadata.
DeleteNewManifestHard ==
    /\ \/ /\ manifest' = DeleteFile(manifest, newManifest)
          /\ state' = "deleteNewMeta"
       \/ /\ manifest' = manifest \* the delete may not take effect
          /\ state' = "writeMeta"
    \* NOTE(review): tuple reconstructed from the garbled `UNCHANGED <>`.
    /\ UNCHANGED <<metadata, newMeta, newManifest, possibleStates>>
--------------------------------------------------------------------------
(*************************************************************************)
(* We can define Init and Next functions now. *)
(*************************************************************************)
\* Initial state: an empty disk, about to write the first metadata file.
Init ==
/\ metadata = <<>>
/\ manifest = <<>>
/\ newMeta = -1 \* no latest metadata file
/\ newManifest = -1 \* no latest manifest file
/\ state = "writeMeta" \* we start with writing metadata file
/\ possibleStates = {} \* no metadata can be read from disk
\* State machine: the `state` variable selects which action runs next.
Next ==
\/ (state = "writeMeta" /\ WriteMeta)
\/ (state = "writeManifest" /\ WriteManifest)
\/ (state = "deleteOldManifest" /\ DeleteOldManifest)
\/ (state = "deleteOldMeta" /\ DeleteOldMeta)
\/ (state = "deleteNewManifest" /\ DeleteNewManifestEasy) \* try DeleteNewManifestBuggy and DeleteNewManifestHard
\/ (state = "deleteNewMeta" /\ DeleteNewMeta)
--------------------------------------------------------------------------
(*************************************************************************)
(* Our model has 2 invariants. *)
(*************************************************************************)
\* The latest manifest (if any) must point at a metadata file that exists.
MetadataFileReferencedByManifestExists ==
CurrentGeneration(manifest) /= -1
=>
manifest[CurrentGeneration(manifest)] \in DOMAIN metadata
\* The metadata generation the latest manifest references must be one that
\* could legitimately be read back from disk.
MetadataReferencedByManifestIsValid ==
CurrentGeneration(manifest) /= -1
=>
manifest[CurrentGeneration(manifest)] \in possibleStates
============
================================================
FILE: Storage/tla/Storage.toolbox/Storage___model.launch
================================================
================================================
FILE: ZenWithTerms/tla/ZenWithTerms.tla
================================================
-------------------------------------------------------------------------------------
-------------------------------- MODULE ZenWithTerms --------------------------------
\* Imported modules used in this specification
EXTENDS Naturals, FiniteSets, Sequences, TLC
----
CONSTANTS Values
\* Set of node ids (all master-eligible nodes)
CONSTANTS Nodes
\* RPC message types
CONSTANTS
Join,
PublishRequest,
PublishResponse,
Commit
----
\* Set of requests and responses sent between nodes.
VARIABLE messages
\* Transitive closure of value updates as done by leaders
VARIABLE descendant
\* Values to bootstrap the cluster
VARIABLE initialConfiguration
VARIABLE initialValue
VARIABLE initialAcceptedVersion
\* node state (map from node id to state)
VARIABLE currentTerm
VARIABLE lastCommittedConfiguration
VARIABLE lastAcceptedTerm
VARIABLE lastAcceptedVersion
VARIABLE lastAcceptedValue
VARIABLE lastAcceptedConfiguration
VARIABLE joinVotes
VARIABLE startedJoinSinceLastReboot
VARIABLE electionWon
VARIABLE lastPublishedVersion
VARIABLE lastPublishedConfiguration
VARIABLE publishVotes
----
\* Terms and versions are modelled as naturals.
Terms == Nat
Versions == Nat
\* set of valid configurations (i.e. the set of all non-empty subsets of Nodes)
ValidConfigs == SUBSET(Nodes) \ {{}}
\* cluster-state versions that might have come from older systems
InitialVersions == Nat
\* quorums correspond to majority of votes in a config
IsQuorum(votes, config) == Cardinality(votes \cap config) * 2 > Cardinality(config)
\* an election needs a quorum in both the last committed and last accepted config
IsElectionQuorum(n, votes) ==
/\ IsQuorum(votes, lastCommittedConfiguration[n])
/\ IsQuorum(votes, lastAcceptedConfiguration[n])
\* a publication needs a quorum in both the last committed and last published config
IsPublishQuorum(n, votes) ==
/\ IsQuorum(votes, lastCommittedConfiguration[n])
/\ IsQuorum(votes, lastPublishedConfiguration[n])
\* initial model state
\* All nodes start un-bootstrapped (empty configs, term 0), agreeing on an
\* initial value; accepted versions may carry over from an older system.
Init == /\ messages = {}
/\ descendant = {}
/\ initialConfiguration \in ValidConfigs
/\ initialValue \in Values
/\ initialAcceptedVersion \in [Nodes -> InitialVersions]
/\ currentTerm = [n \in Nodes |-> 0]
/\ lastCommittedConfiguration = [n \in Nodes |-> {}] \* empty config
/\ lastAcceptedTerm = [n \in Nodes |-> 0]
/\ lastAcceptedVersion = initialAcceptedVersion
/\ lastAcceptedValue \in {[n \in Nodes |-> v] : v \in Values} \* all agree on initial value
/\ lastAcceptedConfiguration = [n \in Nodes |-> lastCommittedConfiguration[n]]
/\ joinVotes = [n \in Nodes |-> {}]
/\ startedJoinSinceLastReboot = [n \in Nodes |-> FALSE]
/\ electionWon = [n \in Nodes |-> FALSE]
/\ lastPublishedVersion = [n \in Nodes |-> 0]
/\ lastPublishedConfiguration = [n \in Nodes |-> lastCommittedConfiguration[n]]
/\ publishVotes = [n \in Nodes |-> {}]
\* Bootstrap node n with the initial state and config
SetInitialState(n) ==
    /\ lastAcceptedConfiguration[n] = {} \* not already bootstrapped
    /\ Assert(lastAcceptedTerm[n] = 0, "lastAcceptedTerm should be 0")
    /\ Assert(lastCommittedConfiguration[n] = {}, "lastCommittedConfiguration should be empty")
    /\ Assert(lastPublishedVersion[n] = 0, "lastPublishedVersion should be 0")
    /\ Assert(lastPublishedConfiguration[n] = {}, "lastPublishedConfiguration should be empty")
    /\ Assert(electionWon[n] = FALSE, "electionWon should be FALSE")
    /\ Assert(joinVotes[n] = {}, "joinVotes should be empty")
    /\ Assert(publishVotes[n] = {}, "publishVotes should be empty")
    /\ lastAcceptedConfiguration' = [lastAcceptedConfiguration EXCEPT ![n] = initialConfiguration]
    /\ lastAcceptedValue' = [lastAcceptedValue EXCEPT ![n] = initialValue]
    /\ lastCommittedConfiguration' = [lastCommittedConfiguration EXCEPT ![n] = initialConfiguration]
    /\ Assert(lastAcceptedTerm[n] = 0, "lastAcceptedTerm should be 0")
    /\ Assert(lastAcceptedConfiguration'[n] /= {}, "lastAcceptedConfiguration should be non-empty")
    /\ Assert(lastCommittedConfiguration'[n] /= {}, "lastCommittedConfiguration should be non-empty")
    \* NOTE(review): tuple reconstructed -- extraction collapsed the original
    \* `UNCHANGED <<...>>` to `<>`; it lists every variable not assigned above.
    /\ UNCHANGED <<messages, descendant, initialConfiguration, initialValue,
                   initialAcceptedVersion, currentTerm, lastAcceptedTerm,
                   lastAcceptedVersion, joinVotes, startedJoinSinceLastReboot,
                   electionWon, lastPublishedVersion, lastPublishedConfiguration,
                   publishVotes>>
\* Send join request from node n to node nm for term t
HandleStartJoin(n, nm, t) ==
    /\ t > currentTerm[n] \* only move to strictly greater terms
    /\ LET
         joinRequest == [method    |-> Join,
                         source    |-> n,
                         dest      |-> nm,
                         term      |-> t,
                         laTerm    |-> lastAcceptedTerm[n],
                         laVersion |-> lastAcceptedVersion[n]]
       IN
       \* bumping the term resets all per-term election/publication state
       /\ currentTerm' = [currentTerm EXCEPT ![n] = t]
       /\ lastPublishedVersion' = [lastPublishedVersion EXCEPT ![n] = 0]
       /\ lastPublishedConfiguration' = [lastPublishedConfiguration EXCEPT ![n] = lastAcceptedConfiguration[n]]
       /\ startedJoinSinceLastReboot' = [startedJoinSinceLastReboot EXCEPT ![n] = TRUE]
       /\ electionWon' = [electionWon EXCEPT ![n] = FALSE]
       /\ joinVotes' = [joinVotes EXCEPT ![n] = {}]
       /\ publishVotes' = [publishVotes EXCEPT ![n] = {}]
       /\ messages' = messages \cup { joinRequest }
       \* NOTE(review): tuple reconstructed -- extraction collapsed the
       \* original `UNCHANGED <<...>>` to `<>`.
       /\ UNCHANGED <<descendant, initialConfiguration, initialValue,
                      initialAcceptedVersion, lastCommittedConfiguration,
                      lastAcceptedTerm, lastAcceptedVersion, lastAcceptedValue,
                      lastAcceptedConfiguration>>
\* node n handles a join request and checks if it has received enough joins (= votes)
\* for its term to be elected as master
HandleJoin(n, m) ==
    /\ m.method = Join
    /\ m.term = currentTerm[n]
    /\ startedJoinSinceLastReboot[n]
    /\ \/ m.laTerm < lastAcceptedTerm[n]
       \/ /\ m.laTerm = lastAcceptedTerm[n]
          /\ m.laVersion <= lastAcceptedVersion[n]
    /\ lastAcceptedConfiguration[n] /= {} \* must be bootstrapped
    /\ joinVotes' = [joinVotes EXCEPT ![n] = @ \cup { m.source }]
    /\ electionWon' = [electionWon EXCEPT ![n] = IsElectionQuorum(n, joinVotes'[n])]
    /\ IF electionWon[n] = FALSE /\ electionWon'[n]
       THEN
            \* initiating publish version with last accepted version to enable client requests
            /\ lastPublishedVersion' = [lastPublishedVersion EXCEPT ![n] = lastAcceptedVersion[n]]
       ELSE
            UNCHANGED <<lastPublishedVersion>>
    \* NOTE(review): both UNCHANGED tuples reconstructed -- extraction had
    \* collapsed them to `<>`; this lists every variable not assigned above.
    /\ UNCHANGED <<messages, descendant, initialConfiguration, initialValue,
                   initialAcceptedVersion, currentTerm,
                   lastCommittedConfiguration, lastAcceptedTerm,
                   lastAcceptedVersion, lastAcceptedValue,
                   lastAcceptedConfiguration, startedJoinSinceLastReboot,
                   lastPublishedConfiguration, publishVotes>>
\* client causes a cluster state change val with configuration cfg
HandleClientValue(n, t, v, val, cfg) ==
    /\ electionWon[n]
    /\ lastPublishedVersion[n] = lastAcceptedVersion[n] \* means we have the last published value / config (useful for CAS operations, where we need to read the previous value first)
    /\ t = currentTerm[n]
    /\ v > lastPublishedVersion[n]
    /\ cfg /= lastAcceptedConfiguration[n] => lastCommittedConfiguration[n] = lastAcceptedConfiguration[n] \* only allow reconfiguration if there is not already a reconfiguration in progress
    /\ IsQuorum(joinVotes[n], cfg) \* only allow reconfiguration if we have a quorum of (join) votes for the new config
    /\ LET
         publishRequests == { [method   |-> PublishRequest,
                               source   |-> n,
                               dest     |-> ns,
                               term     |-> t,
                               version  |-> v,
                               value    |-> val,
                               config   |-> cfg,
                               commConf |-> lastCommittedConfiguration[n]] : ns \in Nodes }
         \* record the (term, version) edge in the descendant relation,
         \* keeping it transitively closed
         newEntry == [prevT |-> lastAcceptedTerm[n],
                      prevV |-> lastAcceptedVersion[n],
                      nextT |-> t,
                      nextV |-> v]
         matchingElems == { e \in descendant :
                              /\ e.nextT = newEntry.prevT
                              /\ e.nextV = newEntry.prevV }
         newTransitiveElems == { [prevT |-> e.prevT,
                                  prevV |-> e.prevV,
                                  nextT |-> newEntry.nextT,
                                  nextV |-> newEntry.nextV] : e \in matchingElems }
       IN
       /\ descendant' = descendant \cup {newEntry} \cup newTransitiveElems
       /\ lastPublishedVersion' = [lastPublishedVersion EXCEPT ![n] = v]
       /\ lastPublishedConfiguration' = [lastPublishedConfiguration EXCEPT ![n] = cfg]
       /\ publishVotes' = [publishVotes EXCEPT ![n] = {}] \* publishVotes are only counted per publish version
       /\ messages' = messages \cup publishRequests
       \* NOTE(review): tuple reconstructed -- extraction collapsed the
       \* original `UNCHANGED <<...>>` to `<>`.
       /\ UNCHANGED <<initialConfiguration, initialValue,
                      initialAcceptedVersion, currentTerm,
                      lastCommittedConfiguration, lastAcceptedTerm,
                      lastAcceptedVersion, lastAcceptedValue,
                      lastAcceptedConfiguration, joinVotes,
                      startedJoinSinceLastReboot, electionWon>>
\* handle publish request m on node n
HandlePublishRequest(n, m) ==
    /\ m.method = PublishRequest
    /\ m.term = currentTerm[n]
    /\ (m.term = lastAcceptedTerm[n]) => (m.version > lastAcceptedVersion[n])
    /\ lastAcceptedTerm' = [lastAcceptedTerm EXCEPT ![n] = m.term]
    /\ lastAcceptedVersion' = [lastAcceptedVersion EXCEPT ![n] = m.version]
    /\ lastAcceptedValue' = [lastAcceptedValue EXCEPT ![n] = m.value]
    /\ lastAcceptedConfiguration' = [lastAcceptedConfiguration EXCEPT ![n] = m.config]
    /\ lastCommittedConfiguration' = [lastCommittedConfiguration EXCEPT ![n] = m.commConf]
    /\ LET
         response == [method  |-> PublishResponse,
                      source  |-> n,
                      dest    |-> m.source,
                      term    |-> m.term,
                      version |-> m.version]
       IN
       /\ messages' = messages \cup {response}
       \* NOTE(review): tuple reconstructed -- extraction collapsed the
       \* original `UNCHANGED <<...>>` to `<>`.
       /\ UNCHANGED <<descendant, initialConfiguration, initialValue,
                      initialAcceptedVersion, currentTerm, joinVotes,
                      startedJoinSinceLastReboot, electionWon,
                      lastPublishedVersion, lastPublishedConfiguration,
                      publishVotes>>
\* node n commits a change
\* The elected master n records a publish vote from m.source for the currently
\* published version; once the votes form a publish quorum it broadcasts Commit
\* messages to all nodes. Note messages are only ever added, never consumed, so
\* duplicate/ reordered delivery is modelled implicitly.
HandlePublishResponse(n, m) ==
/\ m.method = PublishResponse
/\ electionWon[n]
/\ m.term = currentTerm[n]
/\ m.version = lastPublishedVersion[n]
/\ publishVotes' = [publishVotes EXCEPT ![n] = @ \cup {m.source}]
/\ IF
IsPublishQuorum(n, publishVotes'[n]) \* quorum test uses the updated vote set (publishVotes')
THEN
LET
commitRequests == { [method |-> Commit,
source |-> n,
dest |-> ns,
term |-> currentTerm[n],
version |-> lastPublishedVersion[n]] : ns \in Nodes }
IN
/\ messages' = messages \cup commitRequests
ELSE
UNCHANGED <> \* NOTE(review): tuple contents stripped by extraction (presumably <<messages>>) -- confirm against upstream
/\ UNCHANGED <> \* NOTE(review): tuple contents stripped by extraction -- restore from upstream spec
\* apply committed configuration to node n
\* A Commit is applied only when it matches the node's current term and the exact
\* (term, version) it last accepted; committing promotes the accepted configuration
\* to the committed configuration.
HandleCommit(n, m) ==
/\ m.method = Commit
/\ m.term = currentTerm[n]
/\ m.term = lastAcceptedTerm[n]
/\ m.version = lastAcceptedVersion[n]
/\ (electionWon[n] => lastAcceptedVersion[n] = lastPublishedVersion[n]) \* a master may only commit its own latest publication
/\ lastCommittedConfiguration' = [lastCommittedConfiguration EXCEPT ![n] = lastAcceptedConfiguration[n]]
/\ UNCHANGED <> \* NOTE(review): variable tuple inside << >> stripped by extraction -- restore from upstream spec
\* crash/restart node n (loses ephemeral state)
\* Election state (joinVotes, electionWon, publishVotes) and the published version
\* are reset; the published configuration falls back to the last accepted one.
\* Persistent state (currentTerm, lastAccepted*) is presumably carried in the
\* UNCHANGED tuple below -- confirm against the upstream spec.
RestartNode(n) ==
/\ joinVotes' = [joinVotes EXCEPT ![n] = {}]
/\ startedJoinSinceLastReboot' = [startedJoinSinceLastReboot EXCEPT ![n] = FALSE]
/\ electionWon' = [electionWon EXCEPT ![n] = FALSE]
/\ lastPublishedVersion' = [lastPublishedVersion EXCEPT ![n] = 0]
/\ lastPublishedConfiguration' = [lastPublishedConfiguration EXCEPT ![n] = lastAcceptedConfiguration[n]]
/\ publishVotes' = [publishVotes EXCEPT ![n] = {}]
/\ UNCHANGED <> \* NOTE(review): variable tuple inside << >> stripped by extraction -- restore from upstream spec
\* next-step relation
\* Any enabled action of any node may fire; message-handling actions deliver an
\* already-sent message to its destination (m.dest). SetInitialState, HandleStartJoin
\* and HandleJoin are defined earlier in the module.
Next ==
\/ \E n \in Nodes : SetInitialState(n)
\/ \E n, nm \in Nodes : \E t \in Terms : HandleStartJoin(n, nm, t)
\/ \E m \in messages : HandleJoin(m.dest, m)
\/ \E n \in Nodes : \E t \in Terms : \E v \in Versions : \E val \in Values : \E vs \in ValidConfigs : HandleClientValue(n, t, v, val, vs)
\/ \E m \in messages : HandlePublishRequest(m.dest, m)
\/ \E m \in messages : HandlePublishResponse(m.dest, m)
\/ \E m \in messages : HandleCommit(m.dest, m)
\/ \E n \in Nodes : RestartNode(n)
----
\* Invariants
\* Per-node consistency of cached/derived state: accepted term never exceeds the
\* current term; electionWon is exactly the cached quorum test on joinVotes; the
\* published version tracks the accepted version only while master; publish votes
\* exist only on a master.
SingleNodeInvariant ==
\A n \in Nodes :
/\ lastAcceptedTerm[n] <= currentTerm[n]
/\ electionWon[n] = IsElectionQuorum(n, joinVotes[n]) \* cached value is consistent
/\ IF electionWon[n] THEN lastPublishedVersion[n] >= lastAcceptedVersion[n] ELSE lastPublishedVersion[n] = 0
/\ electionWon[n] => startedJoinSinceLastReboot[n]
/\ publishVotes[n] /= {} => electionWon[n]
\* At most one master per term: any two PublishRequests in the same term come from
\* the same source node.
OneMasterPerTerm ==
\A m1, m2 \in messages:
/\ m1.method = PublishRequest
/\ m2.method = PublishRequest
/\ m1.term = m2.term
=> m1.source = m2.source
\* Log matching: PublishRequests that agree on (term, version) carry the same value.
LogMatching ==
\A m1, m2 \in messages:
/\ m1.method = PublishRequest
/\ m2.method = PublishRequest
/\ m1.term = m2.term
/\ m1.version = m2.version
=> m1.value = m2.value
\* Predicate (not an invariant): a PublishRequest mp is committed iff some Commit
\* message matches its term and version.
CommittedPublishRequest(mp) ==
/\ mp.method = PublishRequest
/\ \E mc \in messages:
/\ mc.method = Commit
/\ mp.term = mc.term
/\ mp.version = mc.version
\* Every descendant edge moves weakly forward in term and strictly forward in version.
DescendantRelationIsStrictlyOrdered ==
\A d \in descendant:
/\ d.prevT <= d.nextT
/\ d.prevV < d.nextV
\* The descendant relation is transitively closed: composable edges imply the
\* composed edge is also present.
DescendantRelationIsTransitive ==
\A d1, d2 \in descendant:
d1.nextT = d2.prevT /\ d1.nextV = d2.prevV
=> [prevT |-> d1.prevT, prevV |-> d1.prevV, nextT |-> d2.nextT, nextV |-> d2.nextV] \in descendant
\* Any PublishRequest at a later (term, version) than a committed one descends from it.
NewerOpsBasedOnOlderCommittedOps ==
\A m1, m2 \in messages :
/\ CommittedPublishRequest(m1)
/\ m2.method = PublishRequest
/\ m2.term >= m1.term
/\ m2.version > m1.version
=> [prevT |-> m1.term, prevV |-> m1.version, nextT |-> m2.term, nextV |-> m2.version] \in descendant
\* main invariant (follows from NewerOpsBasedOnOlderCommittedOps):
\* any two distinct committed publish requests are related by the descendant
\* relation in one direction or the other, i.e. committed values form a chain.
CommittedValuesDescendantsFromCommittedValues ==
\A m1, m2 \in messages :
/\ CommittedPublishRequest(m1)
/\ CommittedPublishRequest(m2)
/\ \/ m1.term /= m2.term
\/ m1.version /= m2.version
=>
\/ [prevT |-> m1.term, prevV |-> m1.version, nextT |-> m2.term, nextV |-> m2.version] \in descendant
\/ [prevT |-> m2.term, prevV |-> m2.version, nextT |-> m1.term, nextV |-> m1.version] \in descendant
\* Every committed publish request descends (in term 0) from some initial accepted
\* version v that a quorum of the initial configuration is at-or-below.
CommittedValuesDescendantsFromInitialValue ==
\E v \in InitialVersions :
/\ \E n \in Nodes : v = initialAcceptedVersion[n]
/\ \E votes \in SUBSET(initialConfiguration) :
/\ IsQuorum(votes, initialConfiguration)
/\ \A n \in votes : initialAcceptedVersion[n] <= v
/\ \A m \in messages :
CommittedPublishRequest(m)
=>
[prevT |-> 0, prevV |-> v, nextT |-> m.term, nextV |-> m.version] \in descendant
\* Every Commit is backed by a quorum of PublishResponses, where the quorum is
\* measured against the committed configuration (commConf) carried by the matching
\* PublishRequest.
CommitHasQuorumVsPreviousCommittedConfiguration ==
\A mc \in messages: mc.method = Commit
=> (\A mprq \in messages: (/\ mprq.method = PublishRequest
/\ mprq.term = mc.term
/\ mprq.version = mc.version)
=> IsQuorum({mprs.source: mprs \in {mprs \in messages: /\ mprs.method = PublishResponse
/\ mprs.term = mprq.term
/\ mprs.version = mprq.version
}}, mprq.commConf))
\* Paxos-style P2b: a PublishRequest in a later term than any Commit must carry a
\* later version than that Commit.
P2bInvariant ==
\A mc \in messages: mc.method = Commit
=> (\A mprq \in messages: mprq.method = PublishRequest
=> (mprq.term > mc.term => mprq.version > mc.version))
\* State-exploration limits
\* Not an invariant: a TLC state constraint bounding published versions and the
\* message-set size to keep model checking tractable.
StateConstraint ==
/\ \A n \in Nodes: IF currentTerm[n] <= 1 THEN lastPublishedVersion[n] <= 2 ELSE lastPublishedVersion[n] <= 3
/\ Cardinality(messages) <= 15
====================================================================================================
================================================
FILE: ZenWithTerms/tla/ZenWithTerms.toolbox/.project
================================================
ZenWithTermstoolbox.builder.TLAParserBuildertoolbox.builder.PCalAlgorithmSearchingBuildertoolbox.natures.TLANatureZenWithTerms.tla1PARENT-1-PROJECT_LOC/ZenWithTerms.tla
================================================
FILE: ZenWithTerms/tla/ZenWithTerms.toolbox/.settings/org.lamport.tla.toolbox.prefs
================================================
ProjectRootFile=PARENT-1-PROJECT_LOC/ZenWithTerms.tla
eclipse.preferences.version=1
================================================
FILE: ZenWithTerms/tla/ZenWithTerms.toolbox/ZenWithTerms___model.launch
================================================
================================================
FILE: cluster/isabelle/Implementation.thy
================================================
section \Implementation\
text \This section presents the implementation of the algorithm.\
theory Implementation
imports Preliminaries
begin
subsection \Protocol messages\
text \The
proven-safe core of the protocol works by sending messages as described here. The remainder of the
protocol may send other messages too, and may drop, reorder or duplicate any of these messages, but
must not send these messages itself to ensure safety. Another way of thinking of these messages is
to consider them as ``fire-and-forget'' RPC invocations that, on receipt, call some local method, maybe
update the receiving node's state, and maybe yield some further messages. The @{type nat} parameter to each
message refers to a slot number.\
(* Optional term: NO_TERM when a node has not yet accepted in the current slot,
   SomeTerm t otherwise. NOTE(review): throughout this file Isabelle symbols such
   as \<Rightarrow>, \<equiv> and \<open>...\<close> were stripped to bare
   backslashes by extraction -- restore from the upstream repository. *)
datatype TermOption = NO_TERM | SomeTerm Term
(* Linear order on TermOption: NO_TERM is the least element and SomeTerm lifts the
   order on Term. The instance proof discharges the linorder axioms. *)
instantiation TermOption :: linorder
begin
fun less_TermOption :: "TermOption \ TermOption \ bool"
where "t < NO_TERM = False"
| "NO_TERM < SomeTerm t = True"
| "SomeTerm t\<^sub>1 < SomeTerm t\<^sub>2 = (t\<^sub>1 < t\<^sub>2)"
definition less_eq_TermOption :: "TermOption \ TermOption \ bool"
where "(t\<^sub>1 :: TermOption) \ t\<^sub>2 \ t\<^sub>1 = t\<^sub>2 \ t\<^sub>1 < t\<^sub>2"
instance proof
fix x y z :: TermOption
show "(x < y) = (x \ y \ \ y \ x)" unfolding less_eq_TermOption_def apply auto
using less_TermOption.elims apply fastforce
by (metis less_TermOption.elims(2) less_TermOption.simps(3) less_not_sym)
show "x \ x" by (simp add: less_eq_TermOption_def)
show "x \ y \ y \ z \ x \ z" unfolding less_eq_TermOption_def apply auto
by (metis TermOption.distinct(1) TermOption.inject dual_order.strict_trans less_TermOption.elims(2) less_TermOption.elims(3))
show "x \ y \ y \ x \ x = y" unfolding less_eq_TermOption_def apply auto
using \(x < y) = (x \ y \ \ y \ x)\ less_eq_TermOption_def by blast
show "x \ y \ y \ x" unfolding less_eq_TermOption_def apply auto
by (metis TermOption.distinct(1) TermOption.inject less_TermOption.elims(3) neqE)
qed
end
(* Simplification rules for the TermOption order: NO_TERM is a bottom element and
   the order on SomeTerm reduces to the order on Term. *)
lemma NO_TERM_le [simp]: "NO_TERM \ t" by (cases t, simp_all add: less_eq_TermOption_def)
lemma le_NO_TERM [simp]: "(t \ NO_TERM) = (t = NO_TERM)" by (cases t, simp_all add: less_eq_TermOption_def)
lemma le_SomeTerm [simp]: "(SomeTerm t\<^sub>1 \ SomeTerm t\<^sub>2) = (t\<^sub>1 \ t\<^sub>2)" by (auto simp add: less_eq_TermOption_def)
(* The protocol's message vocabulary; prose descriptions of each constructor follow
   in the text blocks below. Slot and term parameters identify the consensus
   instance and election round respectively. *)
datatype Message
= StartJoin Term
| Vote Slot Term TermOption
| ClientValue Value
| PublishRequest Slot Term Value
| PublishResponse Slot Term
| ApplyCommit Slot Term
| CatchUpRequest
| CatchUpResponse Slot "Node set" ClusterState
| DiscardJoinVotes
| Reboot
text \Some prose descriptions of these messages follows, in order to give a bit more of an
intuitive understanding of their purposes.\
text \The message @{term "StartJoin t"} may be sent by any node to attempt to start a master
election in the given term @{term t}.\
text \The message @{term "Vote i t a"} may be sent by a node in response
to a @{term StartJoin} message. It indicates that the sender knows all committed values for slots
strictly below @{term i}, and that the sender will no longer vote (i.e. send an @{term
PublishResponse}) in any term prior to @{term t}. The field @{term a} is either @{term
NO_TERM} or @{term "SomeTerm t'"}. In the former case this indicates that
the node has not yet sent any @{term PublishResponse} message in slot @{term i}, and in the latter
case it indicates that the largest term in which it has previously sent an @{term PublishResponse}
message is @{term t'}. All
nodes must avoid sending a @{term Vote} message to two different masters in the same term.\
text \The message @{term "ClientValue x"} may be sent by any node and indicates an attempt to
reach consensus on the value @{term x}.\
text \The message @{term "PublishRequest i t v"} may be sent by the elected master of term
@{term t} to request the other master-eligible nodes to vote for value @{term v} to be committed in
slot @{term i}.\
text \The message @{term "PublishResponse i t"} may be sent by node in response to
the corresponding @{term PublishRequest} message, indicating that the sender votes for the value
proposed by the master of term @{term t} to be committed in slot @{term i}.\
text \The message @{term "ApplyCommit i t"} indicates that the value proposed by the master of
term @{term t} in slot @{term i} received a quorum of votes and is therefore committed.\
text \The message @{term Reboot} may be sent by any node to represent the restart of a node, which
loses any ephemeral state.\
text \The abstract model of Zen keeps track of the set of all messages that have ever been
sent, and asserts that this set obeys certain invariants, listed below. Further below, it will be
shown that these invariants imply that each slot obeys the @{term oneSlot} invariants above and
hence that each slot cannot see inconsistent committed values.\
(* A message is routed either to all nodes or to a single node; RoutedMessage pairs
   a payload with its sender and destination. *)
datatype Destination = Broadcast | OneNode Node
record RoutedMessage =
sender :: Node
destination :: Destination
payload :: Message
text \Optional terms are compared using the linear order defined on @{type TermOption}
above, under which @{term NO_TERM} is less than any @{term "SomeTerm t"}.\
subsection \Node implementation\
text \Each node holds the following local data.\
(* The (term, value) pair a node last accepted in its current slot. *)
record TermValue =
tvTerm :: Term
tvValue :: Value
(* Per-node local state. The "committed state" fields are persistent across slot
   boundaries; lastAcceptedData is None when nothing has been accepted in the
   current slot; election and publish state is ephemeral. *)
record NodeData =
currentNode :: Node
currentTerm :: Term
(* committed state *)
firstUncommittedSlot :: Slot
currentVotingNodes :: "Node set"
currentClusterState :: ClusterState
(* accepted state *)
lastAcceptedData :: "TermValue option"
(* election state *)
joinVotes :: "Node set"
electionWon :: bool
(* publish state *)
publishPermitted :: bool
publishVotes :: "Node set"
(* Derived accessors over lastAcceptedData. lastAcceptedValue uses THE over the
   Some case and is therefore underspecified when lastAcceptedData nd = None;
   callers are expected to guard on lastAcceptedTerm first. isQuorum tests a node
   set against majorities of the current voting configuration. The [simp] lemmas
   record that these accessors ignore joinVotes/electionWon updates. *)
definition lastAcceptedValue :: "NodeData \ Value"
where "lastAcceptedValue nd \ tvValue (THE lad. lastAcceptedData nd = Some lad)"
definition lastAcceptedTerm :: "NodeData \ TermOption"
where "lastAcceptedTerm nd \ case lastAcceptedData nd of None \ NO_TERM | Some lad \ SomeTerm (tvTerm lad)"
definition isQuorum :: "NodeData \ Node set \ bool"
where "isQuorum nd q \ q \ majorities (currentVotingNodes nd)"
lemma lastAcceptedValue_joinVotes_update[simp]: "lastAcceptedValue (joinVotes_update f nd) = lastAcceptedValue nd" by (simp add: lastAcceptedValue_def)
lemma lastAcceptedTerm_joinVotes_update[simp]: "lastAcceptedTerm (joinVotes_update f nd) = lastAcceptedTerm nd" by (simp add: lastAcceptedTerm_def)
lemma lastAcceptedValue_electionWon_update[simp]: "lastAcceptedValue (electionWon_update f nd) = lastAcceptedValue nd" by (simp add: lastAcceptedValue_def)
lemma lastAcceptedTerm_electionWon_update[simp]: "lastAcceptedTerm (electionWon_update f nd) = lastAcceptedTerm nd" by (simp add: lastAcceptedTerm_def)
text \This method publishes a value via a @{term PublishRequest} message.\
(* Publish value x via a PublishRequest for the first uncommitted slot in the
   current term, but only on a master that has not already published in this slot
   (publishPermitted); publishing consumes the permission. Otherwise a no-op. *)
definition publishValue :: "Value \ NodeData \ (NodeData * Message option)"
where
"publishValue x nd \
if electionWon nd \ publishPermitted nd
then ( nd \ publishPermitted := False \
, Some (PublishRequest
(firstUncommittedSlot nd)
(currentTerm nd) x) )
else (nd, None)"
text \This method updates the node's current term (if necessary) and discards any data associated
with the previous term.\
(* Bump the node's term to t, discarding term-scoped state (votes, election result,
   publish permission); a no-op when t is not newer than the current term. *)
definition ensureCurrentTerm :: "Term \ NodeData \ NodeData"
where
"ensureCurrentTerm t nd \
if t \ currentTerm nd
then nd
else nd
\ joinVotes := {}
, currentTerm := t
, electionWon := False
, publishPermitted := True
, publishVotes := {} \"
text \This method updates the node's state on receipt of a vote (a @{term Vote}) in an election.\
(* Record an election vote from node s and recompute electionWon against the
   current voting configuration.
   NOTE(review): parameters i and a are not used in the body -- the acceptability
   checks on slot/previous-term happen in handleVote before this is called. *)
definition addElectionVote :: "Node \ Slot => TermOption \ NodeData \ NodeData"
where
"addElectionVote s i a nd \ let newVotes = insert s (joinVotes nd)
in nd \ joinVotes := newVotes
, electionWon := isQuorum nd newVotes \"
text \Clients request the cluster to achieve consensus on certain values using the @{term ClientValue}
message which is handled as follows.\
(* Publish a client-proposed value, but only when nothing has been accepted yet in
   the current slot (lastAcceptedTerm = NO_TERM); otherwise the previously accepted
   value takes priority and the request is dropped. *)
definition handleClientValue :: "Value \ NodeData \ (NodeData * Message option)"
where
"handleClientValue x nd \ if lastAcceptedTerm nd = NO_TERM then publishValue x nd else (nd, None)"
text \A @{term StartJoin} message is checked for acceptability and then handled by updating the
node's term and yielding a @{term Vote} message as follows.\
(* On a StartJoin for a strictly newer term, move to that term and reply with a
   Vote carrying our first uncommitted slot and last accepted term; otherwise no-op.
   Note the Vote reports lastAcceptedTerm of the ORIGINAL nd (same value after
   ensureCurrentTerm, which does not touch lastAcceptedData). *)
definition handleStartJoin :: "Term \ NodeData \ (NodeData * Message option)"
where
"handleStartJoin t nd \
if currentTerm nd < t
then ( ensureCurrentTerm t nd
, Some (Vote (firstUncommittedSlot nd)
t
(lastAcceptedTerm nd)))
else (nd, None)"
text \A @{term Vote} message is checked for acceptability and then handled as follows, perhaps
yielding a @{term PublishRequest} message.\
(* Accept a Vote for the current term if the voter is behind us (i < slot) or level
   with us and has accepted in no later term than we have. After counting the vote,
   re-publish our last accepted value (if any) so a newly elected master completes
   the in-flight slot; with no accepted value, wait for a client value instead. *)
definition handleVote :: "Node \ Slot \ Term \ TermOption \ NodeData \ (NodeData * Message option)"
where
"handleVote s i t a nd \
if t = currentTerm nd
\ (i < firstUncommittedSlot nd
\ (i = firstUncommittedSlot nd \ a \ lastAcceptedTerm nd))
then let nd1 = addElectionVote s i a nd
in (if lastAcceptedTerm nd = NO_TERM then (nd1, None) else publishValue (lastAcceptedValue nd1) nd1)
else (nd, None)"
text \A @{term PublishRequest} message is checked for acceptability and then handled as follows,
yielding a @{term PublishResponse} message.\
(* Accept a PublishRequest matching our slot and term: record (t, x) as the last
   accepted data and answer with a PublishResponse; otherwise no-op. *)
definition handlePublishRequest :: "Slot \ Term \ Value \ NodeData \ (NodeData * Message option)"
where
"handlePublishRequest i t x nd \
if i = firstUncommittedSlot nd
\ t = currentTerm nd
then ( nd \ lastAcceptedData := Some \ tvTerm = t, tvValue = x \ \
, Some (PublishResponse i t))
else (nd, None)"
text \This method sends an @{term ApplyCommit} message if a quorum of votes has been received.\
(* Emit an ApplyCommit for the current slot/term iff the collected publish votes
   form a quorum; the node state itself is unchanged. *)
definition commitIfQuorate :: "NodeData \ (NodeData * Message option)"
where
"commitIfQuorate nd = (nd, if isQuorum nd (publishVotes nd)
then Some (ApplyCommit (firstUncommittedSlot nd) (currentTerm nd)) else None)"
text \A @{term PublishResponse} message is checked for acceptability and handled as follows. If
this message, together with the previously-received messages, forms a quorum of votes then the
value is committed, yielding an @{term ApplyCommit} message.\
(* Count a PublishResponse from node s for the current slot and term, then commit
   if the updated vote set is quorate; otherwise no-op. *)
definition handlePublishResponse :: "Node \ Slot \ Term \ NodeData \ (NodeData * Message option)"
where
"handlePublishResponse s i t nd \
if i = firstUncommittedSlot nd \ t = currentTerm nd
then commitIfQuorate (nd \ publishVotes := insert s (publishVotes nd) \)
else (nd, None)"
text \This method updates the node's state when a value is committed.\
(* Apply the node's last accepted value to its committed state: NoOp does nothing,
   Reconfigure installs a new voting configuration (recomputing electionWon),
   ClusterStateDiff applies the diff to the cluster state.
   NOTE(review): relies on lastAcceptedValue, which is underspecified when nothing
   was accepted -- the caller (handleApplyCommit) guards on lastAcceptedTerm. *)
definition applyAcceptedValue :: "NodeData \ NodeData"
where
"applyAcceptedValue nd \ case lastAcceptedValue nd of
NoOp \ nd
| Reconfigure votingNodes \ nd
\ currentVotingNodes := set votingNodes
, electionWon := joinVotes nd \ majorities (set votingNodes) \
| ClusterStateDiff diff \ nd \ currentClusterState := diff (currentClusterState nd) \"
text \An @{term ApplyCommit} message is applied to the current node's state, updating its configuration
and \texttt{ClusterState} via the @{term applyAcceptedValue} method. It yields no messages.\
(* On an ApplyCommit matching the current slot and the term we accepted in, apply
   the accepted value and advance to the next slot, resetting per-slot state
   (accepted data, publish permission and votes); otherwise no-op. *)
definition handleApplyCommit :: "Slot \ Term \ NodeData \ NodeData"
where
"handleApplyCommit i t nd \
if i = firstUncommittedSlot nd \ lastAcceptedTerm nd = SomeTerm t
then (applyAcceptedValue nd)
\ firstUncommittedSlot := i + 1
, lastAcceptedData := None
, publishPermitted := True
, publishVotes := {} \
else nd"
(* Answer a CatchUpRequest with our committed state: first uncommitted slot, voting
   configuration and cluster state. Never changes node state. *)
definition handleCatchUpRequest :: "NodeData \ (NodeData * Message option)"
where
"handleCatchUpRequest nd = (nd, Some (CatchUpResponse (firstUncommittedSlot nd)
(currentVotingNodes nd) (currentClusterState nd)))"
(* Adopt a strictly newer committed state from a CatchUpResponse: jump to slot i
   with the given configuration and cluster state, discarding all per-slot and
   election state; a no-op when i is not ahead of us. *)
definition handleCatchUpResponse :: "Slot \ Node set \ ClusterState \ NodeData \ NodeData"
where
"handleCatchUpResponse i conf cs nd \
if firstUncommittedSlot nd < i
then nd \ firstUncommittedSlot := i
, publishPermitted := True
, publishVotes := {}
, currentVotingNodes := conf
, currentClusterState := cs
, lastAcceptedData := None
, joinVotes := {}
, electionWon := False \
else nd"
text \A @{term Reboot} message simulates the effect of a reboot, discarding any ephemeral state but
preserving the persistent state. It yields no messages.\
(* Simulate a reboot: rebuild the record keeping only persistent state (node id,
   term, committed state, accepted data) and clearing the ephemeral election and
   publish state. publishPermitted restarts as False, matching initialNodeState. *)
definition handleReboot :: "NodeData \ NodeData"
where
"handleReboot nd \
\ currentNode = currentNode nd
, currentTerm = currentTerm nd
, firstUncommittedSlot = firstUncommittedSlot nd
, currentVotingNodes = currentVotingNodes nd
, currentClusterState = currentClusterState nd
, lastAcceptedData = lastAcceptedData nd
, joinVotes = {}
, electionWon = False
, publishPermitted = False
, publishVotes = {} \"
text \A @{term DiscardJoinVotes} message discards the votes received by a node. It yields
no messages.\
(* Drop all received join votes and the cached election result. *)
definition handleDiscardJoinVotes :: "NodeData \ NodeData"
where
"handleDiscardJoinVotes nd \ nd \ electionWon := False, joinVotes := {} \"
text \This function dispatches incoming messages to the appropriate handler method, and
routes any responses to the appropriate places. In particular, @{term Vote} messages
(sent by the @{term handleStartJoin} method) and
@{term PublishResponse} messages (sent by the @{term handlePublishRequest} method) are
only sent to a single node, whereas all other responses are broadcast to all nodes.\
(* Dispatch an incoming RoutedMessage to its handler and route the reply: messages
   not addressed to this node (neither Broadcast nor our own id) are dropped.
   StartJoin/PublishRequest/CatchUpRequest replies go back to the sender only;
   Vote/ClientValue/PublishResponse replies are broadcast; the remaining handlers
   yield no messages. *)
definition ProcessMessage :: "NodeData \ RoutedMessage \ (NodeData * RoutedMessage option)"
where
"ProcessMessage nd msg \
let respondTo =
(\ d (nd, mmsg). case mmsg of
None \ (nd, None)
| Some msg \ (nd,
Some \ sender = currentNode nd, destination = d,
payload = msg \));
respondToSender = respondTo (OneNode (sender msg));
respondToAll = respondTo Broadcast
in
if destination msg \ { Broadcast, OneNode (currentNode nd) }
then case payload msg of
StartJoin t
\ respondToSender (handleStartJoin t nd)
| Vote i t a
\ respondToAll (handleVote (sender msg) i t a nd)
| ClientValue x
\ respondToAll (handleClientValue x nd)
| PublishRequest i t x
\ respondToSender (handlePublishRequest i t x nd)
| PublishResponse i t
\ respondToAll (handlePublishResponse (sender msg) i t nd)
| ApplyCommit i t
\ (handleApplyCommit i t nd, None)
| CatchUpRequest
\ respondToSender (handleCatchUpRequest nd)
| CatchUpResponse i conf cs
\ (handleCatchUpResponse i conf cs nd, None)
| DiscardJoinVotes
\ (handleDiscardJoinVotes nd, None)
| Reboot
\ (handleReboot nd, None)
else (nd, None)"
text \Nodes are initialised to this state. The data required is the initial configuration, @{term Q\<^sub>0}
and the initial \texttt{ClusterState}, here shown as @{term "ClusterState 0"}.\
(* Initial state of node n: term 0, slot 0, initial configuration V0 and cluster
   state CS0, nothing accepted, no votes, not master. See the note below on why
   publishPermitted starts False. *)
definition initialNodeState :: "Node \ NodeData"
where "initialNodeState n =
\ currentNode = n
, currentTerm = 0
, firstUncommittedSlot = 0
, currentVotingNodes = V\<^sub>0
, currentClusterState = CS\<^sub>0
, lastAcceptedData = None
, joinVotes = {}
, electionWon = False
, publishPermitted = False
, publishVotes = {} \"
(* Note: publishPermitted could be True initially, but in the actual implementation we call the
same constructor whether we're starting up from afresh or recovering from a reboot, and the value
is really unimportant as we need to run an election in a new term before becoming master anyway,
so it's hard to justify putting any effort into calculating different values for these two cases.
Instead just set it to False initially.*)
end
================================================
FILE: cluster/isabelle/Monadic.thy
================================================
theory Monadic
imports Implementation "~~/src/HOL/Library/Monad_Syntax"
begin
(* A state-and-writer monad over NodeData: an Action transforms the node state,
   accumulates a list of outgoing RoutedMessages, and yields either a Success
   value or an Exception. *)
datatype Exception = IllegalArgumentException
datatype ('e,'a) Result = Success 'a | Exception 'e
datatype 'a Action = Action "NodeData \ (NodeData * RoutedMessage list * (Exception,'a) Result)"
(* runM unwraps an Action to its underlying function; return injects a pure value
   (no state change, no messages). The lemmas let runM compute and show Actions are
   determined by their behaviour. *)
definition runM :: "'a Action \ NodeData \ (NodeData * RoutedMessage list * (Exception,'a) Result)"
where "runM ma \ case ma of Action unwrapped_ma \ unwrapped_ma"
lemma runM_Action[simp]: "runM (Action f) = f" by (simp add: runM_def)
lemma runM_inject[intro]: "(\nd. runM ma nd = runM mb nd) \ ma = mb" by (cases ma, cases mb, auto simp add: runM_def)
definition return :: "'a \ 'a Action" where "return a \ Action (\ nd. (nd, [], Success a))"
lemma runM_return[simp]: "runM (return a) nd = (nd, [], Success a)" unfolding runM_def return_def by simp
(* Monadic bind: run ma; on Exception short-circuit (keeping the messages emitted
   so far), on Success feed the value to mf and append the second action's
   messages. adhoc_overloading enables do-notation via Monad_Syntax. *)
definition Action_bind :: "'a Action \ ('a \ 'b Action) \ 'b Action"
where "Action_bind ma mf \ Action (\ nd0. case runM ma nd0 of
(nd1, msgs1, result1) \ (case result1 of
Exception e \ (nd1, msgs1, Exception e)
| Success a \ (case runM (mf a) nd1 of
(nd2, msgs2, result2) \ (nd2, msgs1 @ msgs2, result2))))"
adhoc_overloading bind Action_bind
lemma runM_bind: "runM (a \ f) nd0 = (case runM a nd0 of (nd1, msgs1, result1) \ (case result1 of Exception e \ (nd1, msgs1, Exception e) | Success b \