Repository: Steamgjk/Nezha
Branch: main
Commit: 8db31f04af48
Files: 78
Total size: 416.5 KB
Directory structure:
gitextract_rknzum1x/
├── .github/
│ └── workflows/
│ └── build.yaml
├── .gitignore
├── .vscode/
│ └── settings.json
├── README.md
├── WORKSPACE
├── client/
│ ├── BUILD
│ ├── client.cc
│ ├── client.h
│ ├── client_config.h
│ └── client_run.cc
├── configs/
│ ├── dist/
│ │ ├── nezha-client-config.yaml
│ │ ├── nezha-proxy-config.yaml
│ │ ├── nezha-replica-config-0.yaml
│ │ ├── nezha-replica-config-1.yaml
│ │ ├── nezha-replica-config-2.yaml
│ │ └── nezha-replica-config.yaml
│ ├── local/
│ │ ├── nezha-client-config.yaml
│ │ ├── nezha-proxy-config.yaml
│ │ ├── nezha-replica-config-0.yaml
│ │ ├── nezha-replica-config-1.yaml
│ │ └── nezha-replica-config-2.yaml
│ ├── nezha-client-config-template.yaml
│ ├── nezha-proxy-config-template.yaml
│ └── nezha-replica-config-template.yaml
├── docs/
│ ├── Nezha.tla
│ ├── demo.md
│ └── tla-intro.md
├── external/
│ ├── gogoprotobuf.BUILD
│ └── googleapi.BUILD
├── lib/
│ ├── BUILD
│ ├── Rules.mk
│ ├── address.cc
│ ├── address.h
│ ├── common_struct.h
│ ├── common_type.h
│ ├── endpoint.cc
│ ├── endpoint.h
│ ├── message_handler.h
│ ├── message_type.cc
│ ├── message_type.h
│ ├── timer.h
│ ├── udp_socket_endpoint.cc
│ ├── udp_socket_endpoint.h
│ ├── utils.cc
│ ├── utils.h
│ └── zipfian.h
├── license.md
├── micro-bench/
│ ├── BUILD
│ ├── analysis.cc
│ ├── bench_receiver.cc
│ ├── bench_sender.cc
│ └── launch_micro.py
├── proto/
│ ├── BUILD
│ └── nezha_proto.proto
├── proxy/
│ ├── BUILD
│ ├── proxy.cc
│ ├── proxy.h
│ ├── proxy_config.h
│ └── proxy_run.cc
├── replica/
│ ├── BUILD
│ ├── replica.cc
│ ├── replica.h
│ ├── replica_config.h
│ └── replica_run.cc
├── scripts/
│ ├── analysis.py
│ ├── launch.py
│ ├── local_test.sh
│ └── ttcs-agent.cfg
├── third_party/
│ ├── concurrentqueue/
│ │ └── BUILD.bazel
│ ├── glog/
│ │ ├── BUILD.bazel
│ │ ├── BUILD.glog
│ │ └── glog.bzl
│ ├── junction/
│ │ ├── BUILD.bazel
│ │ └── junction.patch
│ ├── libev/
│ │ └── BUILD.bazel
│ ├── openssl/
│ │ └── BUILD.bazel
│ └── turf/
│ └── BUILD.bazel
└── ttcs-agent.cfg
================================================
FILE CONTENTS
================================================
================================================
FILE: .github/workflows/build.yaml
================================================
# Bazel action to build & test specific targets.
name: Bazel build
on:
push:
branches: [main]
pull_request:
branches: [main]
jobs:
build:
name: Bazel build and run local test
runs-on: ubuntu-20.04
steps:
- uses: actions/checkout@v3
- name: Setup Bazel
run: |
sudo apt install -y apt-transport-https curl gnupg
curl -fsSL https://bazel.build/bazel-release.pub.gpg | gpg --dearmor >bazel-archive-keyring.gpg
sudo mv bazel-archive-keyring.gpg /usr/share/keyrings
echo "deb [arch=amd64 signed-by=/usr/share/keyrings/bazel-archive-keyring.gpg] https://storage.googleapis.com/bazel-apt stable jdk1.8" | sudo tee /etc/apt/sources.list.d/bazel.list
sudo apt update
sudo apt install -y bazel-5.2.0
sudo mv /usr/bin/bazel-5.2.0 /usr/bin/bazel
- name: Build
run: |
bazel build //replica/... //proxy/... //client/...
- name: Run local test
run: ./scripts/local_test.sh --github
================================================
FILE: .gitignore
================================================
/.obj
/.bin
/bazel-*
================================================
FILE: .vscode/settings.json
================================================
{
"C_Cpp.formatting": "clangFormat",
"C_Cpp.clang_format_fallbackStyle": "{BasedOnStyle: Google, IncludeBlocks: Preserve, DerivePointerAlignment: false, PointerAlignment: Left}",
"editor.formatOnSave": true,
"files.associations": {
"*.inc": "cpp",
"cctype": "cpp",
"clocale": "cpp",
"cmath": "cpp",
"cstdarg": "cpp",
"cstddef": "cpp",
"cstdio": "cpp",
"cstdlib": "cpp",
"cstring": "cpp",
"ctime": "cpp",
"cwchar": "cpp",
"cwctype": "cpp",
"array": "cpp",
"atomic": "cpp",
"bit": "cpp",
"*.tcc": "cpp",
"bitset": "cpp",
"chrono": "cpp",
"cinttypes": "cpp",
"condition_variable": "cpp",
"cstdint": "cpp",
"deque": "cpp",
"list": "cpp",
"map": "cpp",
"set": "cpp",
"unordered_map": "cpp",
"unordered_set": "cpp",
"vector": "cpp",
"exception": "cpp",
"algorithm": "cpp",
"functional": "cpp",
"iterator": "cpp",
"memory": "cpp",
"memory_resource": "cpp",
"numeric": "cpp",
"optional": "cpp",
"random": "cpp",
"ratio": "cpp",
"regex": "cpp",
"string": "cpp",
"string_view": "cpp",
"system_error": "cpp",
"tuple": "cpp",
"type_traits": "cpp",
"utility": "cpp",
"fstream": "cpp",
"initializer_list": "cpp",
"iomanip": "cpp",
"iosfwd": "cpp",
"iostream": "cpp",
"istream": "cpp",
"limits": "cpp",
"mutex": "cpp",
"new": "cpp",
"ostream": "cpp",
"shared_mutex": "cpp",
"sstream": "cpp",
"stdexcept": "cpp",
"streambuf": "cpp",
"thread": "cpp",
"typeinfo": "cpp",
"csignal": "cpp",
"any": "cpp",
"cfenv": "cpp",
"forward_list": "cpp",
"future": "cpp",
"scoped_allocator": "cpp",
"typeindex": "cpp",
"valarray": "cpp",
"variant": "cpp",
"hash_map": "cpp",
"hash_set": "cpp",
"*.ipp": "cpp",
"csetjmp": "cpp",
"strstream": "cpp",
"charconv": "cpp",
"codecvt": "cpp",
"complex": "cpp",
"source_location": "cpp",
"rope": "cpp",
"slist": "cpp"
}
}
================================================
FILE: README.md
================================================
# Nezha
----
Nezha (哪吒) is a legendary figure in Chinese mythology. Nezha has 3 heads and 6 arms, so he/she achieves much better fault tolerance than ordinary people :)
PS: We have created [[an FAQ page](https://github.com/Steamgjk/Nezha/wiki)]. Please take a look for a better understanding of Nezha.
## Paper and Presentation
Nezha: Deployable and High-Performance Consensus Using Synchronized Clocks (VLDB version) [[pdf](https://www.vldb.org/pvldb/vol16/p629-geng.pdf)]
Nezha: Deployable and High-Performance Consensus Using Synchronized Clocks(Technical Report) [[pdf](docs/Nezha-technical-report.pdf)]
An early presentation of Nezha was made at [Stanford Platform Lab Winter Review 2022](https://platformlab.stanford.edu/winter-review/platform-lab-winter-review-2022/) [[slides](https://platformlab.stanford.edu/wp-content/uploads/2022/03/Jinkun-Geng.pdf)]
If you find our work helpful to your research or project, we would greatly appreciate it if you could **add a star** to our repo and/or **cite our papers**. The BibTeX entries for the papers are given below.
```
@article{vldb23-nezha,
author = {Geng, Jinkun and Sivaraman, Anirudh and Prabhakar, Balaji and Rosenblum, Mendel},
title = {Nezha: Deployable and High-Performance Consensus Using Synchronized Clocks},
year = {2023},
journal = {Proceedings of the VLDB Endowment},
url = {https://www.vldb.org/pvldb/vol16/p629-geng.pdf},
publisher = {VLDB Endowment},
issn = {2150-8097},
volume = {16},
pages = {629-642},
numpages = {14}
}
@misc{nezha-tech,
author = {Geng, Jinkun and Sivaraman, Anirudh and Prabhakar, Balaji and Rosenblum, Mendel},
title = {Nezha: Deployable and High-Performance Consensus Using Synchronized Clocks},
doi = {10.48550/ARXIV.2206.03285},
url = {https://arxiv.org/abs/2206.03285},
publisher = {arXiv},
year = {2022},
}
```
## Clone Project
```
git clone --depth=1 https://github.com/Steamgjk/Nezha.git
```
## File Structure
The core part includes three modules (folders), i.e.,
- replica
- proxy
- client
Each module is composed of three files:
- a header file (e.g., replica.h),
- a source implementation file (replica.cc),
- a launching file (e.g., replica_run.cc).
Each process reads an independent YAML file (e.g., nezha-replica-config-0.yaml) to get its full configuration. The sample configuration files are placed in the `configs` folder.
## Install Bazel
We use Bazel 5.2.0 for building Nezha.
```
# Install bazel 5.2.0
# Please follow the instructions at https://bazel.build/install/ubuntu#install-on-ubuntu,
# or simply run the following commands
sudo apt install -y apt-transport-https curl gnupg
curl -fsSL https://bazel.build/bazel-release.pub.gpg | gpg --dearmor >bazel-archive-keyring.gpg
sudo mv bazel-archive-keyring.gpg /usr/share/keyrings
echo "deb [arch=amd64 signed-by=/usr/share/keyrings/bazel-archive-keyring.gpg] https://storage.googleapis.com/bazel-apt stable jdk1.8" | sudo tee /etc/apt/sources.list.d/bazel.list
sudo apt update
sudo apt install -y bazel-5.2.0
sudo mv /usr/bin/bazel-5.2.0 /usr/bin/bazel
bazel --version
```
## Build Nezha with Bazel
Since Bazel is becoming popular, we have migrated nezha from Makefile-based building system to the bazel building system. The bazel version in use is 5.2.0
```
cd Nezha && bazel build //replica/... //client/... //proxy/...
```
After building the project successfully, the executable files will be generated in the folder named `bazel-bin`
## Single-Machine Tests
Please refer to [the single-machine instructions](docs/demo.md) to run Nezha under various scenarios (view change, request commit, recovery from failure of replica).
## Multi-Machine Tests
We use [scripts/launch.py](scripts/launch.py) to conduct distributed tests across multiple machines. After the tests have completed, [scripts/analysis.py](scripts/analysis.py) is used to analyze the results to generate performance numbers. The current scripts only support Google Cloud Platform (GCP). They require GCP credentials to create and delete VMs on GCP.
## Important Configuration Parameters
### Replica
- ```replica-ips``` must include 2f+1 ips
- ```replica-id``` starts from 0 to 2f
- ```index-transfer-batch```, ```request-key-transfer-batch```, ```request-transfer-batch```. The values of the three batch parameters should be carefully chosen in order not to overflow the [maximum size of UDP packets](https://stackoverflow.com/questions/1098897/what-is-the-largest-safe-udp-packet-size-on-the-internet).
### Clients
- We support two types of clients, i.e., open-loop clients and closed-loop clients.
- Open-loop clients generate requests according to a Poisson process configured with a specific rate.
- Closed-loop clients use a sliding window protocol to keep a fixed number of requests in flight at any given time, releasing a new request when an old one is completed.
- ```is-openloop```: When this flag is true, --poission-rate becomes meaningful.
- ```skew-factor``` and key-number decide the workload, which further affects the commutativity optimization.
### Proxy
- ```shard-num``` decides how many threads will be launched. Each shard includes 1 forwarding thread, which forwards client requests to replicas, and 1 replying thread, which receives replies from replicas and performs the quorum check.
- ```max-owd``` is used in the clamping function to estimate one-way delay; more details are described in Sec 4 [Adaptive latency bound] of the paper.
## Performance Benchmark
Refer to [our paper](https://arxiv.org/pdf/2206.03285.pdf) for the relevant performance stats. Compared with the experimental version, we have refactored the codebase with some higher-performance libraries (e.g. libev instead of libevent) and data structures (e.g., ConcurrentMap and ConcurrentQueue). Besides, we have also conducted further optimization with the pipeline. The performance will be somewhat better than the original version used in the paper. New benchmark data will be updated soon.
## Authors and Acknowledgment
Nezha project is developed and maintained by [Jinkun Geng](https://steamgjk.github.io/) and his three supervisors, i.e., [Prof. Anirudh Sivaraman](https://cs.nyu.edu/~anirudh/), [Prof. Balaji Prabhakar](https://web.stanford.edu/~balaji/) and [Prof. Mendel Rosenblum](http://web.stanford.edu/~mendel/).
We are fortunate to get the help from many researchers during the development of Nezha. Below we list and acknowledge them according to the timeline.
[Dr. Shiyu Liu](https://web.stanford.edu/~shiyuliu/) and [Dr. Feiran Wang](https://www.linkedin.com/in/feiran-wang/) joined the discussion during the early design of Nezha. Feiran explained the details of CRaft and the related correctness properties. Shiyu explained the principles of Huygens and the other clock sync solutions.
[Prof. Dan Ports](https://drkp.net/), [Prof. Jialin Li](https://www.comp.nus.edu.sg/~lijl/) and [Dr. Ellis Michael](https://ellismichael.com/) provided helpful discussion related to Speculative Paxos and NOPaxos. Dan also gave us the pointer to crash vector and diskless recovery.
[Prof. Jinyang Li](http://www.news.cs.nyu.edu/~jinyang/) listened to our early presentation of Nezha, and gave some useful feedback.
[Prof. Seo Jin Park](https://seojinpark.net/) discussed with us about the definition of linearizability and other correctness properties. Seo Jin also provided some explanation about CURP.
[Prof. Zhaoguo Wang](https://ipads.se.sjtu.edu.cn/pub/members/zhaoguo_wang) shared with us his experience in testing Raft.
The [Derecho team](https://derecho-project.github.io/) (Prof. Ken Birman, Dr. Weijia Song, Dr. Sagar Jha, Dr. Lorenzo Rosa, etc) offered technical support and discussion during our measurement of Derecho.
The [ClockWork](https://www.clockwork.io/) Staff (Dr. Yilong Geng and Dr. Deepak Merugu) offered technical support in deploying Huygens. Dr. Deepak Merugu also gave suggestions on the coding-styles of Nezha codebase. Katie Gioioso provided feedback on Nezha design. Bhagirath Mehta participated in the single-machine test of Nezha.
[Prof. Eugene Wu](http://www.cs.columbia.edu/~ewu/) provided suggestions on the revision of Nezha paper.
[Prof. Aurojit Panda](https://cs.nyu.edu/~apanda/) discussed with us about Nezha's correctness during leader change. Aurojit reviewed our draft and offered some constructive suggestions on the revision.
The [Raft community](https://groups.google.com/u/1/g/raft-dev/c/SmnAvZMufB0) offered much insightful discussion for us. Many community members discussed with us and helped to justify our design decisions about Nezha.
## License
Please refer to [license.md](license.md)
## Future Plan
(1) Conduct more functionality and performance tests to make Nezha more robust and optimized
(2) Replace [the etcd backend for Kubernetes](https://learnk8s.io/etcd-kubernetes) to boost the performance of Kubernetes.
================================================
FILE: WORKSPACE
================================================
load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository", "new_git_repository")
http_archive(
name = "rules_proto",
sha256 = "e017528fd1c91c5a33f15493e3a398181a9e821a804eb7ff5acdd1d2d6c2b18d",
strip_prefix = "rules_proto-4.0.0-3.20.0",
urls = [
"https://github.com/bazelbuild/rules_proto/archive/refs/tags/4.0.0-3.20.0.tar.gz",
],
)
load("@rules_proto//proto:repositories.bzl", "rules_proto_dependencies", "rules_proto_toolchains")
rules_proto_dependencies()
rules_proto_toolchains()
http_archive(
name = "com_github_grpc_grpc",
sha256 = "9f387689b7fdf6c003fd90ef55853107f89a2121792146770df5486f0199f400",
urls = [
"https://github.com/grpc/grpc/archive/refs/tags/v1.42.0.zip",
],
strip_prefix = "grpc-1.42.0",
)
load("@com_github_grpc_grpc//bazel:grpc_deps.bzl", "grpc_deps")
grpc_deps()
load("@com_github_grpc_grpc//bazel:grpc_extra_deps.bzl", "grpc_extra_deps")
grpc_extra_deps()
http_archive(
name = "googleapi",
sha256 = "3ff2365822fb573cb1779ada5c2ac7899269cacd0836aef95ffe9d95779031f2",
url = "https://github.com/googleapis/googleapis/archive/refs/tags/common-protos-1_3_1.zip",
strip_prefix = "googleapis-common-protos-1_3_1/",
build_file="@//external:googleapi.BUILD",
)
# etcd v3.5.0 sources, built with the overlay BUILD file named below.
# NOTE(review): the repository tree lists only gogoprotobuf.BUILD and
# googleapi.BUILD under external/; there is no external/etcd.BUILD. Any target
# that actually resolves @etcd would presumably fail to fetch - confirm whether
# this archive is still needed or whether the BUILD file is missing.
http_archive(
name = "etcd",
sha256 = "580ce584dc7628efebb57f8c8240674918d334ad21e33186bbc5f6348f465bc1",
url = "https://github.com/etcd-io/etcd/archive/refs/tags/v3.5.0.zip",
strip_prefix = "etcd-3.5.0/",
build_file="@//external:etcd.BUILD",
)
http_archive(
name = "gogoprotobuf",
sha256 = "f89f8241af909ce3226562d135c25b28e656ae173337b3e58ede917aa26e1e3c",
url = "https://github.com/gogo/protobuf/archive/refs/tags/v1.3.2.zip",
strip_prefix = "protobuf-1.3.2/",
build_file="@//external:gogoprotobuf.BUILD",
)
git_repository(
name = "com_github_jbeder_yaml_cpp",
commit = "fcbb8193b94921e058be7b563aea053531e5b2d9", # 19-Aug-2023
remote = "https://github.com/jbeder/yaml-cpp.git",
shallow_since = "1692473776 -0400",
)
new_git_repository(
name = "com_github_cameron314_concurrentqueue",
build_file = "//third_party/concurrentqueue:BUILD.bazel",
commit = "6dd38b8a1dbaa7863aa907045f32308a56a6ff5d",
shallow_since = "1686439287 -0400",
remote = "https://github.com/cameron314/concurrentqueue.git",
)
new_git_repository(
name = "com_github_preshing_junction",
commit = "5ad3be7ce1d3f16b9f7ed6065bbfeacd2d629a08",
shallow_since = "1518982100 -0500",
patches = ["//third_party/junction:junction.patch"],
patch_args = ["-p1"],
build_file = "//third_party/junction:BUILD.bazel",
remote = "https://github.com/preshing/junction",
)
new_git_repository(
name = "com_github_preshing_turf",
commit = "9ae0d4b984fa95ed5f823274b39c87ee742f6650",
shallow_since = "1484317994 -0500" ,
build_file = "//third_party/turf:BUILD.bazel",
remote = "https://github.com/preshing/turf",
)
new_git_repository(
name = "com_github_enki_libev",
commit = "93823e6ca699df195a6c7b8bfa6006ec40ee0003",
shallow_since = "1463172876 -0700",
build_file = "//third_party/libev:BUILD.bazel",
remote = "https://github.com/enki/libev.git",
)
# Google gflags.
git_repository(
name = "com_github_gflags_gflags",
commit = "e171aa2d15ed9eb17054558e0b3a6a413bb01067", # 11-Nov-2018
remote = "https://github.com/gflags/gflags.git",
shallow_since = "1541971260 +0000",
)
# Google glog.
new_git_repository(
name = "com_github_google_glog",
build_file = "//third_party/glog:BUILD.glog",
commit = "ba8a9f6952d04d1403b97df24e6836227751454e", # 7-May-2019
remote = "https://github.com/google/glog.git",
# Shallow since doesn't work here for some weird reason. See
# https://github.com/bazelbuild/bazel/issues/10292
# shallow_since = "1557212520 +0000",
)
# Google protobuf.
git_repository(
name = "com_google_protobuf",
commit = "21027a27c4c2ec1000859ccbcfff46d83b16e1ed", # 21-Apr-2022, v3.20.1
remote = "https://github.com/protocolbuffers/protobuf",
shallow_since = "1650589240 +0000",
)
http_archive(
name = "rules_foreign_cc",
sha256 = "2a8000ce03dd9bb324bc9bb7f1f5d01debac406611f4d9fedd385192718804f0",
strip_prefix = "rules_foreign_cc-60813d57a0e99be1a009c1a0e9627cdbe81fcd19",
url = "https://github.com/bazelbuild/rules_foreign_cc/archive/60813d57a0e99be1a009c1a0e9627cdbe81fcd19.tar.gz",
)
load("@rules_foreign_cc//foreign_cc:repositories.bzl", "rules_foreign_cc_dependencies")
rules_foreign_cc_dependencies()
http_archive(
name = "openssl",
build_file = "//third_party/openssl:BUILD.bazel",
sha256 = "23011a5cc78e53d0dc98dfa608c51e72bcd350aa57df74c5d5574ba4ffb62e74",
strip_prefix = "openssl-OpenSSL_1_1_1d",
urls = ["https://github.com/openssl/openssl/archive/OpenSSL_1_1_1d.tar.gz"],
)
http_archive(
name = "com_github_nelhage_rules_boost",
url = "https://github.com/nelhage/rules_boost/archive/96e9b631f104b43a53c21c87b01ac538ad6f3b48.tar.gz",
strip_prefix = "rules_boost-96e9b631f104b43a53c21c87b01ac538ad6f3b48",
sha256 = "5ea00abc70cdf396a23fb53201db19ebce2837d28887a08544429d27783309ed",
)
load("@com_github_nelhage_rules_boost//:boost/boost.bzl", "boost_deps")
boost_deps()
================================================
FILE: client/BUILD
================================================
# Build targets for the Nezha client.
# NOTE(review): proto_library is loaded here but no proto_library rule is
# declared in this file - the load looks unused; confirm before removing.
load("@rules_proto//proto:defs.bzl", "proto_library")
# Header-only library exposing client_config.h; depends on yaml-cpp.
cc_library(
name = "client_config",
hdrs = ["client_config.h"],
deps = [
"@com_github_jbeder_yaml_cpp//:yaml-cpp",
],
)
# The Client class itself (client.h / client.cc), built on the generated
# protobuf messages, the zipfian generator and the shared utilities in //lib.
cc_library(
name = "client_class",
srcs = ["client.cc"],
hdrs = ["client.h"],
deps = [
"//proto:nezha_cc_proto",
"//lib:zipfian",
"//lib:utils",
":client_config",
],
)
# Executable built from client_run.cc on top of :client_class.
cc_binary(
name = "nezha_client",
srcs = ["client_run.cc"],
deps = [
":client_class",
],
)
================================================
FILE: client/client.cc
================================================
#include "client/client.h"
namespace nezha {
/**
 * Builds a client from the YAML configuration file at `configFile`.
 *
 * The constructor
 *  - parses the configuration and terminates the process (exit code 1) when
 *    parsing fails or when no proxy / proxy shard is configured,
 *  - creates the request endpoint, the reply handler and the monitor timer,
 *  - materialises one Address per (proxy, shard) pair in proxyAddrs_,
 *  - pre-generates the Poisson trace (open-loop clients only) and the zipfian
 *    key sequence, both seeded with the client id,
 *  - resets the request/commit counters.
 *
 * No thread is started here; see Run().
 *
 * NOTE(review): writeRatio_ is read by both submission threads but is not
 * assigned anywhere in this constructor (only clientConfig_.writeRatio is
 * logged). Confirm that it is initialised elsewhere, e.g. in the header.
 */
Client::Client(const std::string& configFile) {
  hop3s.reserve(500000);
  hop4s.reserve(500000);
  totals.reserve(500000);

  LOG(INFO) << "Loading config information from " << configFile;
  std::string error = clientConfig_.parseConfig(configFile);
  if (error != "") {
    LOG(ERROR) << "Error loading client config: " << error << " Exiting.";
    exit(1);
  }
  // The submission threads index proxyAddrs_[0] and take a modulo by
  // (#proxies * #shards); an empty topology would therefore be an
  // out-of-bounds access and a division by zero. Fail fast instead.
  if (clientConfig_.proxyIps.empty() || clientConfig_.proxyShardNum < 1) {
    LOG(ERROR) << "Error loading client config: at least one proxy ip and "
                  "one proxy shard are required. Exiting.";
    exit(1);
  }

  clientId_ = clientConfig_.clientId;
  LOG(INFO) << "clientId=" << clientId_;
  std::string clientIP = clientConfig_.clientIp;
  LOG(INFO) << "clientIP=" << clientIP;
  int requestPort = clientConfig_.requestPort;
  LOG(INFO) << "requestPort=" << requestPort;
  LOG(INFO) << "endPointType=" << clientConfig_.endpointType;

  requestEP_ =
      CreateEndpoint(clientConfig_.endpointType, clientIP, requestPort, true);
  // Every incoming message is forwarded to ReceiveReply() of this client.
  replyHandler_ = CreateMsgHandler(
      clientConfig_.endpointType,
      [](MessageHeader* msgHdr, char* msgBuffer, Address* sender, void* ctx) {
        static_cast<Client*>(ctx)->ReceiveReply(msgHdr, msgBuffer, sender);
      },
      this);
  // The monitor timer breaks the endpoint's event loop once running_ has been
  // cleared, which lets ProcessReplyTd return.
  monitorTimer_ = new Timer(
      [](void* ctx, void* receiverEP) {
        if (static_cast<Client*>(ctx)->running_ == false) {
          static_cast<Endpoint*>(receiverEP)->LoopBreak();
        }
      },
      10 /*Checks the status every 10ms*/, this);

  /** Fetch the addresses of all proxies and organize them as a
   * two-dimensional vector: proxyAddrs_[proxy][shard] */
  proxyAddrs_.resize(clientConfig_.proxyIps.size());
  for (uint32_t i = 0; i < proxyAddrs_.size(); i++) {
    proxyAddrs_[i].resize(clientConfig_.proxyShardNum);
    for (uint32_t j = 0; j < proxyAddrs_[i].size(); j++) {
      // Shard j of a proxy listens on proxyRequestPortBase + j.
      proxyAddrs_[i][j] = new Address(clientConfig_.proxyIps[i],
                                      clientConfig_.proxyRequestPortBase + j);
    }
  }

  /** If the client is an open-loop client, generate the Poisson trace for the
   * client (1000 samples, replayed cyclically by the submission thread) */
  if (clientConfig_.isOpenLoop) {
    poissonRate_ = clientConfig_.poissonRate;
    LOG(INFO) << "OpenLoop Client rate=" << poissonRate_;
    poissonTrace_.resize(1000, 0);
    std::default_random_engine generator(clientId_);  // clientId as the seed
    std::poisson_distribution distribution(poissonRate_);
    for (int i = 0; i < 1000; i++) {
      int reqNum = distribution(generator);
      poissonTrace_[i] = (reqNum < 0) ? 0 : reqNum;
    }
  }

  /** Generate zipfian workload */
  LOG(INFO) << "keyNum=" << clientConfig_.keyNum
            << "\tskewFactor=" << clientConfig_.skewFactor
            << "\twriteRatio=" << clientConfig_.writeRatio;
  zipfianKeys_.resize(1000000, 0);
  retryTimeoutUs_ = clientConfig_.requestRetryTimeUs;
  // With a single key every entry stays 0, so no sampling is needed.
  if (clientConfig_.keyNum > 1) {
    std::default_random_engine generator(clientId_);  // clientId as the seed
    zipfian_int_distribution zipfianDistribution(
        0, clientConfig_.keyNum - 1, clientConfig_.skewFactor);
    for (uint32_t i = 0; i < zipfianKeys_.size(); i++) {
      zipfianKeys_[i] = zipfianDistribution(generator);
    }
  }

  /** Initialize the counters; request ids start from 1 */
  committedReqId_ = 0;
  reclaimedReqId_ = 0;
  nextReqId_ = 1;
  retryNumber_ = 0;
  committedNum_ = 0;
  fastCommitNum_ = 0;
  fastWriteNum_ = 0;
}
/**
 * Marks the client as running, starts all worker threads and then blocks
 * until each of them has been joined.
 */
void Client::Run() {
  running_ = true;
  LaunchThreads();
  for (auto& [threadName, worker] : threadPool_) {
    LOG(INFO) << "Join " << threadName;
    worker->join();
    LOG(INFO) << "Join Complete " << threadName;
  }
  LOG(INFO) << "Run Terminated ";
}
/**
 * Spawns the worker threads and records them in threadPool_ by name: the
 * logging thread, the reply-processing thread, and exactly one submission
 * thread chosen by clientConfig_.isOpenLoop.
 */
void Client::LaunchThreads() {
  threadPool_["LogTd"] = new std::thread(&Client::LogTd, this);
  threadPool_["ProcessReplyTd"] =
      new std::thread(&Client::ProcessReplyTd, this);

  if (!clientConfig_.isOpenLoop) {
    LOG(INFO) << "ClosedLoop Client";
    threadPool_["CloseLoopSubmissionTd"] =
        new std::thread(&Client::CloseLoopSubmissionTd, this);
    return;
  }

  LOG(INFO) << "OpenLoop Client";
  threadPool_["OpenLoopSubmissionTd"] =
      new std::thread(&Client::OpenLoopSubmissionTd, this);
}
/** Thread body that receives replies: runs the event loop of requestEP_ and
 * returns once that loop exits. */
void Client::ProcessReplyTd() {
/** Register the message handler and the monitor timer on the request
 * endpoint. Afterwards this thread runs in an event-driven mode, i.e., when a
 * message arrives, the registered message handler is called */
requestEP_->RegisterMsgHandler(replyHandler_);
requestEP_->RegisterTimer(monitorTimer_);
LOG(INFO) << "Loop Run ";
// Presumably blocks until LoopBreak() is called; the monitor timer created in
// the constructor calls LoopBreak() once running_ is false.
requestEP_->LoopRun();
LOG(INFO) << "Loop Run Exit ";
}
/**
 * Handles one message delivered by the request endpoint.
 *
 * Only well-formed COMMIT_REPLY messages are considered. For each of them the
 * commit counters are bumped (fast replies and fast writes are counted
 * separately) and committedReqId_ is raised to the reply's request id if that
 * id is larger. If the request still has a recorded send time, a LogInfo
 * record is created and handed to LogTd through logQu_, and the send-time
 * entry is erased so the request is not logged again.
 *
 * @param msgHdr    header of the received message (type and length)
 * @param msgBuffer serialized message payload
 * @param sender    address of the sender (unused)
 */
void Client::ReceiveReply(MessageHeader* msgHdr, char* msgBuffer,
                          Address* sender) {
  if (msgHdr->msgLen < 0) {
    return;
  }
  if (msgHdr->msgType != MessageType::COMMIT_REPLY) {
    return;
  }
  Reply reply;
  if (!reply.ParseFromArray(msgBuffer, msgHdr->msgLen)) {
    return;
  }

  // Statistics: every parsed commit reply counts, duplicates included.
  committedNum_++;
  if (reply.replytype() == MessageType::FAST_REPLY) {
    fastCommitNum_++;
    if (reply.iswrite()) {
      fastWriteNum_++;
    }
  }

  const auto reqId = reply.reqid();
  if (committedReqId_ < reqId) {
    committedReqId_ = reqId;
  }

  // A non-zero send time means the request is still outstanding, i.e. this is
  // the first commit reply seen for it.
  const uint64_t sendTime = outstandingRequestSendTime_.get(reqId);
  if (sendTime == 0) {
    return;
  }

  /** Generate the log record and pass it to logQu_, which is drained by
   * LogTd; then mark the request as committed by erasing its send time. */
  const uint64_t recvTime = GetMicrosecondTimestamp();
  LogInfo* record = new LogInfo();
  lastCommittedReqId_ = reqId;
  *record = {reqId, sendTime, recvTime, reply.replytype()};
  outstandingRequestSendTime_.erase(reqId);
  logQu_.enqueue(record);
}
/**
 * Submission thread of an open-loop client.
 *
 * Time is divided into 10ms slots. For slot i the number of requests to send
 * is taken from poissonTrace_ (replayed cyclically); the requests of one slot
 * are spread evenly over the slot by busy-waiting. Each send goes to the next
 * (proxy, shard) address in round-robin order. A request waiting in retryQu_
 * takes precedence over creating a new one.
 *
 * The thread returns early when running_ is cleared or the deadline
 * (durationSec plus a 10 second grace period) has passed. After all slots are
 * done it waits for the deadline and then clears running_.
 */
void Client::OpenLoopSubmissionTd() {
  // 64-bit unsigned counter: a signed int would overflow (undefined
  // behaviour) after 2^31 sends on a long, high-rate run.
  uint64_t roundRobinIdx = 0;
  const uint64_t launchTime = GetMicrosecondTimestamp();
  // Widen before multiplying so that a long duration cannot overflow 32-bit
  // arithmetic.
  uint64_t endTime =
      launchTime + static_cast<uint64_t>(clientConfig_.durationSec) * 1000000;
  srandom(clientId_);
  endTime += 10 * 1000ul * 1000ul;
  LOG(INFO) << "Expected to end at " << endTime;

  // Poisson rate is ``10ms as one unit''
  for (uint32_t i = 0; i < clientConfig_.durationSec * 100; i++) {
    if (!running_) {
      return;
    }
    if (GetMicrosecondTimestamp() >= endTime) {
      // Client has executed long enough, should terminate
      LOG(INFO) << "Terminating soon...";
      running_ = false;
      return;
    }
    const uint32_t reqNum = poissonTrace_[i % poissonTrace_.size()];
    if (reqNum == 0) {
      // Nothing to send in this slot: just let the 10ms pass.
      usleep(10000);
      continue;
    }
    // Gap between two consecutive requests of this slot, in microseconds.
    const uint32_t intval = 10000 / reqNum;
    const uint64_t slotStart = GetMicrosecondTimestamp();
    for (uint32_t j = 0; j < reqNum; j++) {
      // Busy-wait until the j-th request of the slot is due.
      while (GetMicrosecondTimestamp() < slotStart + j * intval) {
      }
      // Pick the destination: consecutive sends walk over the proxies first
      // and over the shards second.
      const uint32_t mapIdx =
          roundRobinIdx % (proxyAddrs_.size() * proxyAddrs_[0].size());
      Address* roundRobinAddr = proxyAddrs_[mapIdx % proxyAddrs_.size()]
                                           [mapIdx / proxyAddrs_.size()];
      Request* request = NULL;
      if (retryQu_.try_dequeue(request)) {
        // Retry this request; refresh its send time
        requestEP_->SendMsgTo(*roundRobinAddr, *request,
                              MessageType::CLIENT_REQUEST);
        outstandingRequestSendTime_.assign(request->reqid(),
                                           GetMicrosecondTimestamp());
      } else {
        // Submit a new request
        request = new Request();
        request->set_clientid(clientId_);
        request->set_reqid(nextReqId_);
        request->set_iswrite(random() % 100 < 100 * writeRatio_);
        request->set_key(zipfianKeys_[nextReqId_ % zipfianKeys_.size()]);
        requestEP_->SendMsgTo(*roundRobinAddr, *request,
                              MessageType::CLIENT_REQUEST);
        // Keep the request so that it can be retried/reclaimed later.
        outstandingRequests_.assign(request->reqid(), request);
        outstandingRequestSendTime_.assign(request->reqid(),
                                           GetMicrosecondTimestamp());
        nextReqId_++;
      }
      roundRobinIdx++;
    }
  }

  LOG(INFO) << "Terminating soon... after "
            << (endTime - GetMicrosecondTimestamp()) * 1e-6 << " seconds";
  while (GetMicrosecondTimestamp() < endTime) {
    // Wait out the grace period before asking the other threads to stop
    usleep(1000);
  }
  running_ = false;
}
/**
 * Submission thread of a closed-loop client.
 *
 * A new request is submitted only when nextReqId_ == committedReqId_ + 1,
 * i.e. when every previously issued request id has been committed; otherwise
 * the thread resends a request from retryQu_ if one is available. Each send
 * goes to the next (proxy, shard) address in round-robin order.
 *
 * The thread clears running_ and returns once the deadline (durationSec plus
 * a 10 second grace period) has passed.
 *
 * NOTE(review): when running_ is cleared by someone else, this thread still
 * waits until the deadline before returning, unlike the open-loop thread,
 * which returns immediately. Confirm whether that is intended.
 */
void Client::CloseLoopSubmissionTd() {
  // 64-bit unsigned counter: a signed int would overflow (undefined
  // behaviour) after 2^31 sends.
  uint64_t roundRobinIdx = 0;
  const uint64_t launchTime = GetMicrosecondTimestamp();
  // Widen before multiplying so that a long duration cannot overflow 32-bit
  // arithmetic.
  uint64_t endTime =
      launchTime + static_cast<uint64_t>(clientConfig_.durationSec) * 1000000;
  endTime += 10 * 1000ul * 1000ul;
  LOG(INFO) << "Expected to end at " << endTime;
  // The read/write decision below draws from random(), whose seeding function
  // is srandom() (srand() seeds rand()).
  srandom(clientId_);

  while (running_) {
    if (GetMicrosecondTimestamp() >= endTime) {
      // Client has executed long enough, should terminate
      LOG(INFO) << "Terminating soon...";
      running_ = false;
      return;
    }
    // Pick the destination: consecutive sends walk over the proxies first
    // and over the shards second.
    const uint32_t mapIdx =
        roundRobinIdx % (proxyAddrs_.size() * proxyAddrs_[0].size());
    Address* roundRobinAddr =
        proxyAddrs_[mapIdx % proxyAddrs_.size()][mapIdx / proxyAddrs_.size()];
    Request* request = NULL;
    if (nextReqId_ == committedReqId_ + 1) {
      // Nothing in flight: submit a new request
      request = new Request();
      request->set_clientid(clientId_);
      request->set_reqid(nextReqId_);
      request->set_iswrite(random() % 100 < 100 * writeRatio_);
      request->set_key(zipfianKeys_[nextReqId_ % zipfianKeys_.size()]);
      requestEP_->SendMsgTo(*roundRobinAddr, *request,
                            MessageType::CLIENT_REQUEST);
      // Keep the request so that it can be retried/reclaimed later.
      outstandingRequests_.assign(request->reqid(), request);
      outstandingRequestSendTime_.assign(request->reqid(),
                                         GetMicrosecondTimestamp());
      nextReqId_++;
      roundRobinIdx++;
    } else if (retryQu_.try_dequeue(request)) {
      // Have some request to retry; refresh its send time
      requestEP_->SendMsgTo(*roundRobinAddr, *request,
                            MessageType::CLIENT_REQUEST);
      outstandingRequestSendTime_.assign(request->reqid(),
                                         GetMicrosecondTimestamp());
      roundRobinIdx++;
    }
  }

  LOG(INFO) << "Terminating soon... after "
            << (endTime - GetMicrosecondTimestamp()) * 1e-6 << " seconds";
  while (GetMicrosecondTimestamp() < endTime) {
    // Wait out the remaining time before returning
    usleep(1000);
  }
  running_ = false;
}
void Client::LogTd() {
LogInfo* log = NULL;
uint64_t startTime, endTime;
uint32_t lastSubmitteddReqId = 0;
uint32_t lastCountCommitedReq = 0;
uint32_t latencySample = 0;
std::ofstream ofs("Client-Stats-" + std::to_string(clientId_));
ofs << "ReqId,SendTime,CommitTime,CommitType" << std::endl;
startTime = GetMicrosecondTimestamp();
while (running_) {
endTime = GetMicrosecondTimestamp();
if (endTime - startTime >= 5000000) {
float duration = (endTime - startTime) * 1e-6;
uint32_t submittedReqNum = nextReqId_ - 1 - lastSubmitteddReqId;
uint32_t committedReqNum = committedNum_ - lastCountCommitedReq;
float submissionRate = submittedReqNum / duration;
float commitRate = committedReqNum / duration;
lastSubmitteddReqId = nextReqId_ - 1;
lastCountCommitedReq = committedNum_;
startTime = endTime;
LOG(INFO) << "endTime=" << endTime << "\t"
<< "committedNum_ = " << committedNum_ << "\t"
<< "logQuLen =" << logQu_.size_approx() << "\t"
<< "committedReqId_=" << committedReqId_ << "\t"
<< "nextReqId_=" << nextReqId_ << "\t"
<< "lastCommittedReqId_=" << lastCommittedReqId_ << "\t"
<< "submissionRate=" << submissionRate << " req/sec\t"
<< "commitRate=" << commitRate << " req/sec"
<< "\t"
<< "FastCommitRatio=" << fastCommitNum_ * 100.0 / committedNum_
<< "\t"
<< "latency(Sample)=" << latencySample << " us"
<< "\t"
<< "retryNum=" << retryNumber_;
ofs.flush();
}
if (logQu_.try_dequeue(log)) {
// LOG(INFO) << "committedReqId_=" << committedReqId_ << "\t" << "reqId="
// << log->reqId;
while (committedReqId_ + 1 <= log->reqId) {
if (outstandingRequestSendTime_.get(committedReqId_ + 1) == 0) {
// this reqId has also been committed (i.e. cannot find its footprint)
// advance committedReqId;
committedReqId_++;
} else {
break;
}
}
latencySample = log->commitTime - log->sendTime;
// log stats
ofs << log->toString() << std::endl;
delete log;
}
// // Check whether any requests need retry
// for (uint32_t reqId = committedReqId_ + 1; reqId < nextReqId_; reqId++) {
// uint64_t sendTime = outstandingRequestSendTime_.get(reqId);
// if (sendTime > 0) {
// // Find it
// if (GetMicrosecondTimestamp() - sendTime > retryTimeoutus_) {
// // timeout, should retry
// Request* request = outstandingRequests_.get(reqId);
// LOG(INFO) << "Timeout Retry " << request->reqid();
// outstandingRequestSendTime_.erase(reqId);
// retryQu_.enqueue(request);
// retryNumber_++;
// }
// }
// }
while (reclaimedReqId_ + 1000 < committedReqId_) {
// do not reclaim request too aggressive
// If we reclaim too aggressive, there can be some edge case of dangling
// request pointer
Request* request = outstandingRequests_.get(reclaimedReqId_);
if (request) {
outstandingRequests_.erase(request->reqid());
delete request;
}
reclaimedReqId_++;
}
}
LOG(INFO) << "The runtime have been terminated, we still need to dump "
<< logQu_.size_approx() << " Logs before exit";
uint32_t cnt = 0;
while (logQu_.try_dequeue(log)) {
// log stats
ofs << log->toString() << std::endl;
delete log;
cnt++;
if (cnt % 10000 == 0) {
LOG(INFO) << "Remaining Log Number " << logQu_.size_approx();
ofs.flush();
}
}
ofs.flush();
LOG(INFO) << "Dump Finished";
}
/** Announce the shutdown and clear running_, the flag polled by the worker
 * loops. */
void Client::Terminate() {
  LOG(INFO) << "Terminating...";
  running_.store(false);
}
/** Frees the thread objects held in threadPool_ and every request that is
 * still stored in outstandingRequests_ between reclaimedReqId_ and
 * nextReqId_. */
Client::~Client() {
  for (auto it = threadPool_.begin(); it != threadPool_.end(); ++it) {
    delete it->second;
  }
  // Walk the not-yet-reclaimed id range; ids without a stored request are
  // simply skipped.
  for (; reclaimedReqId_ <= nextReqId_; reclaimedReqId_++) {
    Request* pending = outstandingRequests_.get(reclaimedReqId_);
    if (pending == nullptr) {
      continue;
    }
    outstandingRequests_.erase(pending->reqid());
    delete pending;
  }
}
} // namespace nezha
================================================
FILE: client/client.h
================================================
#include
#include
#include
#include "client_config.h"
#include "lib/utils.h"
#include "lib/zipfian.h"
#include "proto/nezha_proto.pb.h"
namespace nezha {
using namespace nezha::proto;
/** LogInfo is used to dump some performance stats, which can be extended to
 * include more metrics */
struct LogInfo {
  uint32_t reqId;
  uint64_t sendTime;
  uint64_t commitTime;
  uint32_t commitType;

  /**
   * Renders the record as one CSV row: "reqId,sendTime,commitTime,commitType"
   * (no trailing newline). Declared const so that it can also be called
   * through const objects and const references.
   */
  std::string toString() const {
    return std::to_string(reqId) + "," + std::to_string(sendTime) + "," +
           std::to_string(commitTime) + "," + std::to_string(commitType);
  }
};
/**
 * Refer to client_run.cc: the runnable program only needs to instantiate a
 * client object with a configuration file. Then it calls the Run() method to
 * run and the Terminate() method to stop.
 */
class Client {
private:
/** All the configuration parameters for client are included in
 * clientConfig_ */
ClientConfig clientConfig_;
/** Each thread is given a unique name (key) and stored in the pool */
std::map threadPool_;
/** The endpoint used to submit requests to proxies */
Endpoint* requestEP_;
/** The message handler used to handle replies (from proxies) */
struct MessageHandler* replyHandler_;
/** The timer periodically monitors the status of the client, and breaks the
 * blocking endpoint when the client is about to terminate */
struct Timer* monitorTimer_;
/** Flag to Run/Terminate threads */
std::atomic running_;
/** Each client is assigned a unique id */
int clientId_;
/** Open-Loop submission related: the client's submission rate follows a
 * Poisson distribution. We use 10ms as the basic interval and generate random
 * numbers with reference to poissonRate_, stored in poissonTrace_. Then the
 * open-loop clients submit poissonTrace_[i] requests in the ith interval.
 *
 * Regarding the definition of open-loop and closed-loop submission, refer to
 * ``evaluation method`` para of Sec 7.1 in our paper
 * */
int poissonRate_;
/** The next requestId to be submitted */
std::atomic nextReqId_;
/** Requests whose requestId is less than or equal to committedReqId_ have
 * been committed */
std::atomic committedReqId_;
/** Requests whose requestId is less than or equal to reclaimedReqId_ have
 * been reclaimed (memory freed) */
std::atomic reclaimedReqId_;
/** Per-interval request counts for open-loop submission (see poissonRate_) */
std::vector poissonTrace_;
/** To communicate between OpenLoopSubmissionTd/CloseLoopSubmissionTd and
 * LogTd. The LogTd monitors the outstanding requests (i.e. which have been
 * submitted but have not been committed). If some request has not been
 * committed after a certain time, the LogTd will enqueue the request to
 * retryQu_, so that the OpenLoopSubmissionTd/CloseLoopSubmissionTd will
 * retry them.
 * NOTE(review): the retry check inside LogTd is currently commented out in
 * client.cc -- confirm whether this queue is still fed. */
ConcurrentQueue retryQu_;
/** The addresses of proxies. Since we can have multiple proxies, and each
 * proxy can have multiple shards, we use a two-dimensional vector to store
 * the addresses, i.e., proxyAddrs_[i][j] indicates the address of the jth
 * shard of the ith proxy */
std::vector> proxyAddrs_;
/** To test commutativity, we generate different zipfian workloads and write
 * ratios, i.e., we generate random numbers following the zipfian
 * distribution. These random numbers are stored in zipfianKeys_ and serve as
 * the keys that will be written/read by requests */
std::vector zipfianKeys_;
float writeRatio_;
/** Those requests which have been submitted but not yet committed (key is the
 * requestId) */
ConcurrentMap outstandingRequests_;
/** Record the send time of the requests, together with retryTimeoutUs_, to
 * decide whether the request needs to be retried */
ConcurrentMap outstandingRequestSendTime_;
/** Used by LogTd to monitor outstanding requests. If they cannot be committed
 * within retryTimeoutUs_ (measured in microseconds), they should be retried
 * **/
uint32_t retryTimeoutUs_;
/** To communicate between ProcessReplyTd and LogTd */
ConcurrentQueue logQu_;
/** Performance counters, to show how many requests are retried/committed */
uint32_t retryNumber_;
uint32_t committedNum_;
uint32_t fastCommitNum_;
uint32_t fastWriteNum_;
/** Stats */
std::vector hop3s;
std::vector hop4s;
std::vector totals;
/** Launch all the threads, only called once during the lifetime of the
 * client */
void LaunchThreads();
/** Functions whose names end with ``Td`` will be used to instantiate
 * threads.
 *
 * For the client, there are mainly three worker threads running:
 *
 * (1) OpenLoopSubmissionTd/CloseLoopSubmissionTd submits requests. A client
 * can be either an open-loop client or a closed-loop client, but cannot be
 * both.
 *
 * (2) ProcessReplyTd receives and processes the reply messages, and hands
 * the log information to LogTd
 *
 * (3) LogTd dumps logs and also monitors the outstanding requests. If the
 * requests have not been committed after a certain time (retryTimeoutUs_),
 * then LogTd will ask OpenLoopSubmissionTd/CloseLoopSubmissionTd to resubmit
 * this request to proxies
 * */
void ProcessReplyTd();
void OpenLoopSubmissionTd();
void CloseLoopSubmissionTd();
void LogTd();
/** The message handler to handle messages from proxies. The function is used
 * to instantiate a replyHandler_ and is registered to requestEP_ */
void ReceiveReply(MessageHeader* msgHdr, char* msgBuffer, Address* sender);
public:
/** Client accepts a config file, which contains all the necessary information
 * to instantiate the object; then the Run method can be called
 * */
Client(const std::string& configFile = "../configs/nezha-client-config.yaml");
void Run();
void Terminate();
~Client();
/** For debug */
uint64_t lastCommittedReqId_;
std::vector ls;
};
} // namespace nezha
================================================
FILE: client/client_config.h
================================================
#include
#include
#include
#include
#include
/** Plain holder for the client settings read from a YAML file by
 * parseConfig(). */
struct ClientConfig {
int clientId;
std::string clientIp;
int endpointType;
int requestPort;
// NOTE(review): proxyMaxOwd and proxyReplyPortBase are not assigned by
// parseConfig() below, so they stay uninitialized unless set elsewhere.
uint32_t proxyMaxOwd;
int proxyReplyPortBase;
bool isOpenLoop;
int poissonRate;
uint32_t durationSec;
int keyNum;
double skewFactor;
double writeRatio;
int requestRetryTimeUs;
int proxyRequestPortBase;
std::vector proxyIps;
int proxyShardNum;
// Parses yaml file configFilename and fills in fields of ClientConfig
// accordingly. Returns an error message or "" if there are no errors.
std::string parseConfig(std::string configFilename) {
YAML::Node config;
try {
config = YAML::LoadFile(configFilename);
} catch (const YAML::BadFile& e) {
// Only a file that cannot be opened is handled here; other exception types
// thrown by LoadFile are not caught by this handler.
return "Error loading config file:" + e.msg + ".";
}
LOG(INFO) << "Using config:\n " << config;
std::string key; // Keep track of current key for better error messages
try {
key = "client-id";
clientId = config[key].as();
key = "client-ip";
clientIp = config[key].as();
key = "endpoint-type";
endpointType = config[key].as();
key = "request-port";
requestPort = config[key].as();
key = "is-openloop";
isOpenLoop = config[key].as();
key = "poisson-rate";
poissonRate = config[key].as();
key = "duration-sec";
durationSec = config[key].as();
key = "key-num";
keyNum = config[key].as();
key = "skew-factor";
skewFactor = config[key].as();
key = "write-ratio";
writeRatio = config[key].as();
key = "request-retry-time-us";
requestRetryTimeUs = config[key].as();
key = "proxy-ips";
// Entries are appended: proxyIps is not cleared first, so calling
// parseConfig() twice on the same object accumulates addresses.
for (uint32_t i = 0; i < config[key].size(); i++) {
proxyIps.push_back(config[key][i].as());
}
key = "proxy-shards";
proxyShardNum = config[key].as();
key = "proxy-request-port-base";
proxyRequestPortBase = config[key].as();
return "";
} catch (const YAML::BadConversion& e) {
// `key` still names the field being converted when the exception was thrown;
// distinguish a present-but-malformed value from a missing key.
if (config[key]) {
return "Error parsing config field " + key + ": " + e.msg + ".";
} else {
return "Error parsing config field " + key + ": key not found.";
}
} catch (const std::exception& e) {
return "Error parsing config field " + key + ": " + e.what() + ".";
}
}
};
================================================
FILE: client/client_run.cc
================================================
#include "client/client.h"
// Command-line flag --config: path of the YAML file handed to nezha::Client.
// NOTE(review): the default path does not match the configs/ directory shown
// in the repository layout -- confirm it is still valid.
DEFINE_string(config, "nezhav2/config/nezha-client-config-0.yaml", "The config file for the client");
// Process-wide client instance; created in main() and also reached from the
// SIGINT handler below.
nezha::Client* client = NULL;
/**
 * SIGINT handler: asks the global client to stop.
 *
 * main() installs this handler before `client` is constructed, so a signal
 * arriving in that window would find `client` still NULL; in that case there
 * is nothing to stop and the signal is ignored instead of dereferencing NULL.
 *
 * NOTE(review): Client::Terminate() logs through glog, which is presumably not
 * async-signal-safe -- consider only setting a flag here.
 */
void Terminate(int /*signum*/) {
  if (client != nullptr) {
    client->Terminate();
  }
}
/** Entry point: parse flags, set up logging, run one client built from the
 * --config file, then destroy it. */
int main(int argc, char* argv[]) {
  // Flags are consumed first; logging is then sent to stderr.
  gflags::ParseCommandLineFlags(&argc, &argv, true);
  google::InitGoogleLogging(argv[0]);
  FLAGS_logtostderr = true;

  // Ctrl-C is routed to Terminate(int), which forwards to the client.
  signal(SIGINT, Terminate);

  client = new nezha::Client(FLAGS_config);
  client->Run();
  delete client;
  return 0;
}
================================================
FILE: configs/dist/nezha-client-config.yaml
================================================
---
print-config: true
proxy-info:
proxy-ips:
- "10.128.2.13"
proxy-shards: 1
request-port-base: 32000
client-info:
client-id: 1
client-ip: "10.128.2.14"
request-port: 32912
is-openloop: true
poisson-rate: 10 # it means the client sends x reqs/10ms on average
duration-sec: 60 # it means the duration of the client runs (second)
key-num: 1000000 # when key-num is 1, it means there is no commutativity optimization
skew-factor: 0.5 # 0-0.99 The higher the zipfian factor is, the more skewed the workload is
request-retry-time-us: 10000 # After the request is submitted, if we cannot get the response after such long time, then we will retry
================================================
FILE: configs/dist/nezha-proxy-config.yaml
================================================
---
print-config: true
# Replica Info
replica-info:
replica-ips:
- "10.128.2.10"
- "10.128.2.11"
- "10.128.2.12"
receiver-shards: 1 # The number of threads to receive requests
receiver-port: 33333 # The port is exposed to client/proxy and is used to receive client/proxy requests. When receiver-shards>1, the corresponding ports are receiver-ports + shard-index
initial-owd: 80 # The initial one-way delay (us) between replicas and proxies
# Proxy Info
proxy-info:
proxy-id: 1
proxy-ip: "10.128.2.13"
shard-num: 1
request-port-base: 32000
reply-port-base: 33000
================================================
FILE: configs/dist/nezha-replica-config-0.yaml
================================================
---
print-config: true
replica-ips:
- "10.128.2.10"
- "10.128.2.11"
- "10.128.2.12"
replica-id: 0
receiver-shards: 1 # The number of threads to receive requests
process-shards: 1 # The number of threads to process requests. For now process-shards is fixed to 1, because the early-buffer enque/deque is hard to parallelize. Maybe later we can find a high-performant **concurrent priority queue** for early-buffer, then process-shards may be parallelized for higher performance
reply-shards: 1 # The number of threads to send replies (both fast/slow replies)
index-sync-shards: 1 # The number of threads used by the leader to broadcast index synchronization messages to followers. For followers, they only need one such thread to receive and handle the index sync msgs
receiver-port: 33333 # The port is exposed to client/proxy and is used to receive client/proxy requests. When receiver-shards>1, the corresponding ports are receiver-ports + shard-index
index-sync-port: 35333 # The port is used for replicas to communicate index synchronization messages. Typically, the leader sends index sync msgs and the followers receive them
request-ask-port: 36333 # When one follower is missing requests, they ask from the leader or other followers. The leader/followers send the missed requests to this port of this follower
index-ask-port: 37333 # When one follower is missing some index sync msgs, it asks the leader (or other followers) for the missing ones, and the leader (or other followers) send to the index-sync-port of the problematic follower
master-port: 34333 # This port is mainly used to send/receive other messages
monitor-period-ms: 50 # Each blocking thread has a monitor timer, which periodically checks the replica status and unblock the thread (and exit) when replica status is not Normal
main-loop-period-ms: 20
heartbeat-threshold-ms: 500
index-ask-period-ms: 10 # When missing indices, we launch a timer to periodically ask indices
request-ask-period-ms: 10 # When missing logs, we launch a timer to periodically ask logs(requests)
view-change-period-ms: 10
state-transfer-period-ms: 10
state-transfer-timeout-ms: 100000 # If the state-transfer has not been completed after such long time, then rollback to viewchange
index-transfer-max-batch: 30
request-transfer-max-batch: 5
crash-vector-request-period-ms: 50 # make it longer, because it will clear all previous replies
recovery-request-period-ms: 10
sync-report-period-ms: 10
key-num: 1000000 # when commutativity is enabled for test, we need the specified key number to detect commutativity
owd-estimation-window: 1000 # we use a sliding window to get moving median of one-way delays, here is the length of the window
================================================
FILE: configs/dist/nezha-replica-config-1.yaml
================================================
---
print-config: true
replica-ips:
- "10.128.2.10"
- "10.128.2.11"
- "10.128.2.12"
replica-id: 1
receiver-shards: 1 # The number of threads to receive requests
process-shards: 1 # The number of threads to process requests. For now process-shards is fixed to 1, because the early-buffer enque/deque is hard to parallelize. Maybe later we can find a high-performant **concurrent priority queue** for early-buffer, then process-shards may be parallelized for higher performance
reply-shards: 1 # The number of threads to send replies (both fast/slow replies)
index-sync-shards: 1 # The number of threads used by the leader to broadcast index synchronization messages to followers. For followers, they only need one such thread to receive and handle the index sync msgs
receiver-port: 33333 # The port is exposed to client/proxy and is used to receive client/proxy requests. When receiver-shards>1, the corresponding ports are receiver-ports + shard-index
index-sync-port: 35333 # The port is used for replicas to communicate index synchronization messages. Typically, the leader sends index sync msgs and the followers receive them
request-ask-port: 36333 # When one follower is missing requests, they ask from the leader or other followers. The leader/followers send the missed requests to this port of this follower
index-ask-port: 37333 # When one follower is missing some index sync msgs, it asks the leader (or other followers) for the missing ones, and the leader (or other followers) send to the index-sync-port of the problematic follower
master-port: 34333 # This port is mainly used to send/receive other messages
monitor-period-ms: 50 # Each blocking thread has a monitor timer, which periodically checks the replica status and unblock the thread (and exit) when replica status is not Normal
main-loop-period-ms: 20
heartbeat-threshold-ms: 500
index-ask-period-ms: 10 # When missing indices, we launch a timer to periodically ask indices
request-ask-period-ms: 10 # When missing logs, we launch a timer to periodically ask logs(requests)
view-change-period-ms: 10
state-transfer-period-ms: 10
state-transfer-timeout-ms: 100000 # If the state-transfer has not been completed after such long time, then rollback to viewchange
index-transfer-max-batch: 30
request-transfer-max-batch: 5
crash-vector-request-period-ms: 50 # make it longer, because it will clear all previous replies
recovery-request-period-ms: 10
sync-report-period-ms: 10
key-num: 1000000 # when commutativity is enabled for test, we need the specified key number to detect commutativity
owd-estimation-window: 1000 # we use a sliding window to get moving median of one-way delays, here is the length of the window
================================================
FILE: configs/dist/nezha-replica-config-2.yaml
================================================
---
print-config: true
replica-ips:
- "10.128.2.10"
- "10.128.2.11"
- "10.128.2.12"
replica-id: 2
receiver-shards: 1 # The number of threads to receive requests
process-shards: 1 # The number of threads to process requests. For now process-shards is fixed to 1, because the early-buffer enque/deque is hard to parallelize. Maybe later we can find a high-performant **concurrent priority queue** for early-buffer, then process-shards may be parallelized for higher performance
reply-shards: 1 # The number of threads to send replies (both fast/slow replies)
index-sync-shards: 1 # The number of threads used by the leader to broadcast index synchronization messages to followers. For followers, they only need one such thread to receive and handle the index sync msgs
receiver-port: 33333 # The port is exposed to client/proxy and is used to receive client/proxy requests. When receiver-shards>1, the corresponding ports are receiver-ports + shard-index
index-sync-port: 35333 # The port is used for replicas to communicate index synchronization messages. Typically, the leader sends index sync msgs and the followers receive them
request-ask-port: 36333 # When one follower is missing requests, they ask from the leader or other followers. The leader/followers send the missed requests to this port of this follower
index-ask-port: 37333 # When one follower is missing some index sync msgs, it asks the leader (or other followers) for the missing ones, and the leader (or other followers) send to the index-sync-port of the problematic follower
master-port: 34333 # This port is mainly used to send/receive other messages
monitor-period-ms: 50 # Each blocking thread has a monitor timer, which periodically checks the replica status and unblock the thread (and exit) when replica status is not Normal
main-loop-period-ms: 20
heartbeat-threshold-ms: 500
index-ask-period-ms: 10 # When missing indices, we launch a timer to periodically ask indices
request-ask-period-ms: 10 # When missing logs, we launch a timer to periodically ask logs(requests)
view-change-period-ms: 10
state-transfer-period-ms: 10
state-transfer-timeout-ms: 100000 # If the state-transfer has not been completed after such long time, then rollback to viewchange
index-transfer-max-batch: 30
request-transfer-max-batch: 5
crash-vector-request-period-ms: 50 # make it longer, because it will clear all previous replies
recovery-request-period-ms: 10
sync-report-period-ms: 10
key-num: 1000000 # when commutativity is enabled for test, we need the specified key number to detect commutativity
owd-estimation-window: 1000 # we use a sliding window to get moving median of one-way delays, here is the length of the window
================================================
FILE: configs/dist/nezha-replica-config.yaml
================================================
---
print-config: true
replica-ips:
- "10.128.2.10"
- "10.128.2.11"
- "10.128.2.12"
replica-id: 0
receiver-shards: 1 # The number of threads to receive requests
process-shards: 1 # The number of threads to process requests. For now process-shards is fixed to 1, because the early-buffer enque/deque is hard to parallelize. Maybe later we can find a high-performant **concurrent priority queue** for early-buffer, then process-shards may be parallelized for higher performance
reply-shards: 1 # The number of threads to send replies (both fast/slow replies)
index-sync-shards: 1 # The number of threads used by the leader to broadcast index synchronization messages to followers. For followers, they only need one such thread to receive and handle the index sync msgs
receiver-port: 33333 # The port is exposed to client/proxy and is used to receive client/proxy requests. When receiver-shards>1, the corresponding ports are receiver-ports + shard-index
index-sync-port: 35333 # The port is used for replicas to communicate index synchronization messages. Typically, the leader sends index sync msgs and the followers receive them
request-ask-port: 36333 # When one follower is missing requests, they ask from the leader or other followers. The leader/followers send the missed requests to this port of this follower
index-ask-port: 37333 # When one follower is missing some index sync msgs, it asks the leader (or other followers) for the missing ones, and the leader (or other followers) send to the index-sync-port of the problematic follower
master-port: 34333 # This port is mainly used to send/receive other messages
monitor-period-ms: 50 # Each blocking thread has a monitor timer, which periodically checks the replica status and unblock the thread (and exit) when replica status is not Normal
main-loop-period-ms: 20
heartbeat-threshold-ms: 500
index-ask-period-ms: 10 # When missing indices, we launch a timer to periodically ask indices
request-ask-period-ms: 10 # When missing logs, we launch a timer to periodically ask logs(requests)
view-change-period-ms: 10
state-transfer-period-ms: 10
state-transfer-timeout-ms: 100000 # If the state-transfer has not been completed after such long time, then rollback to viewchange
index-transfer-max-batch: 30
request-transfer-max-batch: 5
crash-vector-request-period-ms: 50 # make it longer, because it will clear all previous replies
recovery-request-period-ms: 10
sync-report-period-ms: 10
key-num: 1000000 # when commutativity is enabled for test, we need the specified key number to detect commutativity
owd-estimation-window: 1000 # we use a sliding window to get moving median of one-way delays, here is the length of the window
================================================
FILE: configs/local/nezha-client-config.yaml
================================================
---
client-id: 1
client-ip: "127.0.0.5"
endpoint-type: 1 # 1 for UDP Endpoint, 2 for GRPC Endpoint [to be supported soon]
request-port: 32912
is-openloop: true
poisson-rate: 1 # it means the client sends x reqs/10ms on average (should be larger than 10, otherwise, the submission rate is not accurate)
duration-sec: 60 # it means the duration of the client runs (second)
key-num: 1000000 # when key-num is 1, it means there is no commutativity optimization
skew-factor: 0.5 # 0-0.99 The higher the zipfian factor is, the more skewed the workload is
write-ratio: 0.5 # 0-1, the ratio of write requests
request-retry-time-us: 100000 # After the request is submitted, if we cannot get the response after such long time, then we will retry
# proxy info
proxy-ips:
- "127.0.0.4"
proxy-shards: 1
proxy-request-port-base: 32000
================================================
FILE: configs/local/nezha-proxy-config.yaml
================================================
---
# Proxy Info
proxy-endpoint-type: 1 # 1 for UDP Endpoint, 2 for GRPC Endpoint [to be supported soon]
proxy-id: 1
proxy-ip: "127.0.0.4"
proxy-shard-num: 1
proxy-max-owd: 200
proxy-request-port-base: 32000
proxy-reply-port-base: 33000
# Replica Info
replica-ips:
- "127.0.0.1"
- "127.0.0.2"
- "127.0.0.3"
replica-receiver-shards: 1 # The number of threads to receive requests
replica-receiver-port: 33333 # The port is exposed to client/proxy and is used to receive client/proxy requests. When receiver-shards>1, the corresponding ports are receiver-ports + shard-index
replica-initial-owd: 80 # The initial one-way delay (us) between replicas and proxies
================================================
FILE: configs/local/nezha-replica-config-0.yaml
================================================
---
print-config: true
replica-ips:
- "127.0.0.1"
- "127.0.0.2"
- "127.0.0.3"
endpoint-type: 1 # 1 for UDP Endpoint, 2 for GRPC Endpoint [to be supported soon]
replica-id: 0
receiver-shards: 1 # The number of threads to receive requests
record-shards: 1 # The number of threads to record requests in the global concurrent map
track-shards: 1 # The number of threads to record synced log entries
reply-shards: 2 # The number of threads to send replies (both fast/slow replies)
index-sync-shards: 1 # The number of threads used by the leader to broadcast index synchronization messages to followers. For followers, they only need one such thread to receive and handle the index sync msgs
receiver-port: 33333 # The port is exposed to client/proxy and is used to receive client/proxy requests. When receiver-shards>1, the corresponding ports are receiver-ports + shard-index
index-sync-port: 35333 # The port is used for replicas to communicate index synchronization messages. Typically, the leader sends index sync msgs and the followers receive them
index-sync-period-us: 50 # For every period, the leader replica will send a index sync message to followers, serves for slow path and heartbeat
request-ask-port: 36333 # When one follower is missing requests, they ask from the leader or other followers. The leader/followers send the missed requests to this port of this follower
index-ask-port: 37333 # When one follower is missing some index sync msgs, it asks the leader (or other followers) for the missing ones, and the leader (or other followers) send to the index-sync-port of the problematic follower
master-port: 34333 # This port is mainly used to send/receive other messages
monitor-period-ms: 50 # Each blocking thread has a monitor timer, which periodically checks the replica status and unblock the thread (and exit) when replica status is not Normal
main-loop-period-ms: 20
heartbeat-threshold-ms: 500
index-ask-period-ms: 10 # When missing indices, we launch a timer to periodically ask indices
request-ask-period-ms: 10 # When missing logs, we launch a timer to periodically ask logs(requests)
view-change-period-ms: 10
state-transfer-period-ms: 10
state-transfer-timeout-ms: 100000 # If the state-transfer has not been completed after such long time, then rollback to viewchange
index-transfer-batch: 30
request-key-transfer-batch: 60
request-transfer-batch: 5
crash-vector-request-period-ms: 50 # make it longer, because it will clear all previous replies
recovery-request-period-ms: 10
sync-report-period-ms: 10
key-num: 1000000 # When commutativity is enabled for test, we need the specified key number to detect commutativity
moving-percentile: 0.50 # the percentile used to estimate owd
owd-estimation-window: 1000 # We use a sliding window to get moving median of one-way delays, here is the length of the window
reclaim-timeout-ms: 5000 # To save memory, requests that enter the late buffer will not stay there forever: once they have stayed for this long, they will be reclaimed. Similarly, unsynced log entries will be reclaimed if (1) they have been kept for more than reclaim-timeout-ms and (2) they are not used by the worker threads
================================================
FILE: configs/local/nezha-replica-config-1.yaml
================================================
---
print-config: true
replica-ips:
- "127.0.0.1"
- "127.0.0.2"
- "127.0.0.3"
endpoint-type: 1 # 1 for UDP Endpoint, 2 for GRPC Endpoint [to be supported soon]
replica-id: 1
receiver-shards: 1 # The number of threads to receive requests
record-shards: 1 # The number of threads to record requests in the global concurrent map
track-shards: 1 # The number of threads to record synced log entries
reply-shards: 2 # The number of threads to send replies (both fast/slow replies)
index-sync-shards: 1 # The number of threads used by the leader to broadcast index synchronization messages to followers. For followers, they only need one such thread to receive and handle the index sync msgs
receiver-port: 33333 # The port is exposed to client/proxy and is used to receive client/proxy requests. When receiver-shards>1, the corresponding ports are receiver-ports + shard-index
index-sync-port: 35333 # The port is used for replicas to communicate index synchronization messages. Typically, the leader sends index sync msgs and the followers receive them
index-sync-period-us: 50 # For every period, the leader replica will send a index sync message to followers, serves for slow path and heartbeat
request-ask-port: 36333 # When one follower is missing requests, they ask from the leader or other followers. The leader/followers send the missed requests to this port of this follower
index-ask-port: 37333 # When one follower is missing some index sync msgs, it asks the leader (or other followers) for the missing ones, and the leader (or other followers) send to the index-sync-port of the problematic follower
master-port: 34333 # This port is mainly used to send/receive other messages
monitor-period-ms: 50 # Each blocking thread has a monitor timer, which periodically checks the replica status and unblock the thread (and exit) when replica status is not Normal
main-loop-period-ms: 20
heartbeat-threshold-ms: 500
index-ask-period-ms: 10 # When missing indices, we launch a timer to periodically ask indices
request-ask-period-ms: 10 # When missing logs, we launch a timer to periodically ask logs(requests)
view-change-period-ms: 10
state-transfer-period-ms: 10
state-transfer-timeout-ms: 100000 # If the state-transfer has not been completed after such long time, then rollback to viewchange
index-transfer-batch: 30
request-key-transfer-batch: 60
request-transfer-batch: 5
crash-vector-request-period-ms: 50 # make it longer, because it will clear all previous replies
recovery-request-period-ms: 10
sync-report-period-ms: 10
key-num: 1000000 # When commutativity is enabled for test, we need the specified key number to detect commutativity
moving-percentile: 0.50 # the percentile used to estimate owd
owd-estimation-window: 1000 # We use a sliding window to get moving median of one-way delays, here is the length of the window
reclaim-timeout-ms: 5000 # To save memory, requests that enter the late buffer will not stay there forever: once they have stayed for this long, they will be reclaimed. Similarly, unsynced log entries will be reclaimed if (1) they have been kept for more than reclaim-timeout-ms and (2) they are not used by the worker threads
================================================
FILE: configs/local/nezha-replica-config-2.yaml
================================================
---
print-config: true
replica-ips:
- "127.0.0.1"
- "127.0.0.2"
- "127.0.0.3"
endpoint-type: 1 # 1 for UDP Endpoint, 2 for GRPC Endpoint [to be supported soon]
replica-id: 2
receiver-shards: 1 # The number of threads to receive requests
record-shards: 1 # The number of threads to record requests in the global concurrent map
track-shards: 1 # The number of threads to record synced log entries
reply-shards: 2 # The number of threads to send replies (both fast/slow replies)
index-sync-shards: 1 # The number of threads used by the leader to broadcast index synchronization messages to followers. For followers, they only need one such thread to receive and handle the index sync msgs
receiver-port: 33333 # The port is exposed to client/proxy and is used to receive client/proxy requests. When receiver-shards>1, the corresponding ports are receiver-ports + shard-index
index-sync-port: 35333 # The port is used for replicas to communicate index synchronization messages. Typically, the leader sends index sync msgs and the followers receive them
index-sync-period-us: 50 # For every period, the leader replica will send an index sync message to followers, which serves both the slow path and the heartbeat
request-ask-port: 36333 # When one follower is missing requests, they ask from the leader or other followers. The leader/followers send the missed requests to this port of this follower
index-ask-port: 37333 # When one follower is missing some index sync msgs, it asks the leader (or other followers) for the missing ones, and the leader (or other followers) send to the index-sync-port of the problematic follower
master-port: 34333 # This port is mainly used to send/receive other messages
monitor-period-ms: 50 # Each blocking thread has a monitor timer, which periodically checks the replica status and unblock the thread (and exit) when replica status is not Normal
main-loop-period-ms: 20
heartbeat-threshold-ms: 500
index-ask-period-ms: 10 # When missing indices, we launch a timer to periodically ask indices
request-ask-period-ms: 10 # When missing logs, we launch a timer to periodically ask logs(requests)
view-change-period-ms: 10
state-transfer-period-ms: 10
state-transfer-timeout-ms: 100000 # If the state-transfer has not been completed after such long time, then rollback to viewchange
index-transfer-batch: 30
request-key-transfer-batch: 60
request-transfer-batch: 5
crash-vector-request-period-ms: 50 # make it longer, because it will clear all previous replies
recovery-request-period-ms: 10
sync-report-period-ms: 10
key-num: 1000000 # When commutativity is enabled for test, we need the specified key number to detect commutativity
moving-percentile: 0.50 # the percentile used to estimate owd
owd-estimation-window: 1000 # We use a sliding window to get moving median of one-way delays, here is the length of the window
reclaim-timeout-ms: 5000 # To save memory, requests that enter the late buffer will not stay there forever: once they have stayed for this long, they will be reclaimed. Similarly, unsynced log entries will be reclaimed if (1) they have been kept for more than reclaim-timeout-ms and (2) they are not used by the worker threads
================================================
FILE: configs/nezha-client-config-template.yaml
================================================
---
print-config: true
proxy-info:
proxy-ips:
- "127.0.0.4"
proxy-shards: 12
request-port-base: 32000
client-info:
endpoint-type: 1 # 1 for UDP Endpoint, 2 for GRPC Endpoint [to be supported soon]
client-id: 1
client-ip: "127.0.0.5"
request-port: 32912
is-openloop: true
poisson-rate: 60 # it means the client sends x reqs/10ms on average (should be larger than 10, otherwise, the submission rate is not accurate)
duration-sec: 60 # it means the duration of the client runs (second)
key-num: 1000000 # when key-num is 1, it means there is no commutativity optimization
skew-factor: 0.5 # 0-0.99 The higher the zipfian factor is, the more skewed the workload is
write-ratio: 0.5 # 0-1, the ratio of write requests
request-retry-time-us: 100000 # After the request is submitted, if we cannot get the response after such long time, then we will retry
================================================
FILE: configs/nezha-proxy-config-template.yaml
================================================
---
print-config: true
# Replica Info
replica-info:
replica-ips:
- "127.0.0.1"
- "127.0.0.2"
- "127.0.0.3"
receiver-shards: 2 # The number of threads to receive requests
receiver-port: 33333 # The port is exposed to client/proxy and is used to receive client/proxy requests. When receiver-shards>1, the corresponding ports are receiver-ports + shard-index
initial-owd: 80 # The initial one-way delay (us) between replicas and proxies
# Proxy Info
proxy-info:
endpoint-type: 1 # 1 for UDP Endpoint, 2 for GRPC Endpoint [to be supported soon]
proxy-id: 1
proxy-ip: "127.0.0.4"
shard-num: 12
max-owd: 200
request-port-base: 32000
reply-port-base: 33000
================================================
FILE: configs/nezha-replica-config-template.yaml
================================================
---
print-config: true
replica-ips:
- "127.0.0.1"
- "127.0.0.2"
- "127.0.0.3"
endpoint-type: 1 # 1 for UDP Endpoint, 2 for GRPC Endpoint [to be supported soon]
replica-id: 0
receiver-shards: 2 # The number of threads to receive requests
record-shards: 1 # The number of threads to record requests in the global concurrent map
track-shards: 1 # The number of threads to record synced log entries
process-shards: 1 # The number of threads to process requests. For now process-shards is fixed to 1, because the early-buffer enque/deque is hard to parallelize. Maybe later we can find a high-performant **concurrent priority queue** for early-buffer, then process-shards may be parallelized for higher performance
reply-shards: 3 # The number of threads to send replies (both fast/slow replies)
index-sync-shards: 1 # The number of threads used by the leader to broadcast index synchronization messages to followers. For followers, they only need one such thread to receive and handle the index sync msgs
receiver-port: 33333 # The port is exposed to client/proxy and is used to receive client/proxy requests. When receiver-shards>1, the corresponding ports are receiver-ports + shard-index
index-sync-port: 35333 # The port is used for replicas to communicate index synchronization messages. Typically, the leader sends index sync msgs and the followers receive them
index-sync-period-us: 50 # For every period, the leader replica will send an index sync message to followers, which serves both the slow path and the heartbeat
request-ask-port: 36333 # When one follower is missing requests, they ask from the leader or other followers. The leader/followers send the missed requests to this port of this follower
index-ask-port: 37333 # When one follower is missing some index sync msgs, it asks the leader (or other followers) for the missing ones, and the leader (or other followers) send to the index-sync-port of the problematic follower
master-port: 34333 # This port is mainly used to send/receive other messages
monitor-period-ms: 50 # Each blocking thread has a monitor timer, which periodically checks the replica status and unblock the thread (and exit) when replica status is not Normal
main-loop-period-ms: 20
heartbeat-threshold-ms: 500
index-ask-period-ms: 10 # When missing indices, we launch a timer to periodically ask indices
request-ask-period-ms: 10 # When missing logs, we launch a timer to periodically ask logs(requests)
view-change-period-ms: 10
state-transfer-period-ms: 10
state-transfer-timeout-ms: 100000 # If the state-transfer has not been completed after such long time, then rollback to viewchange
index-transfer-batch: 30
request-key-transfer-batch: 60
request-transfer-batch: 5
crash-vector-request-period-ms: 50 # make it longer, because it will clear all previous replies
recovery-request-period-ms: 10
sync-report-period-ms: 10
key-num: 1000000 # When commutativity is enabled for test, we need the specified key number to detect commutativity
moving-percentile: 0.90 # the percentile used to estimate owd (key spelled with a hyphen, like every other key in the replica configs)
owd-estimation-window: 1000 # We use a sliding window to get moving median of one-way delays, here is the length of the window
reclaim-timeout-ms: 5000 # To save memory, requests that enter the late buffer will not stay there forever: once they have stayed for this long, they will be reclaimed. Similarly, unsynced log entries will be reclaimed if (1) they have been kept for more than reclaim-timeout-ms and (2) they are not used by the worker threads
================================================
FILE: docs/Nezha.tla
================================================
`^\textbf{\large N TLA+ Specification}\\^'
------------------------------ MODULE Nezha ----------------------------------
EXTENDS Naturals, TLC, FiniteSets, Sequences
--------------------------------------------------------------------------------
(* `^\textbf{\large Bounds for Model Check [Configurable]}^' *)
\* Time Range [Configurable]
\* Client and replica clocks are drawn from 1..MaxTime, and a client may only
\* submit a request while its clock is below MaxTime
MaxTime == 3
\* Each client is only allowed to submit MaxReqNum requests [Configurable]
\* In the specification, we will only consider two roles, client and replicas
\* (i.e. it can be considered as co-locating one proxy with one client)
\* For the proxy-based design, we just need to replace client with proxy,
\* and then the specification describes the interaction between proxy and replicas
MaxReqNum == 1
\* The leader is only allowed to crash when the view < MaxViews [Configurable]
MaxViews == 3
\* These variables are used to implement at-most-once primitives
\* i.e. The variables record the messages processed by Replicas/Clients, so
\* that the Replicas/Clients will not process the same message twice
VARIABLE vReplicaProcessed, \* Messages that have been processed by replicas
vClientProcessed \* Messages that have been processed by clients
\* Debugging aid; Init sets it to the pair <<"Init", "">>
VARIABLE DebugAction
(* `^\textbf{\large Constants}^' *)
\* Replicas      : the (finite) set of replicas
\* ReplicaOrder  : a sequence over Replicas, indexed by Leader() to pick the leader of a view
\* Clients       : the set of clients
\* LatencyBounds : indexed by client; used as the latency bound `l' of its requests
CONSTANTS Replicas, ReplicaOrder, Clients, LatencyBounds
ASSUME IsFiniteSet(Replicas)
ASSUME ReplicaOrder \in Seq(Replicas)
\* F is derived from the replica count: (|Replicas| - 1) \div 2
F == (Cardinality(Replicas) - 1) \div 2
\* F/2 rounded down and rounded up, respectively
floorHalfF == F \div 2
ceilHalfF == (F + 1) \div 2
\* Sizes of the different quorums
QuorumSize == F + 1
FastQuorumSize == F + ceilHalfF + 1
RecoveryQuorumSize == ceilHalfF + 1
\* All replica subsets that are large enough to form a fast quorum
FastQuorums == { Q \in SUBSET Replicas : Cardinality(Q) >= FastQuorumSize }
\* All replica subsets that contain a strict majority of the replicas
Quorums == { Q \in SUBSET Replicas : 2 * Cardinality(Q) > Cardinality(Replicas) }
\* Replica Statuses
\* (every replica starts in StNormal; a failed replica is put into StRecovering)
StNormal == 1
StViewChange == 2
StRecovering == 3
\* Message Types
\* Each message record carries one of these numeric tags in its mtype field
MClientRequest == 1 \* Sent by client to replicas
MFastReply == 2 \* Fast Reply Message
MSlowReply == 3 \* Slow Reply Message
MLogIndex == 4 \* LogIndex
MLogEntry == 5 \* Log entry, different from index, it includes command field, which can be large in practice
MIndexSync == 6 \* Sync message during the index sync process
MMissEntryRequest == 7 \* Sent by followers once they fail to find the entry on itself
MMissEntryReply == 8 \* Response to MMissEntryRequest, providing the missing entries
MViewChangeReq == 9 \* Sent when leader/sequencer failure detected
MViewChange == 10 \* Sent to ACK view change
MStartView == 11 \* Sent by new leader to start view
\* The following messages are mainly used for periodic sync
\* Just as described in NOPaxos, it is an optional optimization to enable fast recovery after failure
MSyncPrepare == 12 \* Sent by the leader to ensure log durability
MSyncRep == 13 \* Sent by followers as ACK
MSyncCommit == 14 \* Sent by leaders to indicate stable log
\* The following messages are mainly used for replica recovery
MCrashVectorReq == 15 \* Broadcast by a recovering replica, carrying a fresh nonce
MCrashVectorRep == 16 \* Answer to MCrashVectorReq, carrying the responder's crash vector
MRecoveryReq == 17 \* Broadcast by the recovering replica once it has collected F + 1 crash-vector replies
MRecoveryRep == 18 \* Answer to MRecoveryReq, carrying the responder's view ID and crash vector
MStateTransferReq == 19
MStateTransferRep == 20
(*
`^\textbf{Message Schemas}^'
ViewIDs == [ leaderNum |-> n \in (1..) ]
\* <<deadline, clientID, requestID>> uniquely identifies one request on one replica
\* But across replicas, the same request may have different deadlines
\* (the leader may modify the deadline to make the request eligible to enter the early-buffer)
\* so <<clientID, requestID>> uniquely identifies one request across replicas
ClientRequest
[ mtype |-> MClientRequest,
sender |-> c \in Clients,
dest |-> r \in Replicas,
requestID |-> i \in (1..),
command |-> "",
s |-> t \in (1..MaxTime),
l |-> l \in (1..MaxBound)
]
\* logSlotNum is not necessary and it is not described in the paper
\* Here we include logSlotNum in FastReply and SlowReply messages
\* to facilitate the check of Linearizability invariant
FastReply
[ mtype |-> MFastReply,
sender |-> r \in Replicas,
dest |-> c \in Clients,
viewID |-> v \in ViewIDs,
requestID |-> i \in (1..vClientReqNum),
hash |-> [
log |-> vLogs[1..n],
cv |-> crashVector
],
deadline |-> i \in (1..MaxTime+MaxBound),
logSlotNum |-> n \in (1..)
]
SlowReply
[ mtype |-> MSlowReply,
sender |-> r \in Replicas,
dest |-> c \in Clients,
viewID |-> v \in ViewIDs,
requestID |-> i \in (1..vClientReqNum),
logSlotNum |-> n \in (1..)
]
LogIndex
[ mtype |-> MLogIndex,
clientID |-> c \in Clients,
requestID |-> i \in (1..vClientReqNum),
deadline |-> i \in (1..MaxTime+MaxBound)
]
LogEntry
[ mtype |-> MLogEntry,
clientID |-> c \in Clients,
requestID |-> i \in (1..vClientReqNum),
deadline |-> i \in (1..MaxTime+MaxBound),
command |-> ""
]
IndexSync
[ mtype |-> MIndexSync,
sender |-> r \in Replicas,
dest |-> d \in Replicas,
viewID |-> v \in ViewIDs,
logindcies |-> {log indices of the leader's log}
]
MMissEntryRequest
[ mtype |-> MMissEntryRequest,
sender |-> r \in Replicas,
dest |-> d \in Replicas,
viewID |-> v \in ViewIDs,
miss |-> {log indices}
]
MMissEntryReply
[ mtype |-> MMissEntryReply,
sender |-> r \in Replicas,
dest |-> d \in Replicas,
viewID |-> v \in ViewIDs,
entries |-> {log entries}
]
ViewChangeReq
[ mtype |-> MViewChangeReq,
sender |-> r \in Replicas,
dest |-> r \in Replicas,
viewID |-> v \in ViewIDs,
cv |-> crash vector
]
ViewChange
[ mtype |-> MViewChange,
sender |-> r \in Replicas,
dest |-> r \in Replicas,
viewID |-> v \in ViewIDs,
lastNormal |-> v \in ViewIDs,
log |-> l \in vLogs[1..n],
cv |-> crash vector
]
StartView
[ mtype |-> MStartView,
dest |-> r \in Replicas,
viewID |-> v \in ViewIDs,
log |-> l \in vLogs[1..n],
cv |-> crash vector
]
SyncPrepare
[ mtype |-> MSyncPrepare,
dest |-> r \in Replicas,
sender |-> r \in Replicas,
viewID |-> v \in ViewIDs,
log |-> l \in vLogs[1..n] ]
SyncRep
[ mtype |-> MSyncRep,
dest |-> r \in Replicas,
sender |-> r \in Replicas,
viewID |-> v \in ViewIDs,
logSlotNumber |-> n \in (1..) ]
SyncCommit
[ mtype |-> MSyncCommit,
dest |-> r \in Replicas,
sender |-> r \in Replicas,
viewID |-> v \in ViewIDs,
log |-> l \in vLogs[1..n] ]
CrashVectorReq
[ mtype |-> MCrashVectorReq,
sender |-> r \in Replicas,
dest |-> r \in Replicas,
nonce |-> nonce
]
CrashVectorRep
[ mtype |-> MCrashVectorRep,
sender |-> r \in Replicas,
dest |-> r \in Replicas,
nonce |-> nonce,
cv |-> vector of counters
]
RecoveryReq
[ mtype |-> MRecoveryReq,
sender |-> r \in Replicas,
dest |-> r \in Replicas,
cv |-> vector of counters
]
RecoveryRep
[ mtype |-> MRecoveryRep,
sender |-> r \in Replicas,
dest |-> r \in Replicas,
viewID |-> v \in ViewIDs,
cv |-> vector of counters
]
StateTransferReq
[ mtype |-> MStateTransferReq,
sender |-> r \in Replicas,
dest |-> r \in Replicas,
cv |-> vector of counters
]
StateTransferRep
[ mtype |-> MStateTransferRep,
sender |-> r \in Replicas,
dest |-> r \in Replicas,
viewID |-> v \in ViewIDs,
log |-> l \in vLogs[1..n],
cv |-> vector of counters
]
*)
--------------------------------------------------------------------------------
(* `^\textbf{\large Variables}^' *)
\* `^\textbf{Network State}^'
\* The network is modelled as a set that only grows: Send() adds messages to
\* it and the initial state has no messages
VARIABLE messages \* Set of all messages sent
networkVars == << messages >>
InitNetworkState == messages = {}
\* Used as a dummy value
\* (LastLog returns it for an empty log; all three fields are 0)
NULLLog == [ deadline |-> 0,
clientID |-> 0,
requestID |-> 0
]
\* `^\textbf{Replica State}^'
\* Every variable below is a function indexed by replica (see InitReplicaState)
VARIABLES vLog, \* Log of values
vEarlyBuffer, \* The early buffer to hold request,
\* and release it after clock passes its deadline (s+l)
vReplicaStatus, \* One of StNormal, StViewChange, StRecovering
vViewID, \* Current viewID replicas recognize
vReplicaClock, \* Current Time of the replica
vLastNormView, \* Last views in which replicas had status StNormal
vViewChanges, \* Used for logging view change votes
vSyncPoint, \* Latest synchronization point,
\* to which the replica state (vLog) is consistent with the leader.
vLateBuffer, \* The late buffer Used to store the requests
\* which are not eligible to enter vEarlyBuffer
vTentativeSync, \* Used by leader to mark current syncPrepare point (during periodic sync process)
\* (Actually, vSyncPoint and vTentativeSync can be merged into one Var
\* However, we decouple them to make the spec easy to understand)
vSyncReps, \* Used for logging sync reps at leader
vCommitPoint, \* Different from vSyncPoint,
\* vCommitPoint indicates that the logs before this point has been replicated to majority
\* So followers can safely execute requests (log entries) up to vCommitPoint
\* Refer to ``Acceleration of Recovery" para in Sec 6
vUUIDCounter, \* Locally unique string (for CrashVectorReq)
vCrashVector, \* CrashVector, initialized as all-zero vector
vCrashVectorReps,\* CrashVectorRep Set
vRecoveryReps \* RecoveryRep Set
\* Tuple of all replica-side variables, used in UNCHANGED clauses
replicaVars == << vLog, vEarlyBuffer,
vViewID, vReplicaClock,
vLastNormView, vViewChanges,vReplicaStatus,
vSyncPoint, vLateBuffer,
vTentativeSync, vSyncReps, vCommitPoint,
vUUIDCounter, vCrashVector,
vCrashVectorReps, vRecoveryReps>>
\* Initial replica state: every per-replica variable holds the same value on
\* all replicas (empty log, buffers and reply sets; view 1; clock 1; status
\* StNormal; sync/commit points, counters and crash vectors all zero).
InitReplicaState ==
    LET
        \* The function that maps every replica to the same value v
        OnAllReplicas(v) == [ rep \in Replicas |-> v ]
        \* A crash vector with one zero counter per replica
        zeroVector == [ rep \in Replicas |-> 0 ]
    IN
        /\ vReplicaStatus   = OnAllReplicas(StNormal)
        /\ vViewID          = OnAllReplicas(1) \* 0 should also be okay
        /\ vLastNormView    = OnAllReplicas(1)
        /\ vReplicaClock    = OnAllReplicas(1)
        /\ vLog             = OnAllReplicas(<< >>)
        /\ vEarlyBuffer     = OnAllReplicas({})
        /\ vLateBuffer      = OnAllReplicas({})
        /\ vViewChanges     = OnAllReplicas({})
        /\ vSyncReps        = OnAllReplicas({})
        /\ vCrashVectorReps = OnAllReplicas({})
        /\ vRecoveryReps    = OnAllReplicas({})
        /\ vSyncPoint       = OnAllReplicas(0)
        /\ vTentativeSync   = OnAllReplicas(0)
        /\ vCommitPoint     = OnAllReplicas(0)
        /\ vUUIDCounter     = OnAllReplicas(0)
        /\ vCrashVector     = OnAllReplicas(zeroVector)
\* `^\textbf{Client State}^'
\* Both variables are functions indexed by client
VARIABLES
    vClientClock,  \* Current Clock Time of the client
    vClientReqNum  \* The number of requests that have been sent by this client
clientVars == << vClientClock, vClientReqNum >>
\* Clients start with clock 1 and no request sent
InitClientState ==
    /\ vClientReqNum = [ c \in Clients |-> 0 ]
    /\ vClientClock  = [ c \in Clients |-> 1 ]
\* `^\textbf{Set of all vars}^'
vars == << networkVars, replicaVars, clientVars >>
\*\* `^\textbf{Initial state}^'
\* Empty network, initial replica/client state, nothing processed yet
Init ==
    /\ InitNetworkState
    /\ InitReplicaState
    /\ InitClientState
    /\ vClientProcessed  = [ c \in Clients |-> {} ]
    /\ vReplicaProcessed = [ r \in Replicas |-> {} ]
    /\ DebugAction = << "Init", "" >>
--------------------------------------------------------------------------------
(* `^\textbf{\large Helpers}^' *)
\* Number of replicas whose current status equals `status'
NumofReplicas(status) ==
    Cardinality({ rep \in Replicas : vReplicaStatus[rep] = status })
\* TRUE iff ReplySet already holds a message from the sender of m
DuplicateRep(ReplySet, m) ==
    \E prev \in ReplySet : prev.sender = m.sender
\* An arbitrary (but fixed) element of S
Pick(S) == CHOOSE elem \in S : TRUE
\* Convert a Set to Sequence
\* Repeatedly removes one CHOOSE-selected element x from S and prepends it as
\* the one-element sequence << x >>; the empty set yields the empty sequence.
\* The resulting order is whatever CHOOSE picks, so callers that need a
\* specific order must sort the result (see GetSortLogSeq).
RECURSIVE Set2Seq(_)
Set2Seq(S) ==
    IF S = {} THEN
        << >>
    ELSE
        LET
            x == CHOOSE y \in S : TRUE
        IN
            << x >> \o Set2Seq(S \ {x})
\* The set of all elements occurring in the sequence seq
Seq2Set(seq) == { seq[idx] : idx \in 1..Len(seq) }
\* Largest / smallest element of a set S of comparable values
Max(S) == CHOOSE top \in S : \A other \in S : other <= top
Min(S) == CHOOSE bottom \in S : \A other \in S : other >= bottom
\* `^\textbf{View ID Helpers}^'
\* Index (into ReplicaOrder) of the leader of view viewID:
\* viewID itself while viewID < Len(ReplicaOrder), and
\* (viewID % Len(ReplicaOrder)) + 1 from then on.
LeaderID(viewID) ==
    LET
        n == Len(ReplicaOrder)
        shift == IF viewID >= n THEN 1 ELSE 0
    IN
        (viewID % n) + shift
\* The replica leading view viewID (remember <<>> are 1-indexed)
Leader(viewID) == ReplicaOrder[LeaderID(viewID)]
\* `^\textbf{Log Manipulation Helpers}^'
\* The order of 2 log entries is decided lexicographically by the tuple
\* <<deadline, clientID, requestID>>:
\*   - usually, the deadline alone makes the two entries comparable;
\*   - when 2 different entries have the same deadline, the tie is broken with clientID;
\*   - further, the tie is broken with requestID
\*     (unnecessary if we only allow client to submit one request at one tick).
\* The comparison must be lexicographic (not a conjunction of three <=):
\* otherwise entries such as (deadline 2, client 2) and (deadline 3, client 1)
\* are incomparable, which leaves them unordered for sorting and for the
\* early-/late-buffer decision.
EntryLeq(l1, l2) ==
    \/ l1.deadline < l2.deadline
    \/ /\ l1.deadline = l2.deadline
       /\ \/ l1.clientID < l2.clientID
          \/ /\ l1.clientID = l2.clientID
             /\ l1.requestID <= l2.requestID
\* Two entries are equal iff all three ordering fields agree
EntryEq(l1, l2) ==
    /\ l1.deadline = l2.deadline
    /\ l1.clientID = l2.clientID
    /\ l1.requestID = l2.requestID
\* Strict version of EntryLeq
EntryLessThan(l1, l2) ==
    /\ EntryLeq(l1, l2)
    /\ ~EntryEq(l1, l2)
\* Find entry in one replica's log (<<clientID, requestID>> can uniquely identify the log entry)
\* We do not check deadline, because the leader may have modified the request's deadline
\* Return 0 when we fail to find it (remember Sequence is 1-indexed in TLA+, so 0 can serve as a dummy value)
\* Otherwise return the index of a matching entry.
\* Note: log entries store the request number in the field `requestID'
\* (see Req2Log), so that is the field compared against reqID.
FindEntry(clientID, reqID, log) ==
    LET
        entryIndexSet == { i \in 1..Len(log) : /\ log[i].clientID = clientID
                                               /\ log[i].requestID = reqID }
    IN
        IF entryIndexSet = {} THEN
            0
        ELSE
            Pick(entryIndexSet)
\* Sort a sequence of log entries with EntryLessThan as the comparison
SortLogSeq(seq) == SortSeq(seq, LAMBDA a, b : EntryLessThan(a, b))
\* Given a set of logs, return the sorted log list
\* (the set is first turned into a sequence, which is then sorted)
GetSortLogSeq(S) == SortLogSeq(Set2Seq(S))
(* Merge logs: take the union of all the given log sets (which deduplicates
   the entries), keep only the entries that are ordered after lastSyncedLog
   and that appear in at least RecoveryQuorumSize, i.e.
   `^\left \lceil{f/2}\right \rceil +1^', of the given log sets, and return
   the survivors as a sorted sequence. *)
\* Number of log sets in logll that contain entry x
CountVotes(logll, x) == Cardinality({ voterLog \in logll : x \in voterLog })
MergeUnSyncLogs(unSyncedLogs, lastSyncedLog) ==
    LET
        \* An entry survives if it is newer than the last synced entry
        \* and enough log sets contain it
        Survives(entry) ==
            /\ EntryLessThan(lastSyncedLog, entry)
            /\ CountVotes(unSyncedLogs, entry) >= RecoveryQuorumSize
    IN
        GetSortLogSeq({ entry \in UNION unSyncedLogs : Survives(entry) })
\* `^\textbf{Network Helpers}^'
\* Put every message of the set ms into the network
Send(ms) == messages' = messages \union ms
\* Turn a client request into a log entry; the entry's deadline is the sum of
\* the request's submission time s and latency bound l
Req2Log(req) ==
    [ mtype     |-> MLogEntry,
      clientID  |-> req.sender,
      requestID |-> req.requestID,
      deadline  |-> req.s + req.l,
      command   |-> req.command ]
\* A log index is a log entry without its command field: the command is the
\* body of the request/log and can be very large
GetLogIndex(entry) ==
    [ mtype     |-> MLogIndex,
      clientID  |-> entry.clientID,
      requestID |-> entry.requestID,
      deadline  |-> entry.deadline ]
\* Build a log index from a reply message; the reply's dest field supplies the clientID
GetLogIndexFromReply(reply) ==
    [ mtype     |-> MLogIndex,
      clientID  |-> reply.dest,
      requestID |-> reply.requestID,
      deadline  |-> reply.deadline ]
\* TRUE iff index and msg agree on deadline, clientID and requestID
IndexEq(index, msg) ==
    /\ index.deadline = msg.deadline
    /\ index.clientID = msg.clientID
    /\ index.requestID = msg.requestID
\* Extend msg with a field tl holding replica r's local clock (for easy debug)
Msg2RLog(msg, r) == msg @@ [ tl |-> vReplicaClock[r] ]
\* Last entry of a log, or the dummy NULLLog if the log is empty
LastLog(logList) ==
    IF Len(logList) > 0 THEN logList[Len(logList)] ELSE NULLLog
\* Entry-wise maximum of two crash vectors
MergeCrashVector(cv1, cv2) ==
    [ rep \in Replicas |-> IF cv1[rep] >= cv2[rep] THEN cv1[rep] ELSE cv2[rep] ]
\* Replica r checks message m against its crash vector.
\* FALSE if the sender's counter inside m.cv is behind the counter r already
\* knows for that sender (potential stray message); otherwise r merges m.cv
\* into its own crash vector.
CheckCrashVector(m, r) ==
    /\ m.cv[m.sender] >= vCrashVector[r][m.sender]
    /\ vCrashVector' = [ vCrashVector EXCEPT ![r] = MergeCrashVector(m.cv, @) ]
\* Keep only the messages whose sender counter is not behind cv
FilterStrayMessage(MSet, cv) ==
    { msg \in MSet : msg.cv[msg.sender] >= cv[msg.sender] }
--------------------------------------------------------------------------------
(* `^\textbf{\large Message Handlers and Actions }^' *)
\* `^\textbf{Client action}^'
\* Client c broadcasts its next request to every replica.
\* A client sends at most one request per tick (its clock advances by one as
\* part of this action), stops once its clock has reached MaxTime, and sends
\* at most MaxReqNum requests overall.
ClientSendRequest(c) ==
    LET
        nextReqID == vClientReqNum[c] + 1
        \* The copy of the request addressed to replica r
        RequestFor(r) ==
            [ mtype     |-> MClientRequest,
              sender    |-> c,                \* clientID
              dest      |-> r,
              requestID |-> nextReqID,        \* requestID
              command   |-> "",
              s         |-> vClientClock[c],  \* submission time
              l         |-> LatencyBounds[c]  \* latency bound
            ]
    IN
        /\ vClientClock[c] < MaxTime
        /\ vClientReqNum[c] < MaxReqNum
        /\ Send({ RequestFor(r) : r \in Replicas })
        /\ vClientClock' = [ vClientClock EXCEPT ![c] = @ + 1 ]
        /\ vClientReqNum' = [ vClientReqNum EXCEPT ![c] = nextReqID ]
        /\ UNCHANGED << replicaVars >>
\* TRUE iff logSet contains an entry with the same clientID and requestID as
\* `entry' (the deadline is not compared)
Duplicate(entry, logSet) ==
    \E known \in logSet : /\ known.clientID = entry.clientID
                          /\ known.requestID = entry.requestID
\* Replica r receives MClientRequest, m
\* Enabled only if r is in StNormal and the request is not a duplicate of an
\* entry in r's log or early buffer (duplicates are not appended again;
\* execution results are not modelled in this spec).
\*   - If the request is ordered after the last released log entry, it enters
\*     the early buffer unchanged.
\*   - If it is ordered before the last released log entry:
\*       (1) a follower puts it into its late buffer;
\*       (2) the leader rewrites its deadline to (last released deadline + 1)
\*           so that it becomes eligible for the early buffer.
HandleClientRequest(r, m) ==
    LET
        mlog == Req2Log(m)
        lastReleased == LastLog(vLog[r])
        \* mlog with its deadline moved just behind the last released entry
        bumped == [ mtype     |-> MLogEntry,
                    clientID  |-> mlog.clientID,
                    requestID |-> mlog.requestID,
                    deadline  |-> lastReleased.deadline + 1,
                    command   |-> mlog.command ]
        \* Add `entry' to r's early buffer; nothing else changes
        IntoEarlyBuffer(entry) ==
            /\ vEarlyBuffer' = [ vEarlyBuffer EXCEPT ![r] = @ \cup { entry } ]
            /\ UNCHANGED << networkVars, clientVars,
                            vLog, vViewID, vReplicaClock,
                            vLastNormView, vViewChanges, vReplicaStatus,
                            vSyncPoint, vLateBuffer,
                            vTentativeSync, vSyncReps, vCommitPoint,
                            vUUIDCounter, vCrashVector,
                            vCrashVectorReps, vRecoveryReps >>
        \* Add `entry' to r's late buffer; nothing else changes
        IntoLateBuffer(entry) ==
            /\ vLateBuffer' = [ vLateBuffer EXCEPT ![r] = @ \cup { entry } ]
            /\ UNCHANGED << networkVars, clientVars,
                            vLog, vEarlyBuffer, vViewID, vReplicaClock,
                            vLastNormView, vViewChanges, vReplicaStatus,
                            vSyncPoint, vTentativeSync,
                            vSyncReps, vCommitPoint,
                            vUUIDCounter, vCrashVector,
                            vCrashVectorReps, vRecoveryReps >>
    IN
        /\ ~Duplicate(mlog, Seq2Set(vLog[r]) \union vEarlyBuffer[r])
        /\ vReplicaStatus[r] = StNormal
        /\ \/ /\ EntryLessThan(lastReleased, mlog)
              /\ IntoEarlyBuffer(mlog)
           \/ /\ EntryLessThan(mlog, lastReleased)
              /\ IF r = Leader(vViewID[r])
                 THEN IntoEarlyBuffer(bumped) \* this replica is the leader in the current view
                 ELSE IntoLateBuffer(mlog)    \* this replica is a follower in the current view
\* Release relevant requests from vEarlyBuffer and append to vLog,
\* and then send a fast reply
\*   - Released are the buffered entries whose deadline is strictly below r's
\*     clock and that are ordered after r's last log entry; they are appended
\*     to the log in sorted order.
\*   - Every entry whose deadline is below the clock leaves the early buffer,
\*     whether or not it was appended.
\*   - One fast reply is sent per newly appended entry; its hash covers the
\*     log prefix up to that entry together with r's own crash vector
\*     (vCrashVector[r] -- a replica only knows its own vector, not the
\*     vectors held by the other replicas).
\*   - On the leader, the sync point always equals the log length.
FlushEarlyBuffer(r) ==
    LET
        validLogSet == { x \in vEarlyBuffer[r] :
                            /\ x.deadline < vReplicaClock[r] \* < rather than <=
                            /\ EntryLessThan(LastLog(vLog[r]), x) }
        validLogs == GetSortLogSeq(validLogSet)
        newLogStart == Len(vLog[r]) + 1
    IN
        /\ vLog' = [ vLog EXCEPT ![r] = vLog[r] \o validLogs ]
        /\ vEarlyBuffer' = [ vEarlyBuffer EXCEPT ![r]
                = { x \in vEarlyBuffer[r] : x.deadline >= vReplicaClock[r] } ] \* >= rather than >
        /\ Send({ [ mtype      |-> MFastReply,
                    sender     |-> r,
                    dest       |-> vLog'[r][i].clientID,
                    viewID     |-> vViewID[r],
                    requestID  |-> vLog'[r][i].requestID,
                    hash       |-> [
                                     log |-> SubSeq(vLog'[r], 1, i),
                                     cv  |-> vCrashVector[r]
                                   ],
                    deadline   |-> vLog'[r][i].deadline,
                    logSlotNum |-> i
                  ] : i \in newLogStart..Len(vLog'[r]) })
        /\ vSyncPoint' = IF r = Leader(vViewID[r])
                         THEN [ vSyncPoint EXCEPT ![r] = Len(vLog'[r]) ]
                         ELSE vSyncPoint
        /\ UNCHANGED << clientVars, vViewID, vLastNormView, vViewChanges,
                        vReplicaStatus, vReplicaClock, vLateBuffer,
                        vTentativeSync, vSyncReps, vCommitPoint,
                        vUUIDCounter, vCrashVector,
                        vCrashVectorReps, vRecoveryReps >>
\* Replica r's clock jumps to a random value in 1..MaxTime.
\* A random value is used because a clock sync algorithm can give a negative
\* offset, or even fail; Nezha depends on clocks for performance but not for
\* correctness.
\* Once the clock has reached MaxTime it no longer moves.
\* A clock move lets the replica release relevant requests and append them
\* to its log (see FlushEarlyBuffer).
ReplicaClockMove(r) ==
    /\ vReplicaClock' =
            IF vReplicaClock[r] < MaxTime
            THEN [ vReplicaClock EXCEPT ![r] = RandomElement(1..MaxTime) ]
            ELSE vReplicaClock
    /\ UNCHANGED << networkVars, clientVars,
                    vLog, vEarlyBuffer, vViewID,
                    vLastNormView, vViewChanges, vReplicaStatus,
                    vSyncPoint, vLateBuffer, vTentativeSync,
                    vSyncReps, vCommitPoint,
                    vUUIDCounter, vCrashVector,
                    vCrashVectorReps, vRecoveryReps >>
\* Client clock move does not change any other things
\* Client c's clock jumps to a random value in 1..MaxTime; once it has
\* reached MaxTime it no longer moves. Only vClientClock may change, so the
\* network, all replica variables and the request counter are left unchanged.
ClientClockMove(c) ==
    /\ IF vClientClock[c] < MaxTime THEN
           vClientClock' = [
               vClientClock EXCEPT ![c] = RandomElement(1..MaxTime)
           ]
       ELSE
           UNCHANGED vClientClock
    /\ UNCHANGED << networkVars, replicaVars, vClientReqNum >>
--------------------------------------------------------------------------------
\* `^\textbf{\large Index Synchronization to Fix Set Inequality}^'
\* Leader replica r starts index synchronization:
\* while in StNormal and holding at least one log entry, the leader of the
\* current view sends the set of indices of its whole log to every replica.
StartIndexSync(r) ==
    LET
        indices == { GetLogIndex(vLog[r][pos]) : pos \in 1..Len(vLog[r]) }
        SyncMsgFor(d) == [ mtype      |-> MIndexSync,
                           sender     |-> r,
                           dest       |-> d,
                           viewID     |-> vViewID[r],
                           logindcies |-> indices ]
    IN
        /\ r = Leader(vViewID[r])
        /\ vReplicaStatus[r] = StNormal
        /\ indices /= {} \* leader has log entries to sync
        /\ Send({ SyncMsgFor(d) : d \in Replicas })
        /\ UNCHANGED << clientVars, replicaVars >>
\* The entries of logSeq that match (EntryEq) one of the given indices,
\* as a sorted sequence
GetSyncLogs(logSeq, indices) ==
    GetSortLogSeq({ entry \in Seq2Set(logSeq) :
                        \E index \in indices : EntryEq(index, entry) })
\* The entries of logSeq that are ordered after lastSyncedLog,
\* as a sorted sequence
GetUnSyncLogs(logSeq, lastSyncedLog) ==
    GetSortLogSeq({ entry \in Seq2Set(logSeq) :
                        EntryLessThan(lastSyncedLog, entry) })
\* Replica r receives IndexSync message, m
\* Enabled only on a follower in StNormal, for a message of the current view
\* sent by the current leader that covers more entries than r's sync point.
\* m.logindcies is a SET of log indices (see StartIndexSync), hence its size
\* is taken with Cardinality.
\*   - If r lacks entries for some of the leader's indices, it asks all the
\*     other replicas for them and leaves its log and sync point untouched.
\*   - Otherwise r rebuilds its log as: the entries matching the LEADER's
\*     indices (sorted), followed by its own entries ordered after the last
\*     of those. The sync point becomes the length of the synced part, and a
\*     slow reply is sent for every synced entry.
HandleIndexSync(r, m) ==
    /\ r /= Leader(vViewID[r])
    /\ vReplicaStatus[r] = StNormal
    /\ m.viewID = vViewID[r]
    /\ m.sender = Leader(vViewID[r])
    /\ vSyncPoint[r] < Cardinality(m.logindcies)
    /\ LET
           indices == { GetLogIndex(vLog[r][i]) : i \in 1..Len(vLog[r]) }
           missedEntries == m.logindcies \ indices
       IN
           \* Missing some log entries -> Send MMissEntryRequest
           IF missedEntries /= {} THEN
               /\ Send({ [ mtype  |-> MMissEntryRequest,
                           sender |-> r,
                           dest   |-> d,
                           viewID |-> vViewID[r],
                           miss   |-> missedEntries ] : d \in (Replicas \ {r}) })
               /\ UNCHANGED << vLog, vSyncPoint >>
           \* No missing entries, update vLog and vSyncPoint, and send relevant slow replies
           ELSE
               LET
                   \* Only entries the leader has are synced; entries known
                   \* to r alone stay in the unsynced tail
                   syncLogs == GetSyncLogs(vLog[r], m.logindcies)
                   unsyncLogs == GetUnSyncLogs(vLog[r], LastLog(syncLogs))
               IN
                   /\ vLog' = [ vLog EXCEPT ![r] = syncLogs \o unsyncLogs ]
                   /\ vSyncPoint' = [ vSyncPoint EXCEPT ![r] = Len(syncLogs) ]
                   /\ Send({ [ mtype      |-> MSlowReply,
                               sender     |-> r,
                               dest       |-> vLog'[r][i].clientID,
                               viewID     |-> vViewID[r],
                               requestID  |-> vLog'[r][i].requestID,
                               logSlotNum |-> i ] : i \in (1..Len(syncLogs)) })
    /\ UNCHANGED << clientVars, vEarlyBuffer, vViewID, vReplicaClock,
                    vLastNormView, vViewChanges, vReplicaStatus,
                    vLateBuffer, vTentativeSync, vSyncReps, vCommitPoint,
                    vUUIDCounter, vCrashVector,
                    vCrashVectorReps, vRecoveryReps >>
\* The entries of `log' that match (IndexEq) one of the given indices
FindEntries(log, indices) ==
    { entry \in Seq2Set(log) : \E wanted \in indices : IndexEq(entry, wanted) }
\* Replica r receives a request from other replicas, asking for missing log entries.
\* For a request of r's current view, r answers the requester with those of
\* the requested entries it holds; the action is enabled only if it holds at
\* least one of them.
HandleMissEntryRequest(r, m) ==
    LET
        found == FindEntries(vLog[r], m.miss)
    IN
        /\ m.viewID = vViewID[r]
        /\ found /= {}
        /\ Send({ [ mtype   |-> MMissEntryReply,
                    sender  |-> r,
                    dest    |-> m.sender,
                    viewID  |-> vViewID[r],
                    entries |-> found ] })
        /\ UNCHANGED << clientVars, replicaVars >>
\* Replica r receives a reply from other replicas, providing the missing entries.
\* For a reply of r's current view, the provided entries are merged with the
\* entries of r's log and the union is stored back as a sorted log.
HandleMissEntryReply(r, m) ==
    LET
        allEntries == Seq2Set(vLog[r]) \union m.entries
    IN
        /\ m.viewID = vViewID[r]
        /\ vLog' = [ vLog EXCEPT ![r] = GetSortLogSeq(allEntries) ]
        /\ UNCHANGED << networkVars, clientVars,
                        vEarlyBuffer, vViewID, vReplicaClock,
                        vLastNormView, vViewChanges, vReplicaStatus,
                        vSyncPoint, vLateBuffer,
                        vTentativeSync, vSyncReps, vCommitPoint,
                        vUUIDCounter, vCrashVector,
                        vCrashVectorReps, vRecoveryReps >>
--------------------------------------------------------------------------------
\* `^\textbf{\large Replica Rejoin}^'
\* Replica r fails: it switches to StRecovering and all of its state is reset
\* to the initial values, except for its clock and its UUID counter, which
\* are kept.
StartReplicaFail(r) ==
    LET
        zeroVector == [ rr \in Replicas |-> 0 ]
    IN
        \* We assume at most F replicas can fail at the same time
        /\ NumofReplicas(StRecovering) < F
        /\ vReplicaStatus'   = [ vReplicaStatus   EXCEPT ![r] = StRecovering ]
        /\ vViewID'          = [ vViewID          EXCEPT ![r] = 1 ]
        /\ vLastNormView'    = [ vLastNormView    EXCEPT ![r] = 1 ]
        /\ vLog'             = [ vLog             EXCEPT ![r] = << >> ]
        /\ vEarlyBuffer'     = [ vEarlyBuffer     EXCEPT ![r] = {} ]
        /\ vLateBuffer'      = [ vLateBuffer      EXCEPT ![r] = {} ]
        /\ vViewChanges'     = [ vViewChanges     EXCEPT ![r] = {} ]
        /\ vSyncReps'        = [ vSyncReps        EXCEPT ![r] = {} ]
        /\ vCrashVectorReps' = [ vCrashVectorReps EXCEPT ![r] = {} ]
        /\ vRecoveryReps'    = [ vRecoveryReps    EXCEPT ![r] = {} ]
        /\ vSyncPoint'       = [ vSyncPoint       EXCEPT ![r] = 0 ]
        /\ vTentativeSync'   = [ vTentativeSync   EXCEPT ![r] = 0 ]
        /\ vCommitPoint'     = [ vCommitPoint     EXCEPT ![r] = 0 ]
        /\ vCrashVector'     = [ vCrashVector     EXCEPT ![r] = zeroVector ]
        /\ UNCHANGED << vReplicaClock, vUUIDCounter, clientVars, networkVars >>
\* Recovering replica starts recovery (by first sending CrashVectorReq):
\* r increments its UUID counter and broadcasts a CrashVectorReq that carries
\* the new counter value as nonce.
StartReplicaRecovery(r) ==
    LET
        freshNonce == vUUIDCounter[r] + 1
    IN
        /\ vReplicaStatus[r] = StRecovering
        /\ vUUIDCounter' = [ vUUIDCounter EXCEPT ![r] = freshNonce ]
        /\ Send({ [ mtype  |-> MCrashVectorReq,
                    sender |-> r,
                    dest   |-> d,
                    nonce  |-> freshNonce ] : d \in Replicas })
        /\ UNCHANGED << vLog, vEarlyBuffer, vViewID, vReplicaClock,
                        vLastNormView, vViewChanges, vReplicaStatus,
                        vSyncPoint, vLateBuffer,
                        vTentativeSync, vSyncReps, vCommitPoint,
                        vCrashVector, vCrashVectorReps, vRecoveryReps,
                        clientVars >>
\* Replica r receives MCrashVectorReq, m.
\* Only a replica in StNormal status answers. The reply echoes the request's nonce
\* and carries r's current crash vector; r's own state is not modified.
HandleCrashVectorReq(r, m) ==
/\ vReplicaStatus[r] = StNormal
/\ Send({[ mtype |-> MCrashVectorRep,
sender |-> r,
dest |-> m.sender,
nonce |-> m.nonce,
cv |-> vCrashVector[r] ]})
/\ UNCHANGED << replicaVars, clientVars >>
\* Recovering replica r receives MCrashVectorRep, m.
\* The reply is accepted only while r is recovering, the nonce matches r's current
\* UUID counter, at most F replies have been collected so far, and DuplicateRep does
\* not flag it. The reply is recorded and its crash vector is merged into r's own.
\* Once F + 1 replies have been collected, r broadcasts MRecoveryReq with the merged
\* crash vector.
HandleCrashVectorRep(r, m) ==
    /\ vReplicaStatus[r] = StRecovering
    /\ vUUIDCounter[r] = m.nonce
    /\ Cardinality(vCrashVectorReps[r]) <= F
    \* NOTE(review): HandleRecoveryRep passes m.sender (not m) as the second
    \* argument of DuplicateRep; confirm which form DuplicateRep expects.
    /\ ~DuplicateRep(vCrashVectorReps[r],m)
    /\ vCrashVectorReps' = [ vCrashVectorReps EXCEPT ![r] = vCrashVectorReps[r] \cup {m} ]
    /\ vCrashVector' = [ vCrashVector EXCEPT ![r] = MergeCrashVector(vCrashVector[r], m.cv) ]
    \* Count the replies collected by r itself: vCrashVectorReps' is a function over
    \* replicas, so the cardinality must be taken of vCrashVectorReps'[r].
    /\ IF Cardinality(vCrashVectorReps'[r]) = F + 1 THEN \* got enough replies and can settle down cv
           Send({[ mtype |-> MRecoveryReq,
                   sender |-> r,
                   dest |-> d,
                   nonce |-> m.nonce,
                   cv |-> vCrashVector'[r] ]: d \in Replicas })
       ELSE
           UNCHANGED << networkVars >>
    \* Everything except vCrashVectorReps, vCrashVector and the network stays put
    /\ UNCHANGED << vLog, vEarlyBuffer, vViewID, vReplicaClock,
                    vLastNormView, vViewChanges, vReplicaStatus,
                    vSyncPoint, vLateBuffer,
                    vTentativeSync, vSyncReps, vCommitPoint,
                    vUUIDCounter, vRecoveryReps,
                    clientVars >>
\* Replica r receives MRecoveryReq, m.
\* Only a replica in StNormal status answers. r merges the requester's crash vector
\* into its own and replies to the requester with its current view-id and the
\* merged crash vector.
HandleRecoveryReq(r, m) ==
    /\ vReplicaStatus[r] = StNormal
    /\ vCrashVector' = [ vCrashVector EXCEPT ![r] = MergeCrashVector(vCrashVector[r], m.cv) ]
    \* Unicast reply: a single message addressed to the requester (no bound
    \* variable is needed, the record does not depend on one).
    /\ Send({[ mtype |-> MRecoveryRep,
               sender |-> r,
               dest |-> m.sender,
               viewID |-> vViewID[r],
               cv |-> vCrashVector'[r] ]})
    /\ UNCHANGED << vLog, vEarlyBuffer, vViewID, vReplicaClock,
                    vLastNormView, vViewChanges,vReplicaStatus,
                    vSyncPoint, vLateBuffer,
                    vTentativeSync, vSyncReps, vCommitPoint,
                    vUUIDCounter, vCrashVectorReps, vRecoveryReps,
                    clientVars >>
\* Recovering replica r receives MRecoveryRep, m.
\* The reply is accepted only while r is recovering, at most F replies have been
\* collected, DuplicateRep does not flag it and CheckCrashVector(m, r) holds.
\* Accepted replies are stored after stray-message filtering. Once F + 1 replies
\* remain, r asks the leader of the highest reported view for a state transfer.
HandleRecoveryRep(r, m) ==
    /\ vReplicaStatus[r] = StRecovering
    /\ Cardinality(vRecoveryReps[r]) <= F
    \* NOTE(review): HandleCrashVectorRep passes m (not m.sender) as the second
    \* argument of DuplicateRep; confirm which form DuplicateRep expects.
    /\ ~DuplicateRep(vRecoveryReps[r], m.sender)
    /\ CheckCrashVector(m, r)
    (* `~
    /\ vRecoveryReps' = [ vRecoveryReps EXCEPT
                            ![r] = vRecoveryReps[r] \cup {m} ]
    ~'
    *)
    \* Note: After crash vector is updated, those previously accepted messages may also become stray message.
    \* Those messages should also be filtered out.
    /\ vRecoveryReps' = [ vRecoveryReps EXCEPT
           ![r] = FilterStrayMessage(vRecoveryReps[r] \cup {m}, vCrashVector'[r] ) ]
    \* Count the replies kept by r itself: vRecoveryReps' is a function over
    \* replicas, so the cardinality must be taken of vRecoveryReps'[r].
    /\ IF Cardinality(vRecoveryReps'[r]) = F + 1 THEN \* got enough replies
           LET
               newView == Max({ mm.viewID : mm \in vRecoveryReps'[r] })
           IN
               \* Unicast to the leader of newView, resolved with the same Leader
               \* operator the view-change handlers use for message destinations.
               Send({[ mtype |-> MStateTransferReq,
                       sender |-> r,
                       dest |-> Leader(newView),
                       cv |-> vCrashVector'[r] ]})
       ELSE
           UNCHANGED << networkVars >>
    \* vCrashVector is not listed: this action refers to vCrashVector'[r], which is
    \* presumably constrained by CheckCrashVector (defined elsewhere), exactly as in
    \* HandleViewChange. TODO confirm against CheckCrashVector.
    /\ UNCHANGED << vLog, vEarlyBuffer, vViewID, vReplicaClock,
                    vLastNormView, vViewChanges, vReplicaStatus,
                    vSyncPoint, vLateBuffer,
                    vTentativeSync, vSyncReps, vCommitPoint,
                    vUUIDCounter, vCrashVectorReps,
                    clientVars >>
\* Replica r receives MStateTransferReq, m.
\* Only a replica in StNormal status that accepts the message's crash vector
\* answers. The reply carries r's log, sync-point, commit-point, view-id and crash
\* vector so that the requester can install them.
HandleStateTransferReq(r, m) ==
    /\ vReplicaStatus[r] = StNormal
    /\ CheckCrashVector(m, r)
    /\ Send({[ mtype |-> MStateTransferRep,
               sender |-> r,
               dest |-> m.sender,
               \* HandleStateTransferRep installs m.viewID, so the reply must carry it
               viewID |-> vViewID[r],
               log |-> vLog[r],
               sp |-> vSyncPoint[r],
               cp |-> vCommitPoint[r],
               cv |-> vCrashVector'[r] ]})
    /\ UNCHANGED << vLog, vEarlyBuffer, vViewID, vReplicaClock,
                    vLastNormView, vViewChanges,vReplicaStatus,
                    vSyncPoint, vLateBuffer,
                    vTentativeSync, vSyncReps, vCommitPoint,
                    vUUIDCounter, vCrashVectorReps, vRecoveryReps,
                    clientVars >>
\* Recovering replica r receives MStateTransferRep, m.
\* r installs the log, sync-point, commit-point and view-id from the reply, clears
\* its buffers and its collected reply sets, and returns to StNormal status.
\* No message is sent.
HandleStateTransferRep(r, m) ==
    /\ vReplicaStatus[r] = StRecovering
    /\ CheckCrashVector(m, r)
    /\ vLog' = [ vLog EXCEPT ![r] = m.log ]
    /\ vSyncPoint' = [ vSyncPoint EXCEPT ![r] = m.sp ]
    /\ vCommitPoint' = [ vCommitPoint EXCEPT ![r] = m.cp ]
    /\ vViewID' = [ vViewID EXCEPT ![r] = m.viewID ]
    /\ vEarlyBuffer' = [ vEarlyBuffer EXCEPT ![r] = {} ]
    /\ vLastNormView' = [ vLastNormView EXCEPT ![r] = m.viewID ]
    /\ vViewChanges' = [vViewChanges EXCEPT ![r] = {} ]
    /\ vReplicaStatus' = [ vReplicaStatus EXCEPT ![r] = StNormal ]
    /\ vLateBuffer' = [ vLateBuffer EXCEPT ![r] = {} ]
    \* The tentative sync-point restarts from the installed sync-point
    /\ vTentativeSync' = [ vTentativeSync EXCEPT ![r] = m.sp ]
    /\ vSyncReps' = [ vSyncReps EXCEPT ![r] = {} ]
    /\ vCrashVectorReps' = [ vCrashVectorReps EXCEPT ![r] = {} ]
    /\ vRecoveryReps' = [ vRecoveryReps EXCEPT ![r]= {} ]
    \* The action sends nothing, so the network must be pinned explicitly;
    \* otherwise the next-state value of the message set is left unconstrained.
    /\ UNCHANGED << vReplicaClock, vUUIDCounter, clientVars, networkVars >>
--------------------------------------------------------------------------------
\* `^\textbf{\large Leader Change}^'
\* Replica r starts a Leader change.
\* r broadcasts MViewChangeReq for view vViewID[r] + 1, tagged with its crash
\* vector, to every replica in Replicas. r's own state is not modified here; it
\* moves to the new view when it handles its own request in HandleViewChangeReq.
StartLeaderChange(r) ==
/\ Send({[ mtype |-> MViewChangeReq,
sender |-> r,
dest |-> d,
viewID |-> vViewID[r] + 1,
cv |-> vCrashVector[r] ] : d \in Replicas})
/\ UNCHANGED << replicaVars, clientVars >>
\* `^\textbf{View Change Handlers}^'
\* Replica r gets MViewChangeReq, m.
\* If the request names a view higher than r's current one, r enters StViewChange
\* status for that view, forgets previously collected MViewChange messages, sends
\* its MViewChange (split into synced and unsynced log) to the leader of the new
\* view, and re-broadcasts the MViewChangeReq to all replicas.
HandleViewChangeReq(r, m) ==
    LET
        currentViewID == vViewID[r]
        newViewID == Max({currentViewID, m.viewID})
    IN
    \* Recovering replica does not participate in view change
    /\ vReplicaStatus[r] /= StRecovering
    \* Only a strictly higher view triggers a change
    /\ currentViewID /= newViewID
    /\ CheckCrashVector(m, r)
    /\ vReplicaStatus' = [ vReplicaStatus EXCEPT ![r] = StViewChange ]
    /\ vViewID' = [ vViewID EXCEPT ![r] = newViewID ]
    /\ vViewChanges' = [ vViewChanges EXCEPT ![r] = {} ]
    \* All outgoing messages of this step are combined into a single Send.
    \* NOTE(review): the messages carry the pre-state vCrashVector[r], while
    \* HandleStateTransferReq sends vCrashVector'[r]; confirm this is intended.
    /\ Send({[ mtype |-> MViewChange,
               dest |-> Leader(newViewID),
               sender |-> r,
               viewID |-> newViewID,
               lastNormal |-> vLastNormView[r],
               syncedLog |-> SubSeq(vLog[r], 1, vSyncPoint[r]),
               unsyncedLog|-> SubSeq(vLog[r], vSyncPoint[r]+1, Len(vLog[r])),
               cv |-> vCrashVector[r] ]} \cup
            \* Send the MViewChangeReqs in case this is an entirely new view
            {[ mtype |-> MViewChangeReq,
               sender |-> r,
               dest |-> d,
               viewID |-> newViewID,
               cv |-> vCrashVector[r] ] : d \in Replicas})
    /\ UNCHANGED << clientVars, vLog, vEarlyBuffer, vReplicaClock,
                    vLastNormView, vSyncPoint, vLateBuffer,
                    vTentativeSync, vSyncReps, vCommitPoint,
                    vUUIDCounter, vCrashVectorReps, vRecoveryReps >>
\* Replica r receives MViewChange, m.
\* Only the leader of the view named in m, while in StViewChange status for that
\* view, collects these messages. When the collected messages come from a quorum
\* that includes r itself, r builds the log for the new view and broadcasts
\* MStartView to every replica.
\* NOTE(review): the MStartView record below has no sender or cv field, yet
\* HandleStartView applies CheckCrashVector(m, r) to it; confirm against the
\* definition of CheckCrashVector.
HandleViewChange(r, m) ==
\* Recovering replica does not participate in view change
/\ vReplicaStatus[r] /= StRecovering
\* Add the message to the log
/\ vViewID[r] = m.viewID
/\ vReplicaStatus[r] = StViewChange
\* This replica is the leader
/\ Leader(vViewID[r]) = r
/\ CheckCrashVector(m, r)
(* `~
/\ vViewChanges' = [ vViewChanges EXCEPT ![r] = vViewChanges[r] \cup {m}]
~'
*)
\* Note: Similar to vRecoveryReps, (potential) stray messages should be filtered out.
/\ vViewChanges' = [ vViewChanges EXCEPT
![r] = FilterStrayMessage(vViewChanges[r] \cup {m}, vCrashVector'[r]) ]
\* If there's enough replies, start the new view
/\ LET
\* A set of messages is a view promise when its senders form a quorum containing r
isViewPromise(M) == /\ { n.sender : n \in M } \in Quorums
/\ \E n \in M : n.sender = r
\* MViewChange messages collected by r for its current view
vCMs == { n \in vViewChanges'[r] :
/\ n.mtype = MViewChange
/\ n.viewID = vViewID[r] }
\* Create the state for the new view
normalViews == { n.lastNormal : n \in vCMs }
\* Choose the largest normal view (i.e. the newest)
lastNormal == (CHOOSE v \in normalViews : \A v2 \in normalViews : v2 <= v)
\* For logs before vSyncPoint (i.e. syncedLog), we directly copy from the bestCandiates
\* For unsyncedLog, we do quorum check to decide which ones should be added to recovery Log
goodCandidates == { o \in vCMs : o.lastNormal = lastNormal }
\* bestCandidate can only be picked from goodCandidates,
\* because previous views may include invalid logs
bestCandidate == CHOOSE n \in goodCandidates:
\A y \in goodCandidates: Len(n.syncedLog) >= Len(y.syncedLog)
\* One set of unsynced entries per good candidate
unSyncedLogs == { Seq2Set(n.unsyncedLog) : n \in goodCandidates }
IN
IF isViewPromise(vCMs) THEN
\* New log = longest synced log followed by the merged unsynced entries
Send({[ mtype |-> MStartView,
dest |-> d,
viewID |-> vViewID[r],
log |-> bestCandidate.syncedLog
\o MergeUnSyncLogs(unSyncedLogs, LastLog(bestCandidate.syncedLog))
] : d \in Replicas })
ELSE
UNCHANGED networkVars
\* vViewChanges and the network change here; vCrashVector is not listed either
/\ UNCHANGED << clientVars, vLog, vEarlyBuffer, vViewID, vReplicaClock,
vLastNormView, vReplicaStatus, vSyncPoint, vLateBuffer,
vTentativeSync, vSyncReps,vCommitPoint,
vUUIDCounter, vCrashVectorReps, vRecoveryReps >>
\* Replica r receives a MStartView, m.
\* Accepted when m names a higher view, or r's current view while r is still in
\* StViewChange status. r adopts the log and view of m, returns to StNormal status,
\* clears both buffers, marks the whole adopted log as synced, and sends one reply
\* per log entry to the entry's client.
\* NOTE(review): this action applies CheckCrashVector(m, r) and also lists
\* vCrashVector as UNCHANGED, whereas HandleViewChangeReq and HandleViewChange
\* leave vCrashVector out of their UNCHANGED lists; confirm against CheckCrashVector.
HandleStartView(r, m) ==
/\ vReplicaStatus[r] /= StRecovering
/\ \/ vViewID[r] < m.viewID
\/ vViewID[r] = m.viewID /\ vReplicaStatus[r] = StViewChange
/\ CheckCrashVector(m, r)
/\ vLog' = [ vLog EXCEPT ![r] = m.log ]
/\ vReplicaStatus' = [ vReplicaStatus EXCEPT ![r] = StNormal ]
/\ vViewID' = [ vViewID EXCEPT ![r] = m.viewID ]
/\ vLastNormView' = [ vLastNormView EXCEPT ![r] = m.viewID ]
/\ vEarlyBuffer' = [ vEarlyBuffer EXCEPT ![r] = {} ] \* clear Early Buffer for the new view
/\ vLateBuffer' = [ vLateBuffer EXCEPT ![r] = {}] \* clear Late Buffer for the new view
\* The whole adopted log counts as synced
/\ vSyncPoint' = [ vSyncPoint EXCEPT![r] = Len(m.log) ]
/\ vTentativeSync' = [ vTentativeSync EXCEPT![r] = Len(m.log) ]
\* Send replies (in the new view) for all log items
/\ IF r = Leader(m.viewID) THEN \* Leader only sends fast reply
\* The hash of entry i covers the log prefix 1..i.
\* NOTE(review): cv is the whole vCrashVector function here, not vCrashVector[r]; confirm intended.
Send({[ mtype |-> MFastReply,
sender |-> r,
dest |-> m.log[i].clientID,
viewID |-> m.viewID,
requestID |-> m.log[i].requestID,
hash |-> [
log |-> SubSeq(m.log, 1, i),
cv |-> vCrashVector
],
deadline |-> m.log[i].deadline,
logSlotNum |-> i ] : i \in (1..Len(m.log))})
ELSE \* While starting the view, followers know the log is synced with the leader, so they send slow-reply
Send({[ mtype |-> MSlowReply,
sender |-> r,
dest |-> m.log[i].clientID,
viewID |-> m.viewID,
requestID |-> m.log[i].requestID,
logSlotNum |-> i ] : i \in (1..Len(m.log))})
/\ UNCHANGED << clientVars, vReplicaClock, vViewChanges,
vSyncReps, vCommitPoint, vCrashVector,
vUUIDCounter, vCrashVectorReps, vRecoveryReps >>
--------------------------------------------------------------------------------
\* `^\textbf{\large Periodic Synchronization}^'
\* Leader replica r conduct synchronization periodically
\* This periodic sync process is different from index sync process
\* It ensures that all replicas’ logs are stable up to their CommitPoint (for fast recovery)
\* Our CommitPoint is essentially the `^\emph{sync-point}^' defined in NOPaxos paper
\* Just as mentioned in NOPaxos paper, it is an optional optimization for fast recovery
\* Nezha still works even without this part
\* Leader r, in StNormal status, starts a synchronization round when its log has
\* grown past the tentative sync-point. It discards the replies of earlier rounds,
\* moves the tentative sync-point to the end of its log, and broadcasts its whole
\* log in MSyncPrepare to every replica in Replicas.
StartSync(r) ==
/\ Leader(vViewID[r]) = r
/\ vReplicaStatus[r] = StNormal
/\ vTentativeSync[r] < Len(vLog[r]) \* If >= then no need to sync
\* Replies collected for a previous round are dropped
/\ vSyncReps' = [ vSyncReps EXCEPT ![r] = {} ]
/\ vTentativeSync' = [ vTentativeSync EXCEPT ![r] = Len(vLog[r]) ]
/\ Send({[ mtype |-> MSyncPrepare,
sender |-> r,
dest |-> d,
viewID |-> vViewID[r],
log |-> vLog[r] ] : d \in Replicas })
/\ UNCHANGED << clientVars, vLog, vEarlyBuffer, vViewID, vReplicaClock,
vLastNormView, vViewChanges, vReplicaStatus,
vSyncPoint, vLateBuffer, vCommitPoint,
vUUIDCounter, vCrashVector,
vCrashVectorReps, vRecoveryReps >>
\* Replica r receives MSyncPrepare, m.
\* Accepted in StNormal status when m comes from the leader of r's current view.
\* If m.log is longer than r's sync-point, r adopts m.log as its synced prefix
\* (keeping its own entries that GetUnSyncLogs returns for positions after the last
\* entry of m.log), advances its sync-point and sends a slow-reply for every entry
\* of m.log. In all cases r acknowledges with MSyncRep.
HandleSyncPrepare(r, m) ==
    LET
        newLog == m.log \o GetUnSyncLogs(vLog[r], LastLog(m.log) )
        \* TRUE when this message moves r's sync-point forward
        advances == vSyncPoint[r] < Len(m.log)
        \* Slow replies are only produced when the sync-point advances
        slowReplies == IF advances THEN
                           {[ mtype |-> MSlowReply,
                              sender |-> r,
                              dest |-> m.log[i].clientID,
                              viewID |-> m.viewID,
                              requestID |-> m.log[i].requestID,
                              logSlotNum |-> i ] : i \in (1..Len(m.log))}
                       ELSE
                           {}
        syncRep == [ mtype |-> MSyncRep,
                     sender |-> r,
                     dest |-> m.sender,
                     viewID |-> vViewID[r],
                     logSlotNumber |-> Len(m.log) ]
    IN
    /\ vReplicaStatus[r] = StNormal
    /\ m.viewID = vViewID[r]
    /\ m.sender = Leader(vViewID[r])
    /\ IF advances THEN
           /\ vSyncPoint' = [vSyncPoint EXCEPT ![r] = Len(m.log)]
           /\ vLog' = [ vLog EXCEPT ![r] = newLog ]
       ELSE
           UNCHANGED << vLog, vSyncPoint >>
    \* All outgoing messages go through one Send, as in HandleViewChangeReq:
    \* two conjoined Sends would each constrain the next-state message set.
    /\ Send(slowReplies \cup {syncRep})
    /\ UNCHANGED << clientVars, vEarlyBuffer, vViewID, vReplicaClock,
                    vLastNormView, vViewChanges, vReplicaStatus,
                    vLateBuffer, vTentativeSync, vSyncReps, vCommitPoint,
                    vUUIDCounter, vCrashVector,
                    vCrashVectorReps, vRecoveryReps >>
\* Replica r receives MSyncRep, m.
\* r records the reply. When the replies for the current view that acknowledge
\* exactly r's tentative sync-point come from a quorum including r itself, r
\* advances its commit-point to the tentative sync-point and broadcasts
\* MSyncCommit with the committed log prefix.
HandleSyncRep(r, m) ==
/\ m.viewID = vViewID[r]
/\ vReplicaStatus[r] = StNormal
/\ vSyncReps' = [ vSyncReps EXCEPT ![r] = vSyncReps[r] \cup { m } ]
\* Same quorum test as in HandleViewChange: senders form a quorum that contains r
/\ LET isViewPromise(M) == /\ { n.sender : n \in M } \in Quorums
/\ \E n \in M : n.sender = r
\* Replies that match the current view and the current sync round
sRMs == { n \in vSyncReps'[r] :
/\ n.mtype = MSyncRep
/\ n.viewID = vViewID[r]
/\ n.logSlotNumber = vTentativeSync[r] }
\* Log prefix up to the tentative sync-point (empty when that point is 0)
committedLog == IF vTentativeSync[r] >= 1 THEN
SubSeq(vLog[r], 1, vTentativeSync[r])
ELSE
<< >>
IN
IF isViewPromise(sRMs) THEN
/\ Send({[ mtype |-> MSyncCommit,
sender |-> r,
dest |-> d,
viewID |-> vViewID[r],
log |-> committedLog] :
d \in Replicas })
/\ vCommitPoint' = [ vCommitPoint EXCEPT ![r] = vTentativeSync[r] ]
ELSE
UNCHANGED << networkVars, vCommitPoint >>
/\ UNCHANGED << clientVars, vLog, vEarlyBuffer, vViewID,
vReplicaClock, vLastNormView, vViewChanges,
vReplicaStatus, vSyncPoint, vLateBuffer,
vTentativeSync, vUUIDCounter, vCrashVector,
vCrashVectorReps, vRecoveryReps >>
\* Replica r receives MSyncCommit, m.
\* Accepted in StNormal status when m comes from the leader of r's current view.
\* If m.log extends past r's commit-point, r adopts m.log as its committed prefix
\* (keeping its own entries that GetUnSyncLogs returns for positions after the last
\* entry of m.log), advances its commit-point and sends a slow-reply for every
\* entry of m.log. Otherwise nothing changes.
HandleSyncCommit(r, m) ==
    LET
        newLog == m.log \o GetUnSyncLogs(vLog[r], LastLog(m.log) )
    IN
    /\ vReplicaStatus[r] = StNormal
    /\ m.viewID = vViewID[r]
    /\ m.sender = Leader(vViewID[r])
    /\ IF Len(m.log) <= vCommitPoint[r] THEN
           \* Stale commit: the network is left untouched only in this branch
           UNCHANGED << networkVars, vLog, vCommitPoint >>
       ELSE
           /\ vLog' = [ vLog EXCEPT ![r] = newLog ]
           /\ vCommitPoint' = [ vCommitPoint EXCEPT ![r] = Len(m.log) ]
           /\ Send({[ mtype |-> MSlowReply,
                      sender |-> r,
                      dest |-> m.log[i].clientID,
                      viewID |-> m.viewID,
                      requestID |-> m.log[i].requestID,
                      logSlotNum |-> i ] : i \in (1..Len(m.log))})
    \* networkVars is deliberately not listed here: the ELSE branch sends
    \* messages, which an unconditional UNCHANGED networkVars would contradict.
    /\ UNCHANGED << clientVars, vEarlyBuffer,
                    vViewID, vReplicaClock, vLastNormView, vViewChanges,
                    vReplicaStatus, vSyncPoint, vLateBuffer,
                    vTentativeSync, vSyncReps,
                    vUUIDCounter, vCrashVector,
                    vCrashVectorReps, vRecoveryReps >>
--------------------------------------------------------------------------------
(* `^\textbf{\large Invariants and Helper Functions}^' *)
(*
A request/log is committed in two possible cases:
(1) A fast quorum has sent either slow-reply messages, or fast-reply messages with consistent hashes [Fast Path]
(2) A simple quorum has sent slow-reply messages [Slow Path]
Both quorums should include the leader
*)
\* Check whether log is committed at position logSlotNum.
\* TRUE when the message set contains, for the given client, request and log slot,
\* either a fast-path or a slow-path witness set M as described below.
Committed(clientID, requestID, logSlotNum) ==
\* Fast path
\* Candidate replies: fast or slow replies for this slot, client and request
\/ \E M \in SUBSET ({m \in messages : /\ \/ m.mtype = MFastReply
\/ m.mtype = MSlowReply
/\ m.logSlotNum = logSlotNum
/\ m.dest = clientID
/\ m.requestID = requestID }) :
\* Sent from a fast quorum
/\ { m.sender : m \in M } \in FastQuorums
\* Matching view-id
/\ \E m1 \in M : \A m2 \in M : m1.viewID = m2.viewID
\* One from the leader
/\ \E m \in M : m.sender = Leader(m.viewID)
\* Hash values are consistent
/\ LET
leaderReply == CHOOSE m \in M : m.sender = Leader(m.viewID)
IN
\* Only fast replies are compared against the leader's hash
\A m1 \in M : IF m1.mtype = MFastReply THEN
m1.hash = leaderReply.hash
ELSE
TRUE \* SlowReply has consistent hash for sure
\* Slow path
\* Candidate replies: slow replies, plus the fast reply sent by the leader
\/ \E M \in SUBSET ({m \in messages : /\ \/ m.mtype = MSlowReply
\/ /\ m.mtype = MFastReply \* Leader only sends fast-reply
/\ m.sender =Leader(m.viewID)
/\ m.logSlotNum = logSlotNum
/\ m.dest = clientID
/\ m.requestID = requestID }) :
\* Sent from a simple quorum
/\ { m.sender : m \in M } \in Quorums
\* Matching view-id
/\ \E m1 \in M : \A m2 \in M : m1.viewID = m2.viewID
\* One from the leader
/\ \E m \in M : m.sender = Leader(m.viewID)
\* Check whether log is committed in view viewID.
\* Same structure as Committed, but the candidate replies are selected by view-id
\* instead of by log slot.
CommittedInView(clientID, requestID, viewID) ==
\* Fast path
\/ \E M \in SUBSET ({m \in messages : /\ \/ m.mtype = MFastReply
\/ m.mtype = MSlowReply
/\ m.dest = clientID
/\ m.requestID = requestID
/\ m.viewID = viewID}) :
\* Sent from a fast quorum
/\ { m.sender : m \in M } \in FastQuorums
\* One from the leader
/\ \E m \in M : m.sender = Leader(m.viewID)
\* Hash values are the same
/\ LET
leaderReply == CHOOSE m \in M : m.sender = Leader(m.viewID)
IN
\* Only fast replies are compared against the leader's hash
\A m1 \in M : IF m1.mtype = MFastReply THEN
m1.hash = leaderReply.hash
ELSE
TRUE \* SlowReply has consistent hash for sure
\* Slow path
\/ \E M \in SUBSET ({m \in messages : /\ \/ m.mtype = MSlowReply
\/ /\ m.mtype = MFastReply \* Leader only sends fast-reply
/\ m.sender = Leader(m.viewID)
/\ m.dest = clientID
/\ m.requestID = requestID
/\ m.viewID = viewID}) :
/\ { m.sender : m \in M } \in Quorums
\* Hash values are the same
\* NOTE(review): M may contain MSlowReply records, and the MSlowReply records
\* built in HandleStartView, HandleSyncPrepare and HandleSyncCommit have no hash
\* field; the slow path of Committed has no such check. Confirm this conjunct.
/\ \E m1 \in M : \A m2 \in M : m1.hash = m2.hash
\* One from the leader
/\ \E m \in M : m.sender = Leader(m.viewID)
\* TRUE when the system has entered view viewID (or a later one): at least
\* QuorumSize replicas are in StNormal status with a last-normal view >= viewID,
\* and the leader of viewID has itself reached a last-normal view >= viewID.
SystemRecovered(viewID) == /\ \E RM \in SUBSET(Replicas):
/\ Cardinality(RM) >= QuorumSize
/\ \A r \in RM: vLastNormView[r] >= viewID
/\ \A r \in RM: vReplicaStatus[r] = StNormal \* These replicas must be normal
\* The leader of this view has also recovered or even goes beyond this view
/\ vLastNormView[Leader(viewID)] >= viewID
(* `^\textbf{Invariants}^' *)
\* Durability: Committed Requests always survive failure
\* i.e. If a request is committed in one view, then it will remain committed in the higher views
\* One thing to note, the check of "committed" only happens when the system is still "normal"
\* While the system is under recovery (i.e. less than f+1 replicas are normal),
\* the check of committed does not make sense
\* The invariant is stated as the absence of a counterexample: no pair of views
\* v1 < v2 with a request committed in v1 but not in v2 once SystemRecovered(v2) holds.
Durability == \A v1, v2 \in 1..MaxViews:
\* If a request is committed in lower view (v1,),
\* it is impossible to make this request uncommited in higher view (v2)
~(/\ v1 < v2
\* To check Durability of request in higher views,
\* the system should have entered the higher views
/\ SystemRecovered(v2)
\* Here r ranges over request numbers, not replicas
/\ \E c \in Clients :
\E r \in 1..MaxReqNum:
/\ CommittedInView(c,r, v1)
/\ ~CommittedInView(c,r, v2))
\* Consistency: Committed requests have the same history even after view changes
\* i.e. If a request is committed in a lower view (v1), then (based on Durability Property)
\* it remains committed in higher view (v2)
\* Consistency requires the history of the request (i.e. all the request before this request) remain the same
\* The history is compared through the hash field of the leader's fast reply in
\* each of the two views.
Consistency ==
\A v1, v2 \in 1..MaxViews:
~(/\ v1 < v2
\* To check Consistency of request in higher views,
\* the system should have entered the higher views
/\ SystemRecovered(v2)
\* Here r ranges over request numbers and t over deadlines
/\ \E c \in Clients :
\E r \in 1..MaxReqNum:
\E t \in 1..MaxTime:
\* Durability has been checked in another invariant
/\ CommittedInView(c,r, v1)
/\ CommittedInView(c,r, v2)
\* NOTE(review): each CHOOSE below presumes that a matching leader fast
\* reply with deadline t exists in messages; confirm that this holds
\* whenever both CommittedInView conjuncts are TRUE.
/\ LET
v1LeaderReply == CHOOSE m \in messages:
/\ m.mtype = MFastReply
/\ m.deadline = t
/\ m.dest = c
/\ m.requestID = r
/\ m.viewID = v1
/\ m.sender = Leader(v1)
v2LeaderReply == CHOOSE m \in messages:
/\ m.mtype = MFastReply
/\ m.deadline = t
/\ m.dest = c
/\ m.requestID = r
/\ m.viewID = v2
/\ m.sender = Leader(v2)
IN
v1LeaderReply.hash /= v2LeaderReply.hash)
\* Linearizability: Only one request can be committed for a given position
\* i.e. If one request has committed at position i, then no contrary observation can be made
\* i.e. there cannot be a second request committed at the same position
\* Positions are checked from 1 up to the largest log slot number that appears in
\* any fast or slow reply sent so far (at least 1).
Linearizability ==
LET
\* {1} keeps Max well-defined while no reply has been sent yet
maxLogPosition == Max({1} \cup
{ m.logSlotNum : m \in {m \in messages :
\/ m.mtype = MFastReply
\/ m.mtype = MSlowReply } })
\* Two distinct <<client, request>> pairs must never both be committed at slot i
IN ~(\E c1, c2 \in Clients :
\E r1, r2 \in 1..MaxReqNum:
/\ << c1, r1 >> /= << c2, r2 >>
/\ \E i \in (1 .. maxLogPosition) :
/\ Committed(c1, r1, i)
/\ Committed(c2, r2, i)
)
(* `~
SyncSafety == \A r \in Replicas :
\A i \in 1..vSyncPoint[r] :
IF SystemRecovered(vViewID[r]) THEN
\* Committed can only be checked when the system is recovered
\* (i.e. when there are f+1 replicas alive)
Committed(vLog[r][i].ta,vLog[r][i].clientID, vLog[r][i].reqID, i)
ELSE
TRUE
~'
*)
--------------------------------------------------------------------------------
(* `^\textbf{\large Main Transition Function}^' *)
\* Next-state relation: a disjunction of every step the model can take.
\* Message-handling disjuncts pick a message m that is not yet in the destination's
\* processed set, run the handler for its type, and add Msg2RLog(m, m.dest) to
\* vReplicaProcessed[m.dest]. All other disjuncts leave both processed sets
\* unchanged. Every disjunct records the action name in DebugAction.
\* NOTE(review): no disjunct dispatches MCrashVectorReq, MCrashVectorRep,
\* MRecoveryReq, MRecoveryRep, MStateTransferReq or MStateTransferRep, so the
\* recovery handlers defined in this spec are not reachable from Next; confirm
\* whether that is intentional.
\* NOTE(review): the guard tests m itself for membership while the set stores
\* Msg2RLog(m, m.dest); confirm against the definition of Msg2RLog.
Next == \* Handle Messages
\/ \E m \in messages :
/\ m.mtype = MClientRequest
/\ m \notin vReplicaProcessed[m.dest]
/\ HandleClientRequest(m.dest, m)
/\ vReplicaProcessed' =
[vReplicaProcessed EXCEPT ![m.dest] =
vReplicaProcessed[m.dest] \cup { Msg2RLog(m, m.dest) } ]
/\ UNCHANGED vClientProcessed
/\ DebugAction' = << "HandleClientRequest", m >>
\/ \E m \in messages :
/\ m.mtype = MViewChangeReq
/\ m \notin vReplicaProcessed[m.dest]
/\ HandleViewChangeReq(m.dest, m)
/\ vReplicaProcessed' =
[vReplicaProcessed EXCEPT ![m.dest] =
vReplicaProcessed[m.dest] \cup { Msg2RLog(m, m.dest) } ]
/\ UNCHANGED vClientProcessed
/\ DebugAction' = << "HandleViewChangeReq", m >>
\/ \E m \in messages :
/\ m.mtype = MViewChange
/\ m \notin vReplicaProcessed[m.dest]
/\ HandleViewChange(m.dest, m)
/\ vReplicaProcessed' =
[vReplicaProcessed EXCEPT ![m.dest] =
vReplicaProcessed[m.dest] \cup { Msg2RLog(m, m.dest) } ]
/\ UNCHANGED vClientProcessed
/\ DebugAction' = << "HandleViewChange", m >>
\/ \E m \in messages :
/\ m.mtype = MStartView
/\ m \notin vReplicaProcessed[m.dest]
/\ HandleStartView(m.dest, m)
/\ vReplicaProcessed' =
[vReplicaProcessed EXCEPT ![m.dest] =
vReplicaProcessed[m.dest] \cup { Msg2RLog(m, m.dest) } ]
/\ UNCHANGED vClientProcessed
/\ DebugAction' = << "HandleStartView", m >>
\/ \E m \in messages :
/\ m.mtype = MSyncPrepare
/\ m \notin vReplicaProcessed[m.dest]
/\ HandleSyncPrepare(m.dest, m)
/\ vReplicaProcessed' =
[vReplicaProcessed EXCEPT ![m.dest] =
vReplicaProcessed[m.dest] \cup { Msg2RLog(m, m.dest) } ]
/\ UNCHANGED vClientProcessed
/\ DebugAction' = << "HandleSyncPrepare", m >>
\/ \E m \in messages :
/\ m.mtype = MSyncRep
/\ m \notin vReplicaProcessed[m.dest]
/\ HandleSyncRep(m.dest, m)
/\ vReplicaProcessed' =
[vReplicaProcessed EXCEPT ![m.dest] =
vReplicaProcessed[m.dest] \cup { Msg2RLog(m, m.dest) } ]
/\ UNCHANGED vClientProcessed
/\ DebugAction' = << "HandleSyncRep", m >>
\/ \E m \in messages :
/\ m.mtype = MSyncCommit
/\ m \notin vReplicaProcessed[m.dest]
/\ HandleSyncCommit(m.dest, m)
/\ vReplicaProcessed' =
[vReplicaProcessed EXCEPT ![m.dest] =
vReplicaProcessed[m.dest] \cup { Msg2RLog(m, m.dest) } ]
/\ UNCHANGED vClientProcessed
/\ DebugAction' = << "HandleSyncCommit", m >>
\/ \E m \in messages:
/\ m.mtype = MMissEntryRequest
/\ m \notin vReplicaProcessed[m.dest]
/\ HandleMissEntryRequest(m.dest, m)
/\ vReplicaProcessed' =
[vReplicaProcessed EXCEPT ![m.dest] =
vReplicaProcessed[m.dest] \cup { Msg2RLog(m, m.dest) } ]
/\ UNCHANGED vClientProcessed
/\ DebugAction' = << "HandleMissEntryRequest", m >>
\/ \E m \in messages:
/\ m.mtype = MMissEntryReply
/\ m \notin vReplicaProcessed[m.dest]
/\ HandleMissEntryReply(m.dest, m)
/\ vReplicaProcessed' =
[vReplicaProcessed EXCEPT ![m.dest] =
vReplicaProcessed[m.dest] \cup { Msg2RLog(m, m.dest) } ]
/\ UNCHANGED vClientProcessed
/\ DebugAction' = << "HandleMissEntryReply", m >>
\* Client Actions
\* A client may issue a request until it has sent MaxReqNum of them
\/ \E c \in Clients :
/\ vClientReqNum[c] < MaxReqNum
/\ ClientSendRequest(c)
/\ UNCHANGED << vReplicaProcessed, vClientProcessed >>
/\ DebugAction' = << "ClientSendRequest", "" >>
\* Start Synchronization
\/ \E r \in Replicas :
/\ StartSync(r)
/\ UNCHANGED << vReplicaProcessed, vClientProcessed >>
/\ DebugAction' = << "StartSync", "" >>
\* Replica Fail
\* Only a replica in StNormal status can fail
\/ \E r \in Replicas :
/\ vReplicaStatus[r] = StNormal
/\ StartReplicaFail(r)
/\ UNCHANGED << vReplicaProcessed, vClientProcessed >>
/\ DebugAction' = << "StartReplicaFail", "" >>
\* Leader Change
\* Bounded by MaxViews so that the set of view-ids stays finite
\/ \E r \in Replicas :
/\ vViewID[r] < MaxViews
/\ StartLeaderChange(r)
/\ UNCHANGED << vReplicaProcessed, vClientProcessed >>
/\ DebugAction' = << "StartLeaderChange", "" >>
\* Replica Rejoin
\/ \E r \in Replicas :
/\ vReplicaStatus[r] = StRecovering
/\ StartReplicaRecovery(r)
/\ UNCHANGED << vReplicaProcessed, vClientProcessed >>
/\ DebugAction' = << "StartReplicaRecovery", "" >>
\* Replica Actions:
\/ \E r \in Replicas:
/\ StartIndexSync(r)
/\ UNCHANGED << vReplicaProcessed, vClientProcessed >>
/\ DebugAction' = << "StartIndexSync", "" >>
\* NOTE(review): the debug label below differs from the action name FlushEarlyBuffer
\/ \E r \in Replicas:
/\ FlushEarlyBuffer(r)
/\ UNCHANGED << vReplicaProcessed, vClientProcessed >>
/\ DebugAction' = << "FlushReplicaBuffer", "" >>
\* Clock Move
\/ \E r \in Replicas :
/\ ReplicaClockMove(r)
/\ UNCHANGED << vReplicaProcessed, vClientProcessed >>
/\ DebugAction' = << "ReplicaClockMove", "" >>
\/ \E c \in Clients :
/\ ClientClockMove(c)
/\ UNCHANGED << vReplicaProcessed, vClientProcessed >>
/\ DebugAction' = << "ClientClockMove", "" >>
================================================================================
================================================
FILE: docs/demo.md
================================================
## One-Box Demo
We have prepared the configuration files in ```configs``` folder, these configuration files will be used to launch 3 replicas, 1 proxy and 1 client. Under ```configs``` folder, we have ```local``` folder (for the single-machine test), containing:
- nezha-replica-config-0.yaml
- nezha-replica-config-1.yaml
- nezha-replica-config-2.yaml
- nezha-proxy-config.yaml
- nezha-client-config.yaml
When running distributed tests, the user can refer to the template files (e.g., ```configs/nezha-replica-config-template.yaml```) to generate their customized config files (such as configuring the IP addresses in the config files).
Before running the experiment, we assume the user has generated and copied their configuration files into the ```$HOME/Nezha/configs``` folder.
### View Change Test
**Step 1**: Launch 3 replicas (i.e. replica-0, replica-1, replica-2). Open 3 terminals and launch one replica in each terminal.
```
# In the first terminal (replica-0)
$HOME/Nezha/bazel-bin/replica/nezha_replica --config $HOME/Nezha/configs/nezha-replica-config-0.yaml
# In the second terminal (replica-1)
$HOME/Nezha/bazel-bin/replica/nezha_replica --config $HOME/Nezha/configs/nezha-replica-config-1.yaml
# In the third terminal (replica-2)
$HOME/Nezha/bazel-bin/replica/nezha_replica --config $HOME/Nezha/configs/nezha-replica-config-2.yaml
```

**Step 2**: After the three replicas are launched, we can see the important information displayed in the console logs, e.g. the current view, the replica id of this replica, the number of replicas, and the number of keys maintained by each replica's state machine (for commutativity optimization).

**Step 3**: In view 0, the leader replica is ```viewId%replicaNum=0```, i.e. replica-0. Therefore, killing replica-0 triggers a view change, so we use Ctrl+C to kill replica-0.

**Step 4**: After leader is killed, the remaining 2 replicas start view change to enter a new view, i.e., view 1. In this new view, the leader becomes ```viewId%replicaNum=1```, i.e., replica-1. Since there are still a majority of replicas (i.e., 2 replicas) alive, the system can resume service.

**Step 5**: We want the failed replica to rejoin the system. Therefore, we launch replica-0. This time, we set the flag ```isRecovering``` as true, so that it goes through the recovery procedure and retrieves the state from the other healthy replicas.
```
# In the first terminal
$HOME/Nezha/bazel-bin/replica/nezha_replica --config $HOME/Nezha/configs/nezha-replica-config-0.yaml --isRecovering true
```

**Step 6**: We can see that replica-0 rejoins the system as a follower, and the current view is 1.

The test process can be repeated. So long as there are always a majority of replicas (f+1) remaining, then the system is able to serve clients and failed replicas can also rejoin.
### Test with Client
**Step 0**: Kill all the processes launched in the previous section.
**Step 1**: Similar to the previous section, we launch 3 replicas. In addition, this time we also launch 1 proxy and 1 client. In the client configuration file (i.e. [nezha-client-config.yaml](configs/nezha-client-config.yaml) ), we have specified the client as an open-loop client, and it submits about 1000 requests/second. This time we need to open 5 terminals in total.
```
is-openloop: true
poisson-rate: 10
```
```
# In the first terminal (replica-0)
$HOME/Nezha/bazel-bin/replica/nezha_replica --config $HOME/Nezha/configs/nezha-replica-config-0.yaml
# In the second terminal (replica-1)
$HOME/Nezha/bazel-bin/replica/nezha_replica --config $HOME/Nezha/configs/nezha-replica-config-1.yaml
# In the third terminal (replica-2)
$HOME/Nezha/bazel-bin/replica/nezha_replica --config $HOME/Nezha/configs/nezha-replica-config-2.yaml
# In the fourth terminal (proxy)
$HOME/Nezha/bazel-bin/proxy/nezha_proxy --config $HOME/Nezha/configs/nezha-proxy-config.yaml
# In the fifth terminal (client-1)
$HOME/Nezha/bazel-bin/client/nezha_client --config $HOME/Nezha/configs/nezha-client-config.yaml
```

**Step 2**: After the client is launched, we can see that it keeps submitting requests and the proxy keeps forwarding them on the client's behalf. Every 5 seconds, the client terminal prints a log line showing the stats.

**Step 3**: While the client is submitting requests, we kill the leader (i.e. replica-0). We can see that the remaining 2 replicas rapidly complete the view change and get the new leader, which takes about ```1657418951138477-1657418950947251=191226us=191ms```. The view change completes this quickly because of the periodic synchronization optimization (explained in our paper): thanks to the periodic synchronization, the new leader replica does not need to do state transfer from scratch; it only needs to do state transfer and log merge from the last commit point.

**Step 4**: We want the crashed replica (i.e. replica-0) to rejoin the system. So we set ```isRecovering``` flag as true.
```
# In the first terminal
$HOME/Nezha/bazel-bin/replica/nezha_replica --config $HOME/Nezha/configs/nezha-replica-config-0.yaml --isRecovering true
```

**Step 5**: The crashed replica starts from an empty state, so it needs to retrieve all the log entries in order to recover. Since we are using UDP and by default only fetch 5 entries during each round, the state transfer can take some time if clients have submitted many entries. The progress of the recovery is printed in the terminal of replica-0. Note that the follower's recovery does not block the other healthy replicas from serving the client. An optional optimization under consideration is to generate snapshots periodically and dump them to stable storage. With that, a crashed replica that wants to recover first loads the state from local storage and then does state transfer for the remainder, which shortens the recovery time.

**Step 6**: After replica-0 retrieves all the state, we can see that it successfully recovers and works as a follower.

================================================
FILE: docs/tla-intro.md
================================================
# Nezha TLA+
This repository includes a model-checked TLA+ specification (both the source file and the PDF version) for the Nezha protocol. In addition, we include a document that explains Nezha's recovery in pseudo-code.
================================================
FILE: external/gogoprotobuf.BUILD
================================================
# Build file overlaid on the gogoprotobuf sources: publishes gogo.proto as a
# proto_library that any package may depend on.
package(default_visibility = ["//visibility:public"])

proto_library(
    name = "gogo_proto",
    srcs = ["gogoproto/gogo.proto"],
    deps = ["@com_google_protobuf//:descriptor_proto"],
)
================================================
FILE: external/googleapi.BUILD
================================================
# Build file overlaid on the googleapis sources: publishes the HTTP annotation
# protos as proto_library targets that any package may depend on.
package(default_visibility = ["//visibility:public"])

# google/api/http.proto, needed by annotations.proto below.
proto_library(
    name = "http_proto",
    srcs = ["google/api/http.proto"],
)

# google/api/annotations.proto together with what it depends on.
proto_library(
    name = "annotations_proto",
    srcs = ["google/api/annotations.proto"],
    deps = [
        ":http_proto",
        "@com_google_protobuf//:descriptor_proto",
    ],
)
================================================
FILE: lib/BUILD
================================================
load("@rules_proto//proto:defs.bzl", "proto_library")
# Header-only library: the header is published through hdrs only (listing the
# same file again in srcs is redundant).
cc_library(
    name = "zipfian",
    hdrs = ["zipfian.h"],
    visibility = ["//visibility:public"],
)
# Header-only library: the header is published through hdrs only (listing the
# same file again in srcs is redundant).
cc_library(
    name = "common_type",
    hdrs = ["common_type.h"],
    visibility = ["//visibility:public"],
)
# Header-only library: the header is published through hdrs only (listing the
# same file again in srcs is redundant).
cc_library(
    name = "common_struct",
    hdrs = ["common_struct.h"],
    visibility = ["//visibility:public"],
    deps = [
        ":common_type",
    ],
)
# Address class: public header address.h, implementation in address.cc.
cc_library(
    name = "address",
    hdrs = ["address.h"],
    srcs = ["address.cc"],
    visibility = ["//visibility:public"],
)
# Header-only library: the header is published through hdrs only (listing the
# same file again in srcs is redundant).
cc_library(
    name = "message_handler",
    hdrs = ["message_handler.h"],
    visibility = ["//visibility:public"],
    deps = [
        ":address",
        ":common_type",
    ],
)
# Header-only library: the header is published through hdrs only (listing the
# same file again in srcs is redundant).
cc_library(
    name = "timer",
    hdrs = ["timer.h"],
    visibility = ["//visibility:public"],
    deps = [
        ":address",
        ":common_type",
    ],
)
# Endpoint library: public header endpoint.h, implementation in endpoint.cc.
# Depends on the local address/struct/handler/timer targets plus libev, glog,
# protobuf and openssl.
cc_library(
    name = "endpoint",
    hdrs = ["endpoint.h"],
    srcs = ["endpoint.cc"],
    deps = [
        ":address",
        ":common_struct",
        ":message_handler",
        ":timer",
        "@com_github_enki_libev//:libev",
        "@com_github_google_glog//:glog",
        "@com_google_protobuf//:protobuf",
        "@openssl//:openssl",
    ],
    visibility = ["//visibility:public"],
)
# UDP socket endpoint: public header udp_socket_endpoint.h, implementation in
# udp_socket_endpoint.cc, built on top of :endpoint.
cc_library(
    name = "udp_socket_endpoint",
    hdrs = ["udp_socket_endpoint.h"],
    srcs = ["udp_socket_endpoint.cc"],
    deps = [
        ":address",
        ":endpoint",
        "@com_github_enki_libev//:libev",
        "@com_google_protobuf//:protobuf",
        "@openssl//:openssl",
    ],
    visibility = ["//visibility:public"],
)
# Utility library: public header utils.h, implementation in utils.cc.
# Pulls in the UDP endpoint plus concurrentqueue, junction, gflags, glog and openssl.
cc_library(
    name = "utils",
    hdrs = ["utils.h"],
    srcs = ["utils.cc"],
    visibility = ["//visibility:public"],
    deps = [
        ":udp_socket_endpoint",
        "@com_github_cameron314_concurrentqueue//:concurrentqueue",
        "@com_github_preshing_junction//:libjunction",
        "@com_github_gflags_gflags//:gflags",
        "@com_github_google_glog//:glog",
        "@openssl//:openssl",
    ],
)
================================================
FILE: lib/Rules.mk
================================================
# Make fragment for lib/, meant to be included from a parent Makefile.
# d is the directory of this fragment (the last file in MAKEFILE_LIST).
d := $(dir $(lastword $(MAKEFILE_LIST)))
# Register this directory's sources with the global SRCS list.
# NOTE(review): endpoint.cc is present in lib/ but is not listed here; confirm
# whether this Make-based build is still maintained next to the Bazel build.
SRCS += $(addprefix $(d), \
address.cc utils.cc udp_socket_endpoint.cc)
# Object lists; $(o) is presumably the object output prefix set by the including Makefile.
LIB-address := $(o)address.o
LIB-utils := $(o)utils.o
# The UDP socket library bundles its own object with the address and utils objects.
LIB-udp-socket := $(o)udp_socket_endpoint.o $(LIB-address) $(LIB-utils)
# Prints the resolved object list while the Makefile is parsed.
$(info LIB-udp-socket is $(LIB-udp-socket))
# include $(d)tests/Rules.mk
================================================
FILE: lib/address.cc
================================================
#include "lib/address.h"
/** Builds an empty address: blank ip and mac, port -1, all-zero sockaddr. */
Address::Address() : ip_(), port_(-1), mac_() {
  memset(&addr_, 0, sizeof(addr_));
}
/**
 * Builds an IPv4 address from its textual form.
 * @param ip   dotted-decimal address, converted with inet_addr
 * @param port port number, stored in addr_ in network byte order
 * @param mac  kept as-is in mac_
 */
Address::Address(const std::string& ip, const int port, const std::string& mac)
    : ip_(ip), port_(port), mac_(mac) {
  memset(&addr_, 0, sizeof(addr_));
  addr_.sin_family = AF_INET;
  addr_.sin_addr.s_addr = inet_addr(ip.c_str());
  addr_.sin_port = htons(port);
}
/** Nothing to release: every member cleans up after itself. */
Address::~Address() = default;
std::string Address::GetIPAsString() {
ip_ = inet_ntoa(addr_.sin_addr);
return ip_;
}
int Address::GetPortAsInt() {
port_ = htons(addr_.sin_port);
return port_;
}
================================================
FILE: lib/address.h
================================================
#ifndef NEZHA_ADDRESS
#define NEZHA_ADDRESS
#include
#include
#include
#include
#include
#include
#include
#define UDP_BUFFER_SIZE (512)
/**
 * Address encapsulates how an endpoint can be reached.
 * For now it carries socket-based information (ip and port); mac_ is reserved
 * so that other communication primitives, such as DPDK, can be supported in
 * the future.
 */
class Address {
 public:
  std::string ip_;           // Textual ip
  int port_;                 // Port number
  std::string mac_;          // For future extension (DPDK)
  struct sockaddr_in addr_;  // Socket-level form of the address

  Address();
  /** Copies the textual fields and the raw sockaddr_in of addr. */
  Address(const Address& addr)
      : ip_(addr.ip_),
        port_(addr.port_),
        mac_(addr.mac_),
        addr_(addr.addr_) {}
  Address(const std::string& ip, const int port, const std::string& mac = "");
  ~Address();

  /** Returns the ip held in addr_ as text (also stored into ip_). */
  std::string GetIPAsString();
  /** Returns the port held in addr_ (also stored into port_). */
  int GetPortAsInt();
};
#endif
================================================
FILE: lib/common_struct.h
================================================
#ifndef NEZHA_COMMON_STRUCT_H
#define NEZHA_COMMON_STRUCT_H
#include
#include
#include
#include
#include
#include
#include "lib/common_type.h"
/**
 * Nezha communicates with proto messages.
 * Once a proto message has been serialized and is about to be sent by an
 * endpoint, a MessageHeader is prepended to it (see SendMsgTo of the UDP socket
 * endpoint). The header records the type and the length of the proto message,
 * so the receiving endpoint knows how many bytes follow and which
 * deserialization to apply.
 */
struct MessageHeader {
  char msgType;     // Type of the proto message that follows
  uint32_t msgLen;  // Length of the serialized proto message

  MessageHeader(const char t, const uint32_t l)
      : msgType{t},
        msgLen{l} {}
};
/**
 * SHA_HASH is included in the FastReply message to represent the replica state
 * of a replica. More details at Sec 5.2 of our paper
 * https://arxiv.org/pdf/2206.03285.pdf
 *
 * The same 20 bytes can be viewed either as five 32-bit words (item) or as a
 * raw SHA-1 digest (hash).
 */
union SHA_HASH {
  uint32_t item[5];
  unsigned char hash[SHA_DIGEST_LENGTH];

  /** All-zero hash. */
  SHA_HASH() { memset(item, 0, sizeof(item)); }

  /**
   * Builds a hash from at most SHA_DIGEST_LENGTH bytes of str.
   * Bytes beyond len are zeroed, so a short input never leaves part of the
   * digest uninitialized.
   */
  SHA_HASH(const char* str, const uint32_t len) {
    memset(item, 0, sizeof(item));
    const uint32_t digestLen = static_cast<uint32_t>(SHA_DIGEST_LENGTH);
    if (str != NULL) {
      memcpy(hash, str, len < digestLen ? len : digestLen);
    }
  }

  SHA_HASH(const SHA_HASH& h) { memcpy(item, h.item, sizeof(item)); }

  SHA_HASH& operator=(const SHA_HASH& sh) {
    // memcpy must not be given overlapping ranges, so skip self-assignment.
    if (this != &sh) {
      memcpy(item, sh.item, sizeof(item));
    }
    return *this;
  }

  /** XORs h into this hash, word by word. */
  void XOR(const SHA_HASH& h) {
    for (int i = 0; i < 5; ++i) {
      item[i] ^= h.item[i];
    }
  }

  /** Renders the five words in decimal, joined by '-'. */
  std::string toString() const {
    return (std::to_string(item[0]) + "-" + std::to_string(item[1]) + "-" +
            std::to_string(item[2]) + "-" + std::to_string(item[3]) + "-" +
            std::to_string(item[4]));
  }
};
/** When a request is received by the replica, it is first converted to a
 * RequestBody, which includes all the useful information of the request */
struct RequestBody {
  uint64_t deadline;
  uint64_t reqKey;  // reqKey uniquely identifies the request on this replica,
                    // it is the concatenation of the clientId and reqId. With
                    // reqKey, the replica can easily check whether this
                    // request has been previously received or not.
  uint32_t opKey;   // opKey indicates which key the request is operating on
                    // (imagine we are working on a database system and
                    // different requests will read/write different keys).
                    // opKey is important for the commutativity optimization.
  uint64_t proxyId;  // proxyId indicates which proxy delivers the request to
                     // the replica, and later replicas will send the
                     // corresponding reply to the proxy.
  std::string command;  // command is the content to execute
  bool isWrite;

  /** Zero-initializes every scalar field so a default-built body is never read
   * with indeterminate values. */
  RequestBody()
      : deadline(0), reqKey(0), opKey(0), proxyId(0), command(), isWrite(false) {}

  RequestBody(const uint64_t d, const uint64_t r, const uint32_t ok,
              const uint64_t p, const std::string& cmd, const bool isw)
      : deadline(d),
        reqKey(r),
        opKey(ok),
        proxyId(p),
        command(cmd),
        isWrite(isw) {}

  /** The following methods compare requests to decide their order: first by
   * deadline, then by reqKey. The pair overloads take (deadline, reqKey). */
  bool LessThan(const RequestBody& bigger) const {
    return (deadline < bigger.deadline ||
            (deadline == bigger.deadline && reqKey < bigger.reqKey));
  }
  bool LessThan(const std::pair<uint64_t, uint64_t>& bigger) const {
    return (deadline < bigger.first ||
            (deadline == bigger.first && reqKey < bigger.second));
  }
  bool LessOrEqual(const RequestBody& bigger) const {
    return (deadline < bigger.deadline ||
            (deadline == bigger.deadline && reqKey <= bigger.reqKey));
  }
  bool LessOrEqual(const std::pair<uint64_t, uint64_t>& bigger) const {
    return (deadline < bigger.first ||
            (deadline == bigger.first && reqKey <= bigger.second));
  }
};
/**
 * After a RequestBody is processed and eventually replied, it is converted
 * into a LogEntry and stored in the replica.
 * Compared with RequestBody, LogEntry carries more information: the hashes,
 * the links to neighbouring entries, the execution result, a status and the
 * log id.
 *
 * NOTE(review): the std::pair parameters below appear without template
 * arguments in this copy of the file — confirm against the original header.
 */
struct LogEntry {
// Request Body
RequestBody body;
SHA_HASH entryHash; // The hash value of this **single** entry
SHA_HASH logHash; // The accumulative hash, which is calculated based on all
// the log entries from the beginning to this entry
/** prevNonCommutative and nextNonCommutative organize the LogEntries as a
 * skiplist, which is easier and more efficient to traverse/modify/delete */
LogEntry* prevNonCommutative; // The previous non-commutative entry
LogEntry* nextNonCommutative; // The next non-commutative entry
LogEntry* prevNonCommutativeWrite; // The entry's prevNonCommutative may be a
// write, or may be a read.
// But only the prevNonCommutativeWrite is used to calculate the incremental
// hash, see Sec 8.2 of Nezha's Technical Report
LogEntry* nextNonCommutativeWrite; // The next non-commutative write entry
/** prev and next organize the LogEntries as a linked list, which is easier to
 * traverse/modify/delete */
LogEntry* prev; // The previous LogEntry pointer
LogEntry* next; // The next LogEntry pointer
std::string result; // The execution result of the LogEntry
char status; // Holds an EntryStatus value; EntryStatus::INITIAL by default
uint32_t logId; // The logId (the position of the LogEntry in the list) of
// the entry
/** Empty entry: all links NULL, empty result, INITIAL status, logId 0. */
LogEntry()
: prevNonCommutative(NULL),
nextNonCommutative(NULL),
prevNonCommutativeWrite(NULL),
nextNonCommutativeWrite(NULL),
prev(NULL),
next(NULL),
result(""),
status(EntryStatus::INITIAL),
logId(0) {}
/** Builds an entry from an existing RequestBody plus its two hashes; links,
 * result, status and logId are optional and default to NULL / "" / INITIAL /
 * 0. */
LogEntry(const RequestBody& rb, const SHA_HASH& eh, const SHA_HASH& h,
LogEntry* prevNonComm = NULL, LogEntry* nextNonComm = NULL,
LogEntry* preNonCOmmW = NULL, LogEntry* nextNonCommW = NULL,
LogEntry* pre = NULL, LogEntry* nxt = NULL,
const std::string& re = "", const char sts = EntryStatus::INITIAL,
const uint32_t lid = 0)
: body(rb),
entryHash(eh),
logHash(h),
prevNonCommutative(prevNonComm),
nextNonCommutative(nextNonComm),
prevNonCommutativeWrite(preNonCOmmW),
nextNonCommutativeWrite(nextNonCommW),
prev(pre),
next(nxt),
result(re),
status(sts),
logId(lid) {}
/** Same as above, but the RequestBody is built in place from its fields:
 * d = deadline, r = reqKey, ok = opKey, p = proxyId, cmd = command,
 * isw = isWrite. */
LogEntry(const uint64_t d, const uint64_t r, const uint32_t ok,
const uint64_t p, const std::string& cmd, const bool& isw,
const SHA_HASH& eh, const SHA_HASH& h, LogEntry* prevNonComm = NULL,
LogEntry* nextNonComm = NULL, LogEntry* preNonCOmmW = NULL,
LogEntry* nextNonCommW = NULL, LogEntry* pre = NULL,
LogEntry* nxt = NULL, const std::string& re = "",
const char sts = EntryStatus::INITIAL, const uint32_t lid = 0)
: body(d, r, ok, p, cmd, isw),
entryHash(eh),
logHash(h),
prevNonCommutative(prevNonComm),
nextNonCommutative(nextNonComm),
prevNonCommutativeWrite(preNonCOmmW),
nextNonCommutativeWrite(nextNonCommW),
prev(pre),
next(nxt),
result(re),
status(sts),
logId(lid) {}
/** Ordering helpers: every comparison is delegated to the RequestBody, i.e.
 * entries are ordered by deadline first and reqKey second. */
bool LessThan(const LogEntry& bigger) { return body.LessThan(bigger.body); }
bool LessThan(const std::pair& bigger) {
return body.LessThan(bigger);
}
bool LessOrEqual(const LogEntry& bigger) {
return body.LessOrEqual(bigger.body);
}
bool LessOrEqual(const std::pair& bigger) {
return body.LessOrEqual(bigger);
}
};
/**
 * CrashVectorStruct is necessary for Nezha to avoid stray messages, details in
 * Appendix A.1 and Appendix J of our paper.
 *
 * NOTE(review): std::vector appears without its element type in this copy of
 * the file; the hash computation below sizes each element as uint32_t —
 * confirm against the original header.
 */
struct CrashVectorStruct {
std::vector cv_; // The crash vector itself
uint32_t version_; // Newer crash vector will have a larger version_
SHA_HASH cvHash_; // SHA-1 digest of the raw bytes of the vector passed in
/** Stores c and v, then hashes the contiguous element storage of c
 * (c.size() * sizeof(uint32_t) bytes) into cvHash_. */
CrashVectorStruct(const std::vector& c, const uint32_t v)
: cv_(c), version_(v) {
const uint32_t contentLen = c.size() * sizeof(uint32_t);
const unsigned char* content = (const unsigned char*)(void*)(c.data());
SHA1(content, contentLen, cvHash_.hash);
}
/** Member-wise copy; the hash is copied, not recomputed. */
CrashVectorStruct(const CrashVectorStruct& c)
: cv_(c.cv_), version_(c.version_), cvHash_(c.cvHash_) {}
};
#endif
================================================
FILE: lib/common_type.h
================================================
#ifndef NEZHA_COMMON_TYPE_H
#define NEZHA_COMMON_TYPE_H
/** Kinds of endpoints. Only the UDP endpoint is supported at present; the GRPC
 * endpoint will be supported in the near future. */
enum EndpointType {
  UDP_ENDPOINT = 1,
  GRPC_ENDPOINT = 2  // To be supported
};
/** Replica statuses; Sec 5 of our paper explains each of them in detail. */
enum ReplicaStatus {
  NORMAL = 1,
  VIEWCHANGE = 2,
  RECOVERING = 3,
  TERMINATED = 4
};
/** Status of a LogEntry. An entry starts as INITIAL; from there it may go
 * through IN_PROCESS->PROCESSED->REPLIED, or go directly to IN_LATEBUFFER. */
enum EntryStatus {
  INITIAL = 1,
  IN_PROCESS = 2,
  IN_LATEBUFFER = 3,
  PROCESSED = 4,
  TO_SLOW_REPLY = 5,
  REPLIED = 6
};
/**
 * Message types, defined according to the proto files. The type is carried
 * with every message so that the receiver can pick the right proto message to
 * serialize/deserialize.
 */
enum MessageType {
  CLIENT_REQUEST = 1,
  LEADER_REQUEST = 2,
  SYNC_INDEX = 3,
  MISSED_INDEX_ASK = 4,
  MISSED_REQ_ASK = 5,
  FAST_REPLY = 6,
  SLOW_REPLY = 7,
  COMMIT_REPLY = 8,
  MISSED_REQ = 9,
  VIEWCHANGE_REQ = 10,
  VIEWCHANGE_MSG = 11,
  START_VIEW = 12,
  STATE_TRANSFER_REQUEST = 13,
  STATE_TRANSFER_REPLY = 14,
  CRASH_VECTOR_REQUEST = 15,
  CRASH_VECTOR_REPLY = 16,
  RECOVERY_REQUEST = 17,
  RECOVERY_REPLY = 18,
  SYNC_STATUS_REPORT = 19,
  COMMIT_INSTRUCTION = 20,
  SUSPEND_REPLY = 21,
  ERROR_MSG = 22
};
#endif
================================================
FILE: lib/endpoint.cc
================================================
#include "lib/endpoint.h"
/**
 * Records the address and creates the libev loop for this endpoint.
 *
 * fd_ and epId_ are given defined starting values (-1 and 0) so they are never
 * read uninitialized; derived classes overwrite fd_ once they open a socket.
 *
 * @param sip              ip of this endpoint (may be empty)
 * @param sport            port of this endpoint (may be negative)
 * @param isMasterReceiver true: use libev's default loop; false: create a new
 *                         loop
 */
Endpoint::Endpoint(const std::string& sip, const int sport,
                   const bool isMasterReceiver)
    : addr_(sip, sport), fd_(-1), evLoop_(NULL), epId_(0) {
  evLoop_ = isMasterReceiver ? ev_default_loop() : ev_loop_new();
  if (!evLoop_) {
    LOG(ERROR) << "Event Loop error";
    return;
  }
}
/**
 * Stops all timers, breaks the loop and destroys it.
 * If the loop could not be created in the constructor there is nothing to tear
 * down, and handing NULL to libev would crash, so that case returns early.
 */
Endpoint::~Endpoint() {
  if (evLoop_ == NULL) {
    return;
  }
  LoopBreak();
  ev_loop_destroy(evLoop_);
  evLoop_ = NULL;
}
/**
 * Attaches timer to this endpoint and starts it on the endpoint's loop.
 * @return false when there is no loop or the timer is already registered,
 *         true otherwise
 */
bool Endpoint::RegisterTimer(Timer* timer) {
  if (!evLoop_) {
    LOG(ERROR) << "No evLoop!";
    return false;
  }
  const bool alreadyKnown = (eventTimers_.count(timer) != 0);
  if (alreadyKnown) {
    LOG(ERROR) << "This timer has already been registered";
    return false;
  }
  // Remember the timer and tell it which endpoint it now belongs to.
  eventTimers_.insert(timer);
  timer->attachedEndpoint_ = this;
  ev_timer_again(evLoop_, timer->evTimer_);
  return true;
}
/**
 * Stops timer and removes it from this endpoint.
 * @return false when there is no loop or the timer was never registered,
 *         true otherwise
 */
bool Endpoint::UnRegisterTimer(Timer* timer) {
  if (!evLoop_) {
    LOG(ERROR) << "No evLoop!";
    return false;
  }
  const bool known = (eventTimers_.count(timer) != 0);
  if (!known) {
    LOG(ERROR) << "The timer has not been registered ";
    return false;
  }
  ev_timer_stop(evLoop_, timer->evTimer_);
  eventTimers_.erase(timer);
  return true;
}
/** Stops every registered timer on the loop, then forgets all of them. */
void Endpoint::UnRegisterAllTimers() {
  for (auto it = eventTimers_.begin(); it != eventTimers_.end(); ++it) {
    ev_timer_stop(evLoop_, (*it)->evTimer_);
  }
  eventTimers_.clear();
}
/** Tells whether timer is currently in this endpoint's timer set. */
bool Endpoint::isTimerRegistered(Timer* timer) {
  return eventTimers_.count(timer) > 0;
}
void Endpoint::LoopRun() { ev_run(evLoop_, 0); }
void Endpoint::LoopBreak() {
UnRegisterAllTimers();
ev_break(evLoop_, EVBREAK_ALL);
}
================================================
FILE: lib/endpoint.h
================================================
#ifndef NEZHA_ENDPOINT_H
#define NEZHA_ENDPOINT_H
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "lib/address.h"
#include "lib/common_struct.h"
#include "lib/message_handler.h"
#include "lib/timer.h"
/**
 * Endpoint is the basic abstraction, and it can be derived to more specific
 * endpoints, based on the communication primitive (e.g., UDPSocketEndpoint).
 *
 * An Endpoint supports three major functionalities:
 * (1) Send/Receive messages;
 * (2) Process the received messages according to (pre-registered) customized
 * message handlers;
 * (3) Conduct periodical actions according to (pre-registered)
 * customized timer functions.
 *
 * NOTE(review): std::set below appears without its element type in this copy
 * of the file; it is used with Timer* values — confirm against the original
 * header.
 */
class Endpoint {
protected:
/* The address of this endpoint */
Address addr_;
/** The socket fd it uses to send/recv messages */
int fd_;
/** The ev_loop struct from libev, which is used to handle io/timer events */
struct ev_loop* evLoop_;
/** One endpoint can have multiple timers registered. We maintain a set to
 * avoid duplicate registration and check whether a specific timer has been
 * registered or not.*/
std::set eventTimers_;
public:
int epId_; // The id of the endpoint, mainly for debug
/** The endpoint accepts an ip and port and records them in addr_. This base
 * constructor does not bind a socket itself; UDPSocketEndpoint's constructor
 * binds its fd to ip:port when both are valid. If isMasterReceiver is true,
 * it uses the default loop of libev, otherwise it creates a new loop (refer
 * to the libev documentation for a detailed explanation at
 * https://metacpan.org/dist/EV/view/libev/ev.pod) */
Endpoint(const std::string& ip = "", const int port = -1,
const bool isMasterReceiver = false);
/** Unregisters all timers, breaks the loop and destroys it. */
virtual ~Endpoint();
/** Send the message to the specific destination. The method needs to know the
 * message type (3rd parameter) and include such information in the buffer */
virtual int SendMsgTo(const Address& dstAddr,
const google::protobuf::Message& msg,
const char msgType) = 0;
/** An endpoint potentially can have multiple message handlers registered, but
 * our UDPSocketEndpoint implementation only supports at most one
 * message handler for one endpoint. So we make them virtual functions and
 * different derived classes have their own implementation of the methods */
virtual bool RegisterMsgHandler(MessageHandler* msgHdl) = 0;
virtual bool UnRegisterMsgHandler(MessageHandler* msgHdl) = 0;
virtual bool isMsgHandlerRegistered(MessageHandler* msgHdl) = 0;
virtual void UnRegisterAllMsgHandlers() = 0;
/** Return true if the timer is successfully registered, otherwise (e.g. it
 * has been registered before and has not been unregistered), return false */
bool RegisterTimer(Timer* timer);
/** Return true if the timer is successfully unregistered, otherwise (e.g. the
 * timer has not been registered before), return false */
bool UnRegisterTimer(Timer* timer);
/** Check whether the timer has been registered */
bool isTimerRegistered(Timer* timer);
/** Stop and forget every registered timer */
void UnRegisterAllTimers();
/** Run the event loop of this endpoint */
void LoopRun();
/** Unregister all timers and break out of the event loop */
void LoopBreak();
};
#endif
================================================
FILE: lib/message_handler.h
================================================
#ifndef NEZHA_MESSAGE_HANDLER_H
#define NEZHA_MESSAGE_HANDLER_H
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "lib/address.h"
#include "lib/common_type.h"
/**
* MessageHandler is an encapsulation of libev-based message handler (i.e.
* ev_io).
*
* After the message handler is created, it will be registered to a
* specific endpoint. Then, the callback func (i.e., MessageHandlerFunc) will be
* called every time this endpoint receives some messages.
*
* Currently, we only support UDP communication. Therefore, we only have one
* derived struct (UDPMsgHandler) from MessageHandler
*
* We will continue to support other types of endpoints. Correspondingly, there
* will be more derived struct added later
* **/
/**
* Para-1: MessageHeader* describes the type and length of the received message
* Para-2: char* is the payload of the message
* Para-3: Address* is the address of the sender
* Para-4: void* points to the (optional) context that is needed by the callback
* function(i.e., MessageHandlerFunc)
*/
typedef std::function
MessageHandlerFunc;
/**
 * Base of all message handlers: a callback, its optional context, the address
 * of the latest sender and the libev io watcher that drives the callback.
 *
 * The handler owns evWatcher_. Copying is disabled because two copies would
 * both delete the same watcher, and the destructor is virtual because derived
 * handlers are handed around (and may be deleted) as MessageHandler*.
 */
struct MessageHandler {
  MessageHandlerFunc msgHandler_;  // Callback run for each received message
  void* context_;                  // Opaque context passed to the callback
  Address sender_;                 // Filled with the sender of the message
  struct ev_io* evWatcher_;        // Owned libev watcher; data points to this

  MessageHandler(MessageHandlerFunc msghdl, void* ctx = NULL)
      : msgHandler_(msghdl), context_(ctx), evWatcher_(new ev_io()) {
    // Lets the libev callback find its handler again.
    evWatcher_->data = (void*)this;
  }
  virtual ~MessageHandler() { delete evWatcher_; }

  MessageHandler(const MessageHandler&) = delete;
  MessageHandler& operator=(const MessageHandler&) = delete;
};
/**
 * Message handler for UDP endpoints.
 *
 * The watcher callback reads one datagram into buffer_, checks that it holds a
 * MessageHeader plus the payload the header announces, and then calls the
 * user callback with the header, the payload, the sender and the context.
 */
struct UDPMsgHandler : MessageHandler {
  char buffer_[UDP_BUFFER_SIZE];  // Receive buffer for one datagram

  UDPMsgHandler(MessageHandlerFunc msghdl, void* ctx = NULL)
      : MessageHandler(msghdl, ctx) {
    ev_init(evWatcher_, [](struct ev_loop* loop, struct ev_io* w, int revents) {
      UDPMsgHandler* m = (UDPMsgHandler*)(w->data);
      socklen_t sockLen = sizeof(struct sockaddr_in);
      int msgLen = recvfrom(w->fd, m->buffer_, UDP_BUFFER_SIZE, 0,
                            (struct sockaddr*)(&(m->sender_.addr_)), &sockLen);
      // Errors and datagrams without any payload after the header are dropped.
      if (msgLen > 0 && (uint32_t)msgLen > sizeof(MessageHeader)) {
        MessageHeader* msgHeader = (MessageHeader*)(void*)(m->buffer_);
        // Number of payload bytes that really arrived.
        const size_t receivedPayload = (size_t)msgLen - sizeof(MessageHeader);
        // The announced payload must fit in what was received; otherwise the
        // callback would parse bytes that are not part of this datagram.
        if (msgHeader->msgLen <= receivedPayload && m->msgHandler_) {
          m->msgHandler_(msgHeader, m->buffer_ + sizeof(MessageHeader),
                         &(m->sender_), m->context_);
        }
      }
    });
  }
  ~UDPMsgHandler() {}
};
#endif
================================================
FILE: lib/message_type.cc
================================================
#include "lib/message_type.h"
// Definitions of the message-type codes declared in lib/message_type.h.
namespace MessageType {
char CLIENT_REQUEST{1};
char LEADER_REQUEST{2};
char SYNC_INDEX{3};
char MISSED_INDEX_ASK{4};
char MISSED_REQ_ASK{5};
char FAST_REPLY{6};
char SLOW_REPLY{7};
char COMMIT_REPLY{8};
char MISSED_REQ{9};
char VIEWCHANGE_REQ{10};
char VIEWCHANGE{11};
char START_VIEW{12};
char STATE_TRANSFER_REQUEST{13};
char STATE_TRANSFER_REPLY{14};
char CRASH_VECTOR_REQUEST{15};
char CRASH_VECTOR_REPLY{16};
char RECOVERY_REQUEST{17};
char RECOVERY_REPLY{18};
char SYNC_STATUS_REPORT{19};
char COMMIT_INSTRUCTION{20};
char SUSPEND_REPLY{21};
char ERROR_MSG{22};
}  // namespace MessageType
================================================
FILE: lib/message_type.h
================================================
#include
#ifndef NEZHA_MESSAGE_TYPE_H
#define NEZHA_MESSAGE_TYPE_H
/** First usable index for the concurrent map. */
#define CONCURRENT_MAP_START_INDEX (2u)
/** Packs a (high word) and b (low word) into one uint64. Arguments are fully
 * parenthesized so that expressions such as x | y or x + y can be passed. */
#define CONCAT_UINT32(a, b) \
  ((((uint64_t)(a)) << 32u) | (uint64_t)((uint32_t)(b)))
/** High / low 32 bits of a uint64. */
#define HIGH_32BIT(a) ((uint32_t)(((uint64_t)(a)) >> 32u))
#define LOW_32BIT(a) ((uint32_t)(a))
/** Header carrying the type and the length of a message. */
struct MessageHeader {
  char msgType;     // Message type code
  uint32_t msgLen;  // Message length

  MessageHeader(const char t, const uint32_t l)
      : msgType{t},
        msgLen{l} {}
};
/**
 * Declarations of the message-type codes. The matching definitions, together
 * with their numeric values (1 to 22), are in lib/message_type.cc.
 */
namespace MessageType {
extern char CLIENT_REQUEST;
extern char LEADER_REQUEST;
extern char SYNC_INDEX;
extern char MISSED_INDEX_ASK;
extern char MISSED_REQ_ASK;
extern char FAST_REPLY;
extern char SLOW_REPLY;
extern char COMMIT_REPLY;
extern char MISSED_REQ;
extern char VIEWCHANGE_REQ;
extern char VIEWCHANGE;
extern char START_VIEW;
extern char STATE_TRANSFER_REQUEST;
extern char STATE_TRANSFER_REPLY;
extern char CRASH_VECTOR_REQUEST;
extern char CRASH_VECTOR_REPLY;
extern char RECOVERY_REQUEST;
extern char RECOVERY_REPLY;
extern char SYNC_STATUS_REPORT;
extern char COMMIT_INSTRUCTION;
extern char SUSPEND_REPLY;
extern char ERROR_MSG;
};
#endif
================================================
FILE: lib/timer.h
================================================
#ifndef NEZHA_TIMER_
#define NEZHA_TIMER_
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "lib/address.h"
#include "lib/common_type.h"
/**
 * Timer is an encapsulation of a libev-based timer (i.e., ev_timer).
 *
 * After the timer is created, it will be registered to a
 * specific endpoint, together with a period (measured in milliseconds). Then,
 * the callback func (i.e., TimerFunc) will be called periodically until the
 * timer is unregistered.
 **/
/**
 * Signature of the callback invoked every time a timer fires.
 * Para-1: The first void* points to the context that may be needed by the
 * callback function (i.e., TimerFunc)
 * Para-2: The second void* points to the endpoint that this timer is attached
 * to. It can be passed into the function as NULL if the TimerFunc does not need
 * it. But some TimerFunc callbacks (e.g., monitorTimer in replica) need to know
 * the endpoint they are attached to.
 */
typedef std::function<void(void*, void*)> TimerFunc;
struct Timer {
std::function timerFunc_;
void* context_;
void* attachedEndpoint_;
struct ev_timer* evTimer_;
Timer(TimerFunc timerf, uint32_t periodMs = 1, void* ctx = NULL,
void* aep = NULL)
: timerFunc_(timerf), context_(ctx), attachedEndpoint_(aep) {
evTimer_ = new ev_timer();
evTimer_->data = (void*)this;
evTimer_->repeat = periodMs * 1e-3;
ev_init(evTimer_,
[](struct ev_loop* loop, struct ev_timer* w, int revents) {
Timer* t = (Timer*)(w->data);
t->timerFunc_(t->context_, t->attachedEndpoint_);
});
}
~Timer() { delete evTimer_; }
};
#endif
================================================
FILE: lib/udp_socket_endpoint.cc
================================================
#include "lib/udp_socket_endpoint.h"
/**
 * Opens a non-blocking UDP socket and, when both ip and port are given, binds
 * it to ip:port.
 *
 * Failures are logged; a failure to open the socket or to bind ends the
 * constructor early, a failure to switch to non-blocking mode does not.
 */
UDPSocketEndpoint::UDPSocketEndpoint(const std::string& ip, const int port,
                                     const bool isMasterReceiver)
    : Endpoint(ip, port, isMasterReceiver), msgHandler_(NULL) {
  fd_ = socket(PF_INET, SOCK_DGRAM, 0);
  if (fd_ < 0) {
    LOG(ERROR) << "Receiver Fd fail ";
    return;
  }

  // Set Non-Blocking
  const int currentFlags = fcntl(fd_, F_GETFL, 0);
  if (fcntl(fd_, F_SETFL, currentFlags | O_NONBLOCK) < 0) {
    LOG(ERROR) << " Set NonBlocking Fail";
  }

  // Without a usable ip:port the socket stays unbound.
  const bool hasLocalAddress = !(ip == "") && !(port < 0);
  if (!hasLocalAddress) {
    return;
  }

  struct sockaddr_in local;
  memset(&local, 0, sizeof(local));
  local.sin_family = AF_INET;
  local.sin_addr.s_addr = inet_addr(ip.c_str());
  local.sin_port = htons(port);

  // Bind socket to Address
  const int bindRet = bind(fd_, (struct sockaddr*)&local, sizeof(local));
  if (bindRet != 0) {
    LOG(ERROR) << "bind error\t" << bindRet << "\t port=" << port;
  }
}
// Empty destructor; the base class destructor tears down the event loop.
// NOTE(review): the socket opened in the constructor (fd_) is not closed here,
// and ~Endpoint does not close it either — confirm whether that is intended.
UDPSocketEndpoint::~UDPSocketEndpoint() {}
/**
 * Serializes msg, prepends a MessageHeader and sends both as one datagram.
 *
 * @param dstAddr destination address
 * @param msg     proto message to send
 * @param msgType type code written into the header
 * @return the return value of sendto, or -1 when the message serializes to
 *         nothing or does not fit, together with its header, in
 *         UDP_BUFFER_SIZE bytes
 */
int UDPSocketEndpoint::SendMsgTo(const Address& dstAddr,
                                 const google::protobuf::Message& msg,
                                 char msgType) {
  const std::string serializedString = msg.SerializeAsString();
  const size_t payloadLen = serializedString.length();
  if (payloadLen + sizeof(MessageHeader) > UDP_BUFFER_SIZE) {
    LOG(ERROR) << "Msg too large " << (uint32_t)msgType
               << "\t length=" << serializedString.length();
    return -1;
  }
  if (payloadLen == 0) {
    // Nothing was serialized
    return -1;
  }

  // The buffer is read through a MessageHeader*, so give it that alignment.
  alignas(MessageHeader) char buffer[UDP_BUFFER_SIZE];
  // Zero the header area first: the padding between msgType and msgLen is sent
  // on the wire and must not carry leftover stack bytes.
  memset(buffer, 0, sizeof(MessageHeader));
  MessageHeader* msgHdr = (MessageHeader*)(void*)buffer;
  msgHdr->msgType = msgType;
  msgHdr->msgLen = payloadLen;

  // Prepend MessageHeader to the serialized string
  memcpy(buffer + sizeof(MessageHeader), serializedString.data(), payloadLen);
  int ret = sendto(fd_, buffer, payloadLen + sizeof(MessageHeader), 0,
                   (const struct sockaddr*)(&(dstAddr.addr_)),
                   sizeof(sockaddr_in));
  if (ret < 0) {
    VLOG(1) << pthread_self() << "\tSend Fail ret =" << ret;
  }
  return ret;
}
/**
 * Registers msgHdl as the (single) message handler of this endpoint and starts
 * watching the socket for readable events.
 *
 * A UDP endpoint has at most one handler. If a different handler is still
 * registered, its watcher is stopped before the new one is started; otherwise
 * it would keep running on the loop with no way left to unregister it.
 *
 * @return false when msgHdl is NULL, there is no loop, the socket is invalid
 *         or msgHdl is already registered; true otherwise
 */
bool UDPSocketEndpoint::RegisterMsgHandler(MessageHandler* msgHdl) {
  if (msgHdl == NULL) {
    LOG(ERROR) << "Cannot register a NULL msgHdl";
    return false;
  }
  UDPMsgHandler* udpMsgHdl = (UDPMsgHandler*)msgHdl;
  if (evLoop_ == NULL) {
    LOG(ERROR) << "No evLoop!";
    return false;
  }
  if (fd_ < 0) {
    LOG(ERROR) << "No valid socket fd!";
    return false;
  }
  if (isMsgHandlerRegistered(msgHdl)) {
    LOG(ERROR) << "This msgHdl has already been registered";
    return false;
  }
  if (msgHandler_ != NULL) {
    // Replace the previous handler instead of leaving its watcher active.
    ev_io_stop(evLoop_, msgHandler_->evWatcher_);
  }
  msgHandler_ = udpMsgHdl;
  ev_io_set(udpMsgHdl->evWatcher_, fd_, EV_READ);
  ev_io_start(evLoop_, udpMsgHdl->evWatcher_);
  return true;
}
/**
 * Stops watching the socket with msgHdl and forgets it.
 *
 * @return false when there is no loop, msgHdl is NULL or msgHdl is not the
 *         registered handler; true otherwise
 */
bool UDPSocketEndpoint::UnRegisterMsgHandler(MessageHandler* msgHdl) {
  if (evLoop_ == NULL) {
    LOG(ERROR) << "No evLoop!";
    return false;
  }
  // NULL compares equal to an empty msgHandler_, so it has to be rejected
  // explicitly before the watcher is dereferenced.
  if (msgHdl == NULL || !isMsgHandlerRegistered(msgHdl)) {
    LOG(ERROR) << "The handler has not been registered ";
    return false;
  }
  ev_io_stop(evLoop_, msgHandler_->evWatcher_);
  msgHandler_ = NULL;
  return true;
}
/** Tells whether msgHdl is the handler currently held by this endpoint. */
bool UDPSocketEndpoint::isMsgHandlerRegistered(MessageHandler* msgHdl) {
  const UDPMsgHandler* candidate = (UDPMsgHandler*)msgHdl;
  return msgHandler_ == candidate;
}
/**
 * Stops the registered handler's watcher, if there is one, and forgets it.
 * Calling this with no handler registered (or without a loop) is a no-op
 * rather than a NULL dereference.
 */
void UDPSocketEndpoint::UnRegisterAllMsgHandlers() {
  if (msgHandler_ != NULL && evLoop_ != NULL) {
    ev_io_stop(evLoop_, msgHandler_->evWatcher_);
  }
  msgHandler_ = NULL;
}
================================================
FILE: lib/udp_socket_endpoint.h
================================================
#ifndef NEZHA_UDP_SOCKET_SENDER_H
#define NEZHA_UDP_SOCKET_SENDER_H
#include "lib/endpoint.h"
/**
 * UDPSocketEndpoint is the Endpoint implementation based on a UDP socket.
 * It holds at most one message handler at a time (msgHandler_).
 */
class UDPSocketEndpoint : public Endpoint {
private:
/* The currently registered message handler, NULL when there is none */
struct UDPMsgHandler* msgHandler_;
public:
/** Opens the UDP socket; see Endpoint for the meaning of the parameters */
UDPSocketEndpoint(const std::string& ip = "", const int port = -1,
const bool isMasterReceiver = false);
~UDPSocketEndpoint();
/** Sends msg, prefixed with a MessageHeader carrying msgType, to dstAddr */
int SendMsgTo(const Address& dstAddr, const google::protobuf::Message& msg,
const char msgType) override;
/** Message handler management, see the declarations in Endpoint */
bool RegisterMsgHandler(MessageHandler* msgHdl) override;
bool UnRegisterMsgHandler(MessageHandler* msgHdl) override;
bool isMsgHandlerRegistered(MessageHandler* msgHdl) override;
void UnRegisterAllMsgHandlers() override;
};
#endif
================================================
FILE: lib/utils.cc
================================================
#include "lib/utils.h"
/**
 * Hashes a request identity.
 * The 8 bytes of deadline followed by the 8 bytes of reqKey are laid out in a
 * 16-byte buffer, and the SHA-1 digest of that buffer is returned.
 */
SHA_HASH CalculateHash(uint64_t deadline, uint64_t reqKey) {
  unsigned char content[sizeof(deadline) + sizeof(reqKey)];
  memcpy(content, &deadline, sizeof(deadline));
  memcpy(content + sizeof(deadline), &reqKey, sizeof(reqKey));

  SHA_HASH digest;
  SHA1(content, sizeof(content), digest.hash);
  return digest;
}
// Get Current Microsecond Timestamp
// Returns the time elapsed since the epoch of std::chrono::system_clock,
// expressed in microseconds.
uint64_t GetMicrosecondTimestamp() {
  const auto sinceEpoch = std::chrono::system_clock::now().time_since_epoch();
  return static_cast<uint64_t>(
      std::chrono::duration_cast<std::chrono::microseconds>(sinceEpoch)
          .count());
}
/**
 * Factory for endpoints.
 *
 * @param endpointType     one of the EndpointType values
 * @param sip              ip handed to the endpoint
 * @param sport            port handed to the endpoint
 * @param isMasterReceiver handed to the endpoint (selects the libev loop)
 * @return a newly allocated UDPSocketEndpoint for UDP_ENDPOINT; NULL for
 *         GRPC_ENDPOINT (not supported yet) and for unknown types
 */
Endpoint* CreateEndpoint(const char endpointType, const std::string& sip,
                         const int sport, const bool isMasterReceiver) {
  if (endpointType == EndpointType::UDP_ENDPOINT) {
    return new UDPSocketEndpoint(sip, sport, isMasterReceiver);
  } else if (endpointType == EndpointType::GRPC_ENDPOINT) {
    // To support GRPC later
    return NULL;
  } else {
    // Streamed as a number: a raw char would be printed as a (usually
    // unprintable) character.
    LOG(ERROR) << "Unknown endpoint type: " << static_cast<int>(endpointType);
    return NULL;
  }
}
/**
 * Factory for message handlers.
 *
 * @param endpointType one of the EndpointType values
 * @param msghdl       callback the handler will invoke
 * @param ctx          context handed to the callback
 * @return a newly allocated UDPMsgHandler for UDP_ENDPOINT; NULL for
 *         GRPC_ENDPOINT (not supported yet) and for unknown types
 */
MessageHandler* CreateMsgHandler(const char endpointType,
                                 MessageHandlerFunc msghdl, void* ctx) {
  if (endpointType == EndpointType::UDP_ENDPOINT) {
    return new UDPMsgHandler(msghdl, ctx);
  } else if (endpointType == EndpointType::GRPC_ENDPOINT) {
    // To support GRPC later
    return NULL;
  } else {
    // Streamed as a number: a raw char would be printed as a (usually
    // unprintable) character.
    LOG(ERROR) << "Unknown endpoint type: " << static_cast<int>(endpointType);
    return NULL;
  }
}
================================================
FILE: lib/utils.h
================================================
#ifndef NEZHA_UTILS_H
#define NEZHA_UTILS_H
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "concurrentqueue.h"
#include "gflags/gflags.h"
#include "lib/udp_socket_endpoint.h"
// Project-wide aliases for the third-party concurrent containers
// (moodycamel's concurrent queue and junction's Leapfrog concurrent map).
// NOTE(review): the template parameter lists and template arguments are
// missing on the lines below in this copy of the file — confirm against the
// original header.
template
using ConcurrentQueue = moodycamel::ConcurrentQueue;
template
using ConcurrentMap = junction::ConcurrentMap_Leapfrog;
/** The concurrent map we use (i.e., junction::ConcurrentMap) reserves 0 and 1,
 * so the start value should be 2 */
#define CONCURRENT_MAP_START_INDEX (2u)
/** Pack a (high word) and b (low word) into one uint64. The arguments are
 * fully parenthesized so that expressions such as x | y or x + y can be
 * passed safely. */
#define CONCAT_UINT32(a, b) \
  ((((uint64_t)(a)) << 32u) | (uint64_t)((uint32_t)(b)))
/** Get the high/low 32bits of a uint64 */
#define HIGH_32BIT(a) ((uint32_t)(((uint64_t)(a)) >> 32u))
#define LOW_32BIT(a) ((uint32_t)(a))
// Since the (deadline, reqKey) pair is sufficient to uniquely identify one
// request, we calculate the hash based on them to represent the corresponding
// request/log
SHA_HASH CalculateHash(uint64_t deadline, uint64_t reqKey);
// Get Current Microsecond Timestamp
uint64_t GetMicrosecondTimestamp();
// Factory function, to create different types of endpoints and msghandlers
Endpoint* CreateEndpoint(const char endpointType, const std::string& sip = "",
const int sport = -1,
const bool isMasterReceiver = false);
MessageHandler* CreateMsgHandler(
const char endpointType,
std::function msghdl,
void* ctx = NULL);
#endif
================================================
FILE: lib/zipfian.h
================================================
/*
* MIT License
*
* Copyright (c) 2017 Lucas Lersch
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
/* Implementation derived from:
* "Quickly Generating Billion-Record Synthetic Databases", Jim Gray et al,
* SIGMOD 1994
*/
/*
 * The zipfian_int_distribution class is intended to be compatible with other
 * distributions introduced in #include <random> by the C++11 standard.
 *
 * Usage example:
 * #include <random>
 * #include "zipfian_int_distribution.h"
 * int main()
 * {
 *   std::default_random_engine generator;
 *   zipfian_int_distribution<int> distribution(1, 10, 0.99);
 *   int i = distribution(generator);
 * }
 */
/*
 * IMPORTANT: constructing the distribution object requires calculating the zeta
 * value, which becomes prohibitively expensive for very large ranges. As an
 * alternative for such cases, the user can pass the pre-calculated values and
 * avoid the calculation every time.
 *
 * Usage example:
 * #include <random>
 * #include "zipfian_int_distribution.h"
 * int main()
 * {
 *   std::default_random_engine generator;
 *   zipfian_int_distribution<int>::param_type p(1, 1e6, 0.99, 27.000);
 *   zipfian_int_distribution<int> distribution(p);
 *   int i = distribution(generator);
 * }
 */
#include
#include
#include
#include
template
class zipfian_int_distribution
{
static_assert(std::is_integral<_IntType>::value, "Template argument not an integral type.");
public:
/** The type of the range of the distribution. */
typedef _IntType result_type;
/** Parameter type. */
struct param_type
{
typedef zipfian_int_distribution<_IntType> distribution_type;
explicit param_type(_IntType __a = 0, _IntType __b = std::numeric_limits<_IntType>::max(), double __theta = 0.99)
: _M_a(__a), _M_b(__b), _M_theta(__theta),
_M_zeta(zeta(_M_b - _M_a + 1, __theta)), _M_zeta2theta(zeta(2, __theta))
{
assert(_M_a <= _M_b && _M_theta > 0.0 && _M_theta < 1.0);
}
explicit param_type(_IntType __a, _IntType __b, double __theta, double __zeta)
: _M_a(__a), _M_b(__b), _M_theta(__theta), _M_zeta(__zeta),
_M_zeta2theta(zeta(2, __theta))
{
__glibcxx_assert(_M_a <= _M_b && _M_theta > 0.0 && _M_theta < 1.0);
}
result_type a() const { return _M_a; }
result_type b() const { return _M_b; }
double theta() const { return _M_theta; }
double zeta() const { return _M_zeta; }
double zeta2theta() const { return _M_zeta2theta; }
friend bool operator==(const param_type& __p1, const param_type& __p2)
{
return __p1._M_a == __p2._M_a
&& __p1._M_b == __p2._M_b
&& __p1._M_theta == __p2._M_theta
&& __p1._M_zeta == __p2._M_zeta
&& __p1._M_zeta2theta == __p2._M_zeta2theta;
}
private:
_IntType _M_a;
_IntType _M_b;
double _M_theta;
double _M_zeta;
double _M_zeta2theta;
/**
* @brief Calculates zeta.
*
* @param __n [IN] The size of the domain.
* @param __theta [IN] The skew factor of the distribution.
*/
double zeta(unsigned long __n, double __theta)
{
double ans = 0.0;
for (unsigned long i = 1; i <= __n; ++i)
ans += std::pow(1.0 / i, __theta);
return ans;
}
};
public:
/**
* @brief Constructs a zipfian_int_distribution object.
*
* @param __a [IN] The lower bound of the distribution.
* @param __b [IN] The upper bound of the distribution.
* @param __theta [IN] The skew factor of the distribution.
*/
explicit zipfian_int_distribution(_IntType __a = _IntType(0), _IntType __b = _IntType(1), double __theta = 0.99)
: _M_param(__a, __b, __theta)
{ }
explicit zipfian_int_distribution(const param_type& __p) : _M_param(__p)
{ }
/**
 * @brief Resets the distribution state.
 *
 * Does nothing for the zipfian int distribution.
 */
void reset() { }
/** @brief Returns the lower bound stored in the parameter set. */
result_type a() const { return _M_param.a(); }
/** @brief Returns the upper bound stored in the parameter set. */
result_type b() const { return _M_param.b(); }
/** @brief Returns the skew factor stored in the parameter set. */
double theta() const { return _M_param.theta(); }
/**
 * @brief Returns a copy of the parameter set of the distribution.
 */
param_type param() const { return _M_param; }
/**
 * @brief Sets the parameter set of the distribution.
 * @param __param The new parameter set of the distribution.
 */
void param(const param_type& __param) { _M_param = __param; }
/**
 * @brief Returns the inclusive lower bound of the distribution range.
 */
result_type min() const { return this->a(); }
/**
 * @brief Returns the inclusive upper bound of the distribution range.
 */
result_type max() const { return this->b(); }
/**
 * @brief Generating functions.
 *
 * Draws one value using the parameter set stored in this object by
 * delegating to the two-argument overload.
 *
 * @param __urng [IN] The random number generator to draw from.
 */
template
result_type operator()(_UniformRandomNumberGenerator& __urng)
{
return this->operator()(__urng, _M_param);
}
/**
 * @brief Draws one value using an explicit parameter set.
 *
 * A uniform sample u is scaled by zeta; the two lowest values a and a + 1
 * are handled by explicit thresholds and every other value comes from a
 * closed-form power expression over the range size (b - a + 1).
 * NOTE(review): this looks like the Zipfian generator from Gray et al.,
 * "Quickly Generating Billion-Record Synthetic Databases" -- confirm.
 *
 * @param __urng [IN] The random number generator to draw from.
 * @param __p [IN] The parameter set to use instead of the stored one.
 */
template
result_type operator()(_UniformRandomNumberGenerator& __urng, const param_type& __p)
{
double alpha = 1 / (1 - __p.theta());
double eta = (1 - std::pow(2.0 / (__p.b() - __p.a() + 1), 1 - __p.theta())) / (1 - __p.zeta2theta() / __p.zeta());
// Uniform sample; presumably in [0, 1) as generate_canonical documents -- the template arguments were lost here.
double u = std::generate_canonical::digits, _UniformRandomNumberGenerator>(__urng);
double uz = u * __p.zeta();
// First threshold: the lowest value of the range.
if (uz < 1.0) return __p.a();
// Second threshold: the second-lowest value of the range.
if (uz < 1.0 + std::pow(0.5, __p.theta())) return __p.a() + 1;
// Everything else: the double result is converted to result_type on return.
return __p.a() + ((__p.b() - __p.a() + 1) * std::pow(eta * u - eta + 1, alpha));
}
/**
 * @brief Return true if two zipfian int distributions have
 * the same parameters.
 *
 * Delegates to the param_type equality operator.
 */
friend bool operator==(const zipfian_int_distribution& __d1, const zipfian_int_distribution& __d2)
{
return __d1._M_param == __d2._M_param;
}
private:
param_type _M_param;
};
================================================
FILE: license.md
================================================
MIT License
Copyright (c) 2022-2024 Jinkun Geng
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================
FILE: micro-bench/BUILD
================================================
# Micro-benchmark sender binary, built from bench_sender.cc.
cc_binary(
    name = "bench_sender",
    srcs = ["bench_sender.cc"],
    copts = ["-I/usr/local/include"],
    linkopts = [
        "-L/usr/local/lib",
        "-lev",
        "-ldl",
        "-lturf",
        "-ljunction",
        "-lcrypto",
        "-lgflags",
        "-lglog",
        "-lyaml-cpp",
        "-pthread",
    ],
    deps = [
        "//proto:nezha_cc_proto",
        "//lib:utils",
        "//lib:address",
        "//lib:zipfian",
    ],
)
# Micro-benchmark receiver binary, built from bench_receiver.cc.
cc_binary(
    name = "bench_receiver",
    srcs = ["bench_receiver.cc"],
    copts = ["-I/usr/local/include"],
    linkopts = [
        "-L/usr/local/lib",
        "-lev",
        "-ldl",
        "-lturf",
        "-ljunction",
        "-lcrypto",
        "-lgflags",
        "-lglog",
        "-lyaml-cpp",
        "-pthread",
    ],
    deps = [
        "//proto:nezha_cc_proto",
        "//lib:utils",
        "//lib:address",
        "//lib:zipfian",
    ],
)
# Offline trace-analysis binary, built from analysis.cc.
cc_binary(
    name = "analysis",
    srcs = ["analysis.cc"],
    copts = ["-I/usr/local/include"],
    linkopts = [
        "-L/usr/local/lib",
        "-lev",
        "-ldl",
        "-lturf",
        "-ljunction",
        "-lcrypto",
        "-lgflags",
        "-lglog",
        "-lyaml-cpp",
        "-pthread",
    ],
    deps = [
        "//proto:nezha_cc_proto",
        "//lib:utils",
        "//lib:address",
        "//lib:zipfian",
    ],
)
================================================
FILE: micro-bench/analysis.cc
================================================
#include
#include
#include "lib/utils.h"
#include "lib/zipfian.h"
#include "proto/nezha_proto.pb.h"
// Directory that holds the Replica-Stats-<i>.csv files to compare.
DEFINE_string(folder, "/home/steam1994/micro-stats/2-10000-0-50",
"The folder of the csv");
// Number of replica trace files; files 1 .. replica_num-1 are compared against file 0.
DEFINE_int32(replica_num, 2, "The number of replicas");
int main(int argc, char* argv[]) {
gflags::ParseCommandLineFlags(&argc, &argv, true);
google::InitGoogleLogging(argv[0]);
FLAGS_logtostderr = 1;
// std::vector zipfianKeys;
// uint32_t keyNum = 1000000;
// zipfianKeys.resize(1000000, 0);
// uint32_t skewFactor = 0.5;
// if (keyNum > 1) {
// std::default_random_engine generator(1); // clientId as the seed
// zipfian_int_distribution zipfianDistribution(0, keyNum - 1,
// skewFactor);
// for (uint32_t i = 0; i < zipfianKeys.size(); i++) {
// zipfianKeys[i] = zipfianDistribution(generator);
// }
// }
std::string r0Fname = FLAGS_folder + "/" + "Replica-Stats-0.csv";
std::ifstream ifs1(r0Fname);
LOG(INFO) << "fname=" << r0Fname;
uint32_t clientId, reqId;
uint32_t id = 0;
std::map mapIdx;
std::map mapKey;
while (ifs1 >> clientId >> reqId) {
uint64_t reqKey = CONCAT_UINT32(clientId, reqId);
mapIdx[reqKey] = id;
id++;
}
for (int i = 1; i < FLAGS_replica_num; i++) {
std::string r1Fname =
FLAGS_folder + "/" + "Replica-Stats-" + std::to_string(i) + ".csv";
std::ifstream ifs2(r1Fname);
std::vector reqKeys;
reqKeys.reserve(100000);
std::vector mappedIds;
mappedIds.reserve(100000);
while (ifs2 >> clientId >> reqId) {
uint64_t reqKey = CONCAT_UINT32(clientId, reqId);
reqKeys.push_back(reqKey);
mappedIds.push_back(mapIdx[reqKey]);
}
uint32_t reorderedCase = 0;
for (uint32_t i = 1; i < reqKeys.size(); i++) {
if (mappedIds[i] == 0 || mappedIds[i] < mappedIds[i - 1]) {
reorderedCase++;
}
}
LOG(INFO) << "reorderedCase=" << reorderedCase << "\t"
<< "total=" << id << "\t rate=" << reorderedCase * 1.0 / id;
}
}
================================================
FILE: micro-bench/bench_receiver.cc
================================================
#include
#include
#include "lib/utils.h"
#include "lib/zipfian.h"
#include "proto/nezha_proto.pb.h"
// Address and port the receiver listens on for client requests.
DEFINE_string(receiver_ip, "127.0.0.1", "The ip address of the receiver");
DEFINE_int32(receiver_port, 33333, "The port of the receiver");
// Id written into OWD replies and into the name of the output stats file.
DEFINE_int32(replica_id, 1, "The id of the replica");
// 1 = hold requests until their deadline before recording; otherwise record in arrival order.
DEFINE_int32(enable_dom, 0, "Whether enable DOM");
// Which percentile of the last 1000 OWD samples is reported back to the sender.
DEFINE_int32(percentile, 50, "The percentile of the owd estimation");
DEFINE_int32(client_port, 33336,
"The port of the client listens for OWD reply");
// Reply address per client id, filled in by MsgHandlerFunc.
ConcurrentMap clientAddrs;
// (client id, one-way delay) samples handed from MsgHandlerFunc to OWDTd.
ConcurrentQueue> owdQu;
// Parsed requests handed from MsgHandlerFunc to ProcessTd.
ConcurrentQueue processQu;
// (client id, request id) pairs in the order ProcessTd recorded them.
std::vector> traceVec;
/**
 * Handles one incoming message on the request endpoint.
 *
 * Only non-empty CLIENT_REQUEST messages that parse as a Request are used.
 * The first request seen from a client registers its reply address (sender
 * IP plus FLAGS_client_port). Every request is queued for ProcessTd, and if
 * the local clock is ahead of the request's send time the difference is
 * queued for OWDTd as a one-way-delay sample.
 *
 * @param msgHeader Header carrying the message type and payload length.
 * @param msgBuffer Serialized payload.
 * @param sender    Address the message came from.
 * @param context   Unused.
 */
void MsgHandlerFunc(MessageHeader* msgHeader, char* msgBuffer, Address* sender,
                    void* context) {
  if (msgHeader->msgType != MessageType::CLIENT_REQUEST) {
    return;
  }
  if (!(msgHeader->msgLen > 0)) {
    return;
  }

  nezha::proto::Request incoming;
  if (!incoming.ParseFromArray(msgBuffer, msgHeader->msgLen)) {
    return;
  }

  if (clientAddrs.get(incoming.clientid()) == NULL) {
    // The reply goes to the sender's IP but to the dedicated OWD port.
    Address* replyAddr =
        new Address(sender->GetIPAsString(), FLAGS_client_port);
    clientAddrs.assign(incoming.clientid(), replyAddr);
  }

  processQu.enqueue(incoming);

  uint64_t arrival = GetMicrosecondTimestamp();
  if (arrival > incoming.sendtime()) {
    uint32_t delay = arrival - incoming.sendtime();
    owdQu.enqueue({incoming.clientid(), delay});
  }
}
// Worker thread that records the order in which requests are released.
// With FLAGS_enable_dom == 1 a request is held in earlyBuffer until its
// deadline (sendtime + bound) has passed and is then appended to traceVec in
// (deadline, request key) order; otherwise requests are appended in dequeue
// order. After 60 seconds, or once 10,000,000 entries are recorded, the
// trace is written to Replica-Stats-<replica_id>.csv and the whole process
// exits. The loop never sleeps.
void ProcessTd() {
traceVec.reserve(10000000ul);
nezha::proto::Request request;
// Keyed by (deadline, request key), so begin() is always the earliest deadline.
std::map, nezha::proto::Request> earlyBuffer;
uint64_t startTime = GetMicrosecondTimestamp();
LOG(INFO) << "FLAGS_enable_dom=" << FLAGS_enable_dom;
while (true) {
if (FLAGS_enable_dom == 1) {
// At most one request is pulled per iteration before expired entries are drained.
if (processQu.try_dequeue(request)) {
uint64_t deadline = request.sendtime() + request.bound();
uint64_t reqKey = CONCAT_UINT32(request.clientid(), request.reqid());
earlyBuffer.insert({{deadline, reqKey}, request});
}
uint64_t nowTime = GetMicrosecondTimestamp();
// Release every buffered request whose deadline has been reached.
while (earlyBuffer.empty() == false &&
earlyBuffer.begin()->first.first <= nowTime) {
traceVec.push_back({earlyBuffer.begin()->second.clientid(),
earlyBuffer.begin()->second.reqid()});
earlyBuffer.erase(earlyBuffer.begin());
if (traceVec.size() >= 10000000ul) {
break;
}
}
} else {
// Without DOM, record requests exactly as they are dequeued.
while (processQu.try_dequeue(request)) {
traceVec.push_back({request.clientid(), request.reqid()});
if (traceVec.size() >= 10000000ul) {
break;
}
}
}
uint64_t nowTime = GetMicrosecondTimestamp();
// Stop after 60 s of wall time or when the trace is full.
if (nowTime - startTime >= 60 * 1000ul * 1000ul ||
traceVec.size() >= 10000000ul) {
LOG(INFO) << "To terminated ..." << traceVec.size();
std::ofstream ofs("Replica-Stats-" + std::to_string(FLAGS_replica_id) +
".csv");
// ofs << "ClientID,ReqID" << std::endl;
// One tab-separated "clientId reqId" pair per line, no header.
for (auto& p : traceVec) {
ofs << p.first << "\t" << p.second << std::endl;
}
ofs.close();
// Terminates the whole process, including the endpoint loop in main().
exit(0);
}
}
}
void OWDTd() {
std::pair owdSample;
std::map> owdMap;
std::map owdCnt;
UDPSocketEndpoint* replyEP = dynamic_cast(
CreateEndpoint(EndpointType::UDP_ENDPOINT));
nezha::proto::Reply reply;
reply.set_replicaid(FLAGS_replica_id);
while (true) {
if (owdQu.try_dequeue(owdSample)) {
uint32_t senderId = owdSample.first;
uint32_t owd = owdSample.second;
if (owdMap.find(senderId) == owdMap.end()) {
owdMap[senderId].resize(1000);
owdCnt[senderId] = 0;
}
owdMap[senderId][owdCnt[senderId] % 1000] = owd;
owdCnt[senderId]++;
if (owdCnt[senderId] % 1000 == 0) {
std::vector temp = owdMap[senderId];
sort(temp.begin(), temp.end());
uint32_t estimate = temp[1000 * FLAGS_percentile / 100];
reply.set_clientid(senderId);
reply.set_owd(estimate +
10); // plus the 3 * error bound (sigma1+sigma2), the
// sigma ranges 1-3, here we plus 10 to simulate it
Address* clientAddr = clientAddrs.get(senderId);
if (clientAddr) {
// LOG(INFO) << "Send to " << senderId << "\t" << estimate;
replyEP->SendMsgTo(*clientAddr, reply, MessageType::FAST_REPLY);
}
}
}
}
}
int main(int argc, char* argv[]) {
gflags::ParseCommandLineFlags(&argc, &argv, true);
google::InitGoogleLogging(argv[0]);
FLAGS_logtostderr = 1;
std::thread* processTd = new std::thread(ProcessTd);
std::thread* owdTd = new std::thread(OWDTd);
Endpoint* requestEP = CreateEndpoint(
EndpointType::UDP_ENDPOINT, FLAGS_receiver_ip, FLAGS_receiver_port, true);
UDPMsgHandler* msgHandler = new UDPMsgHandler(MsgHandlerFunc);
requestEP->RegisterMsgHandler(msgHandler);
requestEP->LoopRun();
processTd->join();
owdTd->join();
delete requestEP;
delete processTd;
delete owdTd;
}
================================================
FILE: micro-bench/bench_sender.cc
================================================
#include
#include
#include "lib/utils.h"
#include "lib/zipfian.h"
#include "proto/nezha_proto.pb.h"
// Addresses of up to five receivers; only the first receiver_num are used.
DEFINE_string(receiver_1_ip, "127.0.0.1", "The ip address of the 1st receiver");
DEFINE_string(receiver_2_ip, "127.0.0.1", "The ip address of the 2nd receiver");
DEFINE_string(receiver_3_ip, "127.0.0.1", "The ip address of the 3rd receiver");
DEFINE_string(receiver_4_ip, "127.0.0.1", "The ip address of the 4th receiver");
DEFINE_string(receiver_5_ip, "127.0.0.1", "The ip address of the 5th receiver");
DEFINE_int32(receiver_1_port, 33333, "The port of the 1st receiver");
DEFINE_int32(receiver_2_port, 33333, "The port of the 2nd receiver");
DEFINE_int32(receiver_3_port, 33333, "The port of the 3rd receiver");
DEFINE_int32(receiver_4_port, 33333, "The port of the 4th receiver");
DEFINE_int32(receiver_5_port, 33333, "The port of the 5th receiver");
DEFINE_int32(receiver_num, 2, "The number of receivers to test");
// Local address and port on which OWD replies from the receivers are received.
DEFINE_string(client_ip, "127.0.0.1", "The ip address of the client");
DEFINE_int32(client_port, 33336,
"The port of the client listens for OWD reply");
DEFINE_uint64(poisson_rate, 10000, "Request Per Second");
// NOTE(review): the unit is presumably seconds (main draws duration * 100 slots of 10 ms) -- confirm.
DEFINE_uint64(duration, 60, "Duration of the experiment");
DEFINE_uint64(client_id, 1, "Client ID");
// Latest OWD estimate per receiver, indexed by replica id.
std::vector latencyBounds;
// Largest entry of latencyBounds; stamped into every outgoing request.
std::atomic bound;
/**
 * Handles one OWD reply from a receiver.
 *
 * Only non-empty FAST_REPLY messages that parse as a Reply are used. An
 * estimate in the open interval (0, 200) replaces the stored estimate for
 * the replying replica, and the shared bound is updated to the largest
 * stored estimate. Replies whose replica id does not index into
 * latencyBounds are dropped, because the id comes straight off the network
 * and would otherwise write outside the vector.
 *
 * @param msgHeader Header carrying the message type and payload length.
 * @param msgBuffer Serialized payload.
 * @param sender    Unused.
 * @param context   Unused.
 */
void ReplyHandlerFunc(MessageHeader* msgHeader, char* msgBuffer,
                      Address* sender, void* context) {
  if (!(msgHeader->msgType == MessageType::FAST_REPLY &&
        msgHeader->msgLen > 0)) {
    return;
  }
  nezha::proto::Reply reply;
  if (!reply.ParseFromArray(msgBuffer, msgHeader->msgLen)) {
    return;
  }
  if (reply.replicaid() >= latencyBounds.size()) {
    LOG(WARNING) << "Ignore OWD reply from unknown replica "
                 << reply.replicaid();
    return;
  }
  if (reply.owd() > 0 && reply.owd() < 200) {
    latencyBounds[reply.replicaid()] = reply.owd();
    auto it =
        std::max_element(std::begin(latencyBounds), std::end(latencyBounds));
    // Only touch the atomic when the maximum actually changed.
    if (*it != bound) {
      bound.store(*it);
    }
  }
}
// Thread body that receives OWD replies. Initialises one estimate per
// receiver and the shared bound to 80, then runs a UDP endpoint bound to
// FLAGS_client_ip:FLAGS_client_port with ReplyHandlerFunc as the handler.
void OWDUpdate() {
latencyBounds.resize(FLAGS_receiver_num, 80);
bound = 80;
UDPSocketEndpoint* replyEP = dynamic_cast(CreateEndpoint(
EndpointType::UDP_ENDPOINT, FLAGS_client_ip, FLAGS_client_port));
UDPMsgHandler* msgHandler = new UDPMsgHandler(ReplyHandlerFunc);
replyEP->RegisterMsgHandler(msgHandler);
// NOTE(review): presumably blocks for the lifetime of the process -- confirm in Endpoint::LoopRun.
replyEP->LoopRun();
}
int main(int argc, char* argv[]) {
gflags::ParseCommandLineFlags(&argc, &argv, true);
google::InitGoogleLogging(argv[0]);
FLAGS_logtostderr = 1;
Endpoint* requestEP =
CreateEndpoint(EndpointType::UDP_ENDPOINT, "", -1, true);
LOG(INFO) << "ClientId = " << FLAGS_client_id << "\t"
<< " rate=" << FLAGS_poisson_rate;
std::vector reqPer10msVec;
reqPer10msVec.reserve(FLAGS_duration * 100);
std::default_random_engine generator(
FLAGS_client_id); // clientId as the seed
std::poisson_distribution distribution(FLAGS_poisson_rate / 100);
for (uint32_t i = 0; i < FLAGS_duration * 100; i++) {
reqPer10msVec.push_back(distribution(generator));
}
uint32_t maxReqId = FLAGS_poisson_rate * (FLAGS_duration - 10);
std::thread* replyTd = new std::thread(OWDUpdate);
uint32_t reqCnt = 0;
std::vector receiverAddrs;
receiverAddrs.resize(5, NULL);
receiverAddrs[0] = new Address(FLAGS_receiver_1_ip, FLAGS_receiver_1_port);
receiverAddrs[1] = new Address(FLAGS_receiver_2_ip, FLAGS_receiver_2_port);
receiverAddrs[2] = new Address(FLAGS_receiver_3_ip, FLAGS_receiver_3_port);
receiverAddrs[3] = new Address(FLAGS_receiver_4_ip, FLAGS_receiver_4_port);
receiverAddrs[4] = new Address(FLAGS_receiver_5_ip, FLAGS_receiver_5_port);
nezha::proto::Request request;
request.set_clientid(FLAGS_client_id);
srand(FLAGS_client_id);
for (uint32_t i = 0; i < reqPer10msVec.size(); i++) {
uint32_t reqNum = reqPer10msVec[i];
if (reqNum <= 0) {
usleep(10000);
} else {
uint32_t intval = 10000 / reqNum;
uint64_t nowTime = GetMicrosecondTimestamp();
for (uint32_t j = 1; j <= reqNum; j++) {
while (GetMicrosecondTimestamp() < nowTime + intval * j) {
}
uint64_t sendTime = GetMicrosecondTimestamp();
request.set_sendtime(sendTime);
request.set_bound(bound);
request.set_reqid(reqCnt + 1);
for (int k = 0; k < FLAGS_receiver_num; k++) {
requestEP->SendMsgTo(*(receiverAddrs[k]), request,
MessageType::CLIENT_REQUEST);
}
reqCnt++;
if (reqCnt >= maxReqId) {
LOG(INFO) << "reqCnt=" << reqCnt << "\tTerminate Here";
exit(0);
}
}
}
}
delete requestEP;
replyTd->join();
delete replyTd;
}
================================================
FILE: micro-bench/launch_micro.py
================================================
import os
import subprocess
from subprocess import PIPE, Popen
import time
import ruamel.yaml
from termcolor import colored
import argparse
# Home directory on the machines; also used as the base for binaries and stats folders.
LOGIN_PATH = "/home/steam1994"
# Prefix of the VM instance names built in the main script.
TAG = "opensource-test"
# Private key passed to ssh/scp with -i; an empty value omits the option.
SSH_KEY = "/home/steam1994/.ssh/id_rsa"
ssh_identity = '-i {}'.format(SSH_KEY) if SSH_KEY else ''
# Prefix for SSH and SCP.
SSH = 'ssh {} -q -o ConnectTimeout=2 -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no '.format(
ssh_identity)
SCP = 'scp -r {} -q -o ConnectTimeout=2 -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no '.format(
ssh_identity)
# Remote login user for ssh/scp.
USERNAME = "steam1994"
# Upper bound on the retry rounds in scp_files and run_command.
CMD_RETRY_TIMES = 3
def generate_ttcs_cfg_file(internal_ip, is_reference=False, use_ntp=False):
    """Write ttcs-agent.cfg into the current directory and return its name.

    The three variants differ only in their last two settings:
      * reference node:          clock_quality 10, correct_clock false
      * non-reference, NTP kept: clock_quality 1,  correct_clock false
      * non-reference, no NTP:   clock_quality 1,  correct_clock true

    args
    internal_ip: address written as management_address and probe_address
    is_reference: whether this node is the reference node
    use_ntp: only consulted when is_reference is false
    returns
    the name of the file that was written ("ttcs-agent.cfg")
    """
    if is_reference:
        clock_quality, correct_clock = 10, "false"
    elif use_ntp:
        clock_quality, correct_clock = 1, "false"
    else:
        clock_quality, correct_clock = 1, "true"

    content_str = '''management_address: "InternalIP"
log_dir: "/var/opt/ttcs/log"
subscription_mode: true
coordinator_address: "c-gjk1994gjk1994-c89e.gcp.clockwork.io"
coordinator_subscription_service_port: 6176
probe_address: "InternalIP"
clock_quality: {clock_quality}
correct_clock: {correct_clock}'''.format(clock_quality=clock_quality,
                                         correct_clock=correct_clock)
    # Substitute the address last so it never passes through str.format.
    cfg_file = content_str.replace("InternalIP", internal_ip)
    cfg_file_name = "ttcs-agent.cfg"
    with open(cfg_file_name, "w") as f:
        f.write(cfg_file)
    return cfg_file_name
def retry_proc_error(procs_list):
    """Wait for every process and relaunch the ones that failed.

    args
    procs_list: list of (server, Popen object, command string) tuples
    returns
    a list of (server, new Popen object, command string) tuples, one per
    process that exited with a non-zero return code; the new processes have
    been started but not waited for
    """
    procs_error = []
    for server, proc, cmd in procs_list:
        # communicate() drains the pipes and waits; the output is not needed here.
        proc.communicate()
        if proc.returncode != 0:
            retried = Popen(cmd.split(), stdout=PIPE, stderr=PIPE)
            procs_error.append((server, retried, cmd))
    return procs_error
def start_ttcs_node(internal_ip, is_reference, use_ntp=False):
    """Reinstall, configure and start the ttcs-agent on one machine.

    Purges any installed ttcs-agent package, installs the 1.0.21 .deb from
    the remote home directory, uploads a freshly generated ttcs-agent.cfg to
    /etc/opt/ttcs, switches the system time services according to the node
    role, and finally starts the ttcs-agent service.

    args
    internal_ip: address of the machine to set up
    is_reference: whether the machine is the reference node
    use_ntp: whether a non-reference machine keeps its time service running
    """
    target = [internal_ip]

    def remote(command):
        # Every step runs in the foreground on the single target machine.
        run_command(target, command, in_background=False)

    remote("sudo dpkg -P ttcs-agent")
    remote("sudo dpkg -i /home/steam1994/ttcs-agent_1.0.21_amd64.deb")

    # Writes ./ttcs-agent.cfg locally; it is uploaded below.
    generate_ttcs_cfg_file(internal_ip, is_reference, use_ntp)
    remote_dir = "/etc/opt/ttcs"
    remote_path = remote_dir + "/ttcs-agent.cfg"
    remote("sudo chmod -R 777 {remote_dir}".format(remote_dir=remote_dir))
    remote("sudo rm -f {remote_path}".format(remote_path=remote_path))
    scp_files(target, "./ttcs-agent.cfg", remote_path, to_remote=True)

    if is_reference is not True and use_ntp is False:
        time_service_cmds = [
            "sudo systemctl stop ntp",
            "sudo systemctl disable ntp",
            "sudo systemctl stop chronyd",
            "sudo systemctl disable chronyd",
        ]
    else:
        time_service_cmds = [
            "sudo systemctl enable chronyd",
            "sudo systemctl start chronyd",
        ]
    for command in time_service_cmds:
        remote(command)

    remote("sudo systemctl start ttcs-agent")
def launch_ttcs(server_ip_list):
    """Stop and disable chronyd and ntp on all servers, then start ttcs-agent.

    args
    server_ip_list: list of IP addresses to run the commands on
    """
    steps = (
        "sudo systemctl stop chronyd",
        "sudo systemctl disable chronyd",
        "sudo systemctl stop ntp",
        "sudo systemctl disable ntp",
        "sudo systemctl start ttcs-agent",
    )
    for step in steps:
        run_command(server_ip_list, step, in_background=False)
def scp_files(server_ip_list, local_path_to_file, remote_dir, to_remote):
    '''
    copies the file in 'local_path_to_file' to the 'remote_dir' in all servers
    whose external ip addresses are in 'server_ip_list'

    A failed transfer is relaunched, for at most CMD_RETRY_TIMES rounds. Each
    round waits for the transfers of the previous round and relaunches only
    those that failed; the outcome of the final round decides the result.

    args
    server_ip_list: list of external IP addresses to communicate with
    local_path_to_file: e.g. ./script.py
    remote_dir: e.g. ~
    to_remote: whether to copy to remote (true) or vice versa (false)
    returns
    boolean whether operation was succesful on all servers or not
    '''
    src = remote_dir if not to_remote else local_path_to_file
    src_loc = 'remote' if not to_remote else 'local'
    dst = remote_dir if to_remote else local_path_to_file
    dst_loc = 'remote' if to_remote else 'local'
    message = 'from ({src_loc}) {src} to ({dst_loc}) {dst}'.format(
        src_loc=src_loc, src=src, dst_loc=dst_loc, dst=dst)
    print('---- started scp {}'.format(message))

    procs = []
    for server in server_ip_list:
        if to_remote:
            cmd = '{} {} {}@{}:{}'.format(SCP, local_path_to_file,
                                          USERNAME, server, remote_dir)
        else:
            cmd = '{} {}@{}:{} {}'.format(SCP, USERNAME, server,
                                          remote_dir, local_path_to_file)
        proc = Popen(cmd.split(), stdout=PIPE, stderr=PIPE)
        procs.append((server, proc, cmd))

    procs_error = retry_proc_error(procs)
    retries = 1
    while retries < CMD_RETRY_TIMES and procs_error:
        # Check the relaunched transfers, not the original ones again.
        procs_error = retry_proc_error(procs_error)
        retries += 1

    # Whatever is left was relaunched in the last round and still has to be
    # waited for; only an actual non-zero exit counts as a failure.
    success = True
    for server, proc, cmd in procs_error:
        output, err = proc.communicate()
        if proc.returncode != 0:
            success = False
            print(
                colored('[{}]: FAIL SCP - [{}]'.format(server, cmd),
                        'yellow'))
            print(colored('Error Response:', 'blue', attrs=['bold']),
                  proc.returncode, output, err)

    if success:
        print(
            colored('---- SUCCESS SCP {} on {}'.format(message,
                                                       str(server_ip_list)),
                    'green',
                    attrs=['bold']))
    else:
        print(
            colored('---- FAIL SCP {}'.format(message), 'red', attrs=['bold']))
    return success
def run_command(server_ip_list, cmd, in_background=True):
    '''
    runs the command 'cmd' in all servers whose external ip addresses are
    in 'server_ip_list'

    In the foreground mode a failed command is relaunched, for at most
    CMD_RETRY_TIMES rounds. Each round waits for the commands of the previous
    round and relaunches only those that failed; the outcome of the final
    round decides the result. In the background mode the processes are
    started and not waited for, so the result is always reported as success.

    args
    server_ip_list: list of external IP addresses to communicate with
    cmd: command to run
    in_background: whether to return without waiting for the command
    returns
    a tuple (success, output): success is a boolean whether operation was
    succesful on all servers or not; output is '' unless a command had to be
    relaunched, in which case it is the stdout of the last relaunched process
    '''
    if not in_background:
        print('---- started to run command - [{}] on {}'.format(
            cmd, str(server_ip_list)))
    else:
        print(
            colored('---- started to run [IN BACKGROUND] command - [{}] on {}'.
                    format(cmd, str(server_ip_list)),
                    'blue',
                    attrs=['bold']))

    procs = []
    for server in server_ip_list:
        ssh_cmd = '{} {}@{} {}'.format(SSH, USERNAME, server, cmd)
        proc = Popen(ssh_cmd.split(), stdout=PIPE, stderr=PIPE)
        procs.append((server, proc, ssh_cmd))

    success = True
    output = ''
    if not in_background:
        procs_error = retry_proc_error(procs)
        retries = 1
        while retries < CMD_RETRY_TIMES and procs_error:
            # Check the relaunched commands, not the original ones again.
            procs_error = retry_proc_error(procs_error)
            retries += 1

        # Whatever is left was relaunched in the last round and still has to
        # be waited for; only an actual non-zero exit counts as a failure.
        # Distinct loop names keep the 'cmd' parameter intact for the summary.
        for server, proc, failed_cmd in procs_error:
            output, err = proc.communicate()
            if proc.returncode != 0:
                success = False
                print(
                    colored(
                        '[{}]: FAIL run command - [{}]'.format(
                            server, failed_cmd), 'yellow'))
                print(colored('Error Response:', 'blue', attrs=['bold']),
                      proc.returncode, output, err)

    if success:
        print(
            colored('---- SUCCESS run command - [{}] on {}'.format(
                cmd, str(server_ip_list)),
                    'green',
                    attrs=['bold']))
    else:
        print(
            colored('---- FAIL run command - [{}]'.format(cmd),
                    'red',
                    attrs=['bold']))
    return success, output
def create_instance(instance_name,
                    image=None,
                    machine_type = "n1-standard-4",
                    customzedZone = "us-central1-a",
                    customzedIp = None,
                    require_external_ip=False,
                    second_ip = False
                    ):
    """Create one GCE instance with `gcloud beta compute instances create`.

    The primary --network-interface argument is assembled from the options
    that apply (no-address, private-network-ip) so that it stays well formed
    for every combination; in particular a custom IP together with
    require_external_ip=True no longer yields a dangling
    ",private-network-ip=..." token.

    args
    instance_name: name of the instance to create
    image: image family to boot from (passed to --image-family)
    machine_type: GCE machine type
    customzedZone: zone to create the instance in
    customzedIp: private IP to assign to the primary interface, if any
    require_external_ip: when False the primary interface gets no external address
    second_ip: whether to add a second interface on subnet-1 without external address
    """
    # Construct gcloud command to create instance.
    interface_opts = []
    if require_external_ip == False:
        interface_opts.append("no-address")
    if customzedIp is not None:
        interface_opts.append("private-network-ip=" + customzedIp)
    network_address_config = ""
    if interface_opts:
        network_address_config = ("--network-interface " +
                                  ",".join(interface_opts))
    if second_ip:
        network_address_config += " --network-interface subnet=subnet-1,no-address"

    scopes = "--scopes=https://www.googleapis.com/auth/cloud-platform"
    create_instance_cmd = """gcloud beta compute instances create {inst} --zone {zone} --image-family {source_image} --machine-type {machine_type} {network} {scopes} --boot-disk-size 50GB""".format(
        inst=instance_name,
        zone=customzedZone,
        source_image=image,
        machine_type=machine_type,
        network=network_address_config,
        scopes=scopes,
    )
    # Run gcloud command to create machine.
    proc = Popen(create_instance_cmd, stdout=PIPE, stderr=PIPE, shell=True)
    # Wait for the process end and print error in case of failure
    output, error = proc.communicate()
    if proc.returncode != 0:
        print(colored("Failed to create instance", color="red",
                      attrs=["bold"]))
        print(colored("Error Response: ", color="blue", attrs=["bold"]),
              output, error)
def del_instance_list(instance_list, zone="us-central1-a"):
    """Start one `gcloud compute instances delete` per instance without waiting.

    args
    instance_list: names of the instances to delete
    zone: zone the instances live in
    """
    delete_template = 'gcloud -q compute instances delete {inst} --zone {zone}'
    for machine in instance_list:
        print(colored("Deleting "+machine, "red", attrs=['bold']))
        delete_cmd = delete_template.format(inst=machine, zone=zone)
        # The process is launched and left running; its result is not checked.
        subprocess.Popen(delete_cmd.split())
def stop_instance_list(instance_list, zone="us-central1-a"):
    """Stop all listed instances with a single gcloud invocation.

    args
    instance_list: names of the instances to stop
    zone: zone the instances live in
    """
    names = ' '.join(instance_list)
    stop_cmd = 'gcloud compute instances stop ' + names + ' --zone ' + zone
    # Echo the command before running it through the shell.
    print(stop_cmd)
    os.system(stop_cmd)
def start_instance_list(instance_list, zone="us-central1-a"):
    """Start all listed instances with a single gcloud invocation.

    args
    instance_list: names of the instances to start
    zone: zone the instances live in
    """
    names = ' '.join(instance_list)
    start_cmd = 'gcloud compute instances start ' + names + ' --zone ' + zone
    # Echo the command before running it through the shell.
    print(start_cmd)
    os.system(start_cmd)
# Script entry point: parse the cluster size, then derive the IP and instance
# name lists used by the experiment loop further below.
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Process some integers.')
parser.add_argument('--num_replicas', type=int, default = 3,
help='Specify the number of replicas ')
parser.add_argument('--num_proxies', type=int, default = 2,
help='Specify the number of proxies ')
parser.add_argument('--num_clients', type=int, default = 10,
help='Specify the number of clients ')
args = parser.parse_args()
num_replicas = args.num_replicas
num_proxies = args.num_proxies
num_clients = args.num_clients
print("replicas: ", num_replicas)
print("proxies: ", num_proxies)
print("clients: ", num_clients)
# cfg_file_name = generate_ttcs_cfg_file("10.128.3.79", is_reference=True, use_ntp=False)
# Fixed address plan: replicas from 10.128.2.10, proxies from 10.128.2.20,
# clients from 10.128.2.30; each list is then cut to the requested size.
replica_ips = ["10.128.2."+str(i+10) for i in range(10)]
proxy_ips = ["10.128.2."+str(i+20) for i in range(10) ]
client_ips = ["10.128.2."+str(i+30) for i in range(100) ]
replica_ips = replica_ips[0:num_replicas]
proxy_ips = proxy_ips[0:num_proxies]
client_ips = client_ips[0:num_clients]
replica_name_list = [TAG+"-replica-"+str(i) for i in range(num_replicas) ]
proxy_name_list = [ TAG+"-proxy-"+str(i) for i in range(num_proxies) ]
client_name_list = [ TAG+"-client-"+str(i) for i in range(num_clients) ]
# vm_ips is taken before replica_ips is padded with loopback addresses below.
vm_ips = replica_ips + proxy_ips + client_ips
vm_name_list = replica_name_list + proxy_name_list + client_name_list
# Machine types; only referenced by the commented-out create_instance calls below.
replica_vm_type = "n1-standard-16"
proxy_vm_type = "n1-standard-32"
client_vm_type = "n1-standard-4"
binary_path = "{login_path}/nezhav2/bazel-bin/".format(login_path = LOGIN_PATH)
config_path = "{login_path}/nezhav2/configs".format(login_path = LOGIN_PATH)
yaml = ruamel.yaml.YAML()
# for i in range(num_replicas):
# create_instance(instance_name = replica_name_list[i],
# image= "opensource-nezha",
# machine_type = replica_vm_type,
# customzedZone="us-central1-a",
# customzedIp = replica_ips[i] )
# print(colored("Created "+replica_name_list[i], "green", attrs=['bold']))
# exit(0)
# for i in range(num_proxies):
# create_instance(instance_name = proxy_name_list[i],
# image= "opensource-nezha",
# machine_type = proxy_vm_type,
# customzedZone="us-central1-a",
# customzedIp = proxy_ips[i] )
# print(colored("Created "+proxy_name_list[i], "green", attrs=['bold']))
# for i in range(num_clients):
# create_instance(instance_name = client_name_list[i],
# image= "opensource-nezha",
# machine_type = client_vm_type,
# customzedZone="us-central1-a",
# customzedIp = client_ips[i] )
# print(colored("Created "+client_name_list[i], "green", attrs=['bold']))
# time.sleep(120)
# for i in range(len(vm_ips)):
# start_ttcs_node(vm_ips[i],False)
# exit(0)
#### del_instance_list(instance_list=vm_name_list)
# stop_instance_list(instance_list = vm_name_list)
# exit(0)
# start_instance_list(instance_list = vm_name_list)
# time.sleep(60)
# print(vm_ips)
# launch_ttcs(vm_ips)
# exit(0)
# start_ttcs_node(replica_ips[3],False)
# exit(0)
# Experiment parameters. test_no and percentile are overwritten by the loops below.
test_no = 1
enable_dom =1
# enable_dom = 1
#poisson_rate = 10000
poisson_rate = 5000
percentile = 50
# bench_sender always receives five receiver addresses, so pad with loopback.
while len(replica_ips) < 5:
replica_ips += ["127.0.0.1"]
print(replica_ips)
# Five repetitions; each one redeploys the binaries, runs the benchmark and
# collects the per-replica stats files.
for test_no in range(1,6):
for percentile in [50]: #[50,75,90,95]:
# Wipe the remote bazel-bin directory and upload fresh sender/receiver binaries.
remote_path = "{login_path}/nezhav2/bazel-bin/*".format(login_path = LOGIN_PATH)
rm_cmd = "sudo rm -rf {remote_path}".format(remote_path=remote_path)
run_command(vm_ips, rm_cmd, in_background=False)
mkdir_cmd = "mkdir -p {binary_path}/micro-bench".format(binary_path = binary_path)
run_command(vm_ips, mkdir_cmd, in_background=False)
binary_file = "{binary_path}/micro-bench/bench_sender".format(binary_path=binary_path)
scp_files(vm_ips, binary_file, binary_file, to_remote = True)
binary_file = "{binary_path}/micro-bench/bench_receiver".format(binary_path=binary_path)
scp_files(vm_ips, binary_file, binary_file, to_remote = True)
# Kill existing procs
kill_cmd = "sudo pkill -9 bench_receiver"
run_command(vm_ips, kill_cmd, in_background=False)
kill_cmd = "sudo pkill -9 bench_sender"
run_command(vm_ips, kill_cmd, in_background=False)
# Remove stats files left over from a previous run.
rm_cmd = "sudo rm -rf Replica-Stats*.csv"
run_command(vm_ips, rm_cmd, in_background=False)
## Launch replicas (id starts from 0)
for i in range(num_replicas):
replica_cmd = "{binary_path}/micro-bench/bench_receiver --receiver_ip {ip} --replica_id {id} --enable_dom {enable_dom} --percentile {percentile} >{log_file} 2>&1 &".format(
binary_path = binary_path,
ip = replica_ips[i],
id = i,
enable_dom = enable_dom,
percentile = percentile,
log_file = "receiver-log-"+str(i)
)
print(colored(replica_cmd, "yellow", attrs=['bold']))
run_command([replica_ips[i]], replica_cmd, in_background=False)
# Launch clients (id starts from 1)
for i in range(num_clients):
client_cmd = "{binary_path}/micro-bench/bench_sender --receiver_1_ip {ip1} --receiver_2_ip {ip2} --receiver_3_ip {ip3} --receiver_4_ip {ip4} --receiver_5_ip {ip5} --receiver_num {receiver_num} --client_ip {myip} --poisson_rate {poisson_rate} --client_id {id} >{log_file} 2>&1 &".format(
binary_path = binary_path,
ip1 = replica_ips[0],
ip2 = replica_ips[1],
ip3 = replica_ips[2],
ip4 = replica_ips[3],
ip5 = replica_ips[4],
receiver_num = num_replicas,
myip = client_ips[i],
poisson_rate = poisson_rate,
id = i+1,
log_file = "client-log-"+str(i+1)
)
print(colored(client_cmd, "yellow", attrs=['bold']))
run_command([client_ips[i]], client_cmd, in_background = True)
# exit(0)
# Wait for the run to finish; the receiver writes its stats file after 60 seconds.
print("Sleep...")
time.sleep(90)
# Copy Stats File
folder_name = "micro-stats"
sub_folder_name = "T-{test_no}-{num_replicas}-{num_clients}-{poisson_rate}-{enable_dom}-{percentile}".format(
test_no = test_no,
num_replicas = num_replicas,
num_clients = num_clients,
poisson_rate = poisson_rate,
enable_dom = enable_dom,
percentile = percentile
)
stats_folder = "{login_path}/{folder_name}/{sub_folder_name}".format(
login_path = LOGIN_PATH,
folder_name = folder_name,
sub_folder_name = sub_folder_name
)
# The stats folder is created on the machine running this script.
mkdir_cmd = "sudo mkdir -p -m 777 {stats_folder}".format(stats_folder = stats_folder)
os.system(mkdir_cmd)
for i in range(num_replicas):
file_name = "Replica-Stats-"+str(i)+".csv"
local_file_path = "{stats_folder}/{file_name}".format(
stats_folder = stats_folder,
file_name = file_name
)
# On the replica the file is expected directly under LOGIN_PATH.
remote_path = "{stats_folder}/{file_name}".format(
stats_folder = LOGIN_PATH,
file_name = file_name
)
scp_files([replica_ips[i]], local_file_path, remote_path, to_remote=False)
================================================
FILE: proto/BUILD
================================================
load("@rules_proto//proto:defs.bzl", "proto_library")
# Protocol definitions from nezha_proto.proto, visible to every package.
proto_library(
    name = "nezha_proto",
    srcs = ["nezha_proto.proto"],
    visibility = ["//visibility:public"],
)
# C++ bindings generated from :nezha_proto, visible to every package.
cc_proto_library(
    name = "nezha_cc_proto",
    visibility = ["//visibility:public"],
    deps = [":nezha_proto"],
)
================================================
FILE: proto/nezha_proto.proto
================================================
syntax = "proto3";
package nezha.proto;
// A client request. The micro-benchmark sender fills sendtime from
// GetMicrosecondTimestamp() and bound from its current latency bound; the
// micro-benchmark receiver uses sendtime + bound as the request's deadline.
message Request {
uint64 sendtime = 1;
uint32 bound=2;
uint32 clientid = 3;
uint32 reqid = 4;
bytes command=5;
uint64 proxyid = 6;
uint32 key = 7;
bool iswrite = 8;
}
// The body of a request identified by (deadline, reqkey).
// NOTE(review): reqkey is presumably the client id and request id packed
// into 64 bits, as CONCAT_UINT32 does in the micro-benchmark -- confirm.
message RequestBodyMsg {
uint64 deadline = 1;
uint64 reqkey = 2;
uint64 proxyid = 3;
bytes command = 4;
uint32 key = 5;
bool iswrite = 6;
}
// Timestamps collected for one request.
// NOTE(review): units and the component that sets each field are not
// visible here -- confirm against the replica/proxy code.
message TimeStats {
uint64 clienttime = 1;
uint64 proxytime = 2;
uint64 recvtime =3;
uint64 fastreplytime = 4;
uint64 slowreplytime= 5;
uint64 deadline = 6;
}
// A reply sent back towards the client. The micro-benchmark receiver only
// sets replicaid, clientid and owd (its one-way-delay estimate).
message Reply {
uint32 clientid = 1;
uint32 reqid = 2;
uint32 view = 3;
uint32 replicaid = 4;
bytes hash = 5;
bytes result = 6;
uint32 replytype = 7;
uint32 owd = 8;
uint32 maxsyncedlogid = 9; // This is the largest syncedlogid of my synced logs
uint32 logid = 10; // only set by the leader, it is the log id of the entry replied
bool iswrite = 11;
uint32 opkey = 12;
}
// Index synchronization message covering a range of log ids.
// NOTE(review): presumably sent by the leader to followers, with deadlines and
// reqkeys as parallel arrays for logidbegin..logidend — confirm.
message IndexSync {
uint32 logidbegin = 1;
uint32 logidend = 2;
repeated uint64 deadlines = 3;
repeated uint64 reqkeys =4;
uint32 view = 5;
uint64 sendtime = 6;
}
// Asks for the index entries of a log-id range.
// NOTE(review): replicaid presumably identifies the asking replica — confirm.
message AskIndex {
uint32 logidbegin = 1;
uint32 logidend = 2;
uint32 replicaid = 3;
}
// Asks for the requests identified by missedreqkeys.
// NOTE(review): replicaid presumably identifies the asking replica — confirm.
message AskReq {
repeated uint64 missedreqkeys = 1;
uint32 replicaid = 2;
}
// Carries a batch of request bodies.
// NOTE(review): presumably the answer to AskReq — confirm.
message MissedReq {
repeated RequestBodyMsg reqs = 1;
uint32 replicaid = 2;
}
// View-change request for a given view.
// NOTE(review): cv is presumably the sender's crash vector (compare
// CrashVectorRequest / CrashVectorReply) — confirm.
message ViewChangeRequest{
uint32 view = 1;
uint32 replicaid = 2;
repeated uint32 cv = 3;
}
// View-change message. It describes the sender's log by ranges instead of
// shipping the entries themselves (see the comments on the fields below).
message ViewChange {
uint32 view = 1;
uint32 replicaid = 2;
repeated uint32 cv = 3;
uint32 lastnormalview= 4;
// In the algo, we should include the logs in the viewchange msg
// But that is too large. As an implementation optimization, Let's use the following information, and later do state transfer to get the necessary entries
uint32 syncpoint = 5; // for synced logs: the max synced log id, no need to add syncbegin, because it is always CONCURRENT_MAP_START_IDX
uint32 unsynclogbegin = 6;
uint32 unsynclogend = 7;
}
// Asks for the log entries in logbegin..logend.
// NOTE(review): issynced presumably selects the synced vs. unsynced log —
// confirm against the replica code.
message StateTransferRequest {
uint32 view = 1;
uint32 replicaid = 2;
bool issynced = 3;
uint32 logbegin = 4;
uint32 logend = 5;
}
// Carries request bodies for a log range; the fields mirror
// StateTransferRequest plus the sender's cv and the entries themselves.
message StateTransferReply {
uint32 view = 1;
uint32 replicaid = 2;
repeated uint32 cv = 3;
bool issynced = 4;
uint32 logbegin = 5;
uint32 logend = 6;
repeated RequestBodyMsg reqs = 7;
}
// Start-view message for a view.
// NOTE(review): presumably sent by the new leader once the view change has
// completed, with syncedlogid as its max synced log id — confirm.
message StartView {
uint32 view = 1;
uint32 replicaid = 2;
repeated uint32 cv = 3;
uint32 syncedlogid = 4;
}
// Crash-vector request identified by a nonce.
// NOTE(review): presumably sent by a recovering replica (replica.cc registers
// a crash-vector request timer in RECOVERING status) — confirm.
message CrashVectorRequest {
bytes nonce = 1;
uint32 replicaid = 2;
}
// Reply to a CrashVectorRequest carrying the sender's cv.
// NOTE(review): nonce presumably echoes the request's nonce — confirm.
message CrashVectorReply {
bytes nonce = 1;
uint32 replicaid = 2;
repeated uint32 cv = 3;
}
// Recovery request carrying the sender's cv.
// NOTE(review): presumably sent by a recovering replica after it has obtained
// crash vectors — confirm.
message RecoveryRequest {
repeated uint32 cv = 1;
uint32 replicaid = 2;
}
// Reply to a RecoveryRequest: the sender's view, cv and synced log id.
message RecoveryReply {
uint32 view = 1;
repeated uint32 cv = 2;
uint32 replicaid = 3;
uint32 syncedlogid = 4;
}
// Report of a replica's sync status (view, cv, synced log id).
// NOTE(review): presumably sent periodically by followers to the leader —
// confirm.
message SyncStatusReport {
uint32 view = 1;
repeated uint32 cv = 2;
uint32 replicaid = 3;
uint32 syncedlogid = 4;
}
// Instruction carrying a committed log id.
// NOTE(review): presumably sent by the leader so followers can advance their
// commit point — confirm.
message CommitInstruction {
uint32 view = 1;
repeated uint32 cv = 2;
uint32 replicaid = 3;
uint32 committedlogid = 4;
}
================================================
FILE: proxy/BUILD
================================================
# Header-only library with the ProxyConfig struct (YAML-backed configuration).
cc_library(
name = "proxy_config",
hdrs = ["proxy_config.h"],
deps = [
"@com_github_jbeder_yaml_cpp//:yaml-cpp",
],
)
# The nezha::Proxy class itself (proxy.h / proxy.cc).
cc_library(
name = "proxy_class",
srcs = ["proxy.cc"],
hdrs = ["proxy.h"],
deps = [
"//proto:nezha_cc_proto",
"//lib:utils",
"//lib:address",
":proxy_config",
],
)
# Proxy executable; proxy_run.cc holds main().
cc_binary(
name = "nezha_proxy",
srcs = ["proxy_run.cc"],
deps = [
":proxy_class",
],
)
================================================
FILE: proxy/proxy.cc
================================================
#include "proxy/proxy.h"
namespace nezha {
/**
 * Build a proxy from the YAML file at configFile. A configuration error is
 * logged and ends the process with exit code 1; otherwise CreateContext()
 * prepares sockets, addresses and per-shard state.
 */
Proxy::Proxy(const std::string& configFile) {
  const std::string parseError = proxyConfig_.parseConfig(configFile);
  if (!parseError.empty()) {
    LOG(ERROR) << "Error parsing proxy config: " << parseError << "Exiting.";
    exit(1);
  }
  CreateContext();
}
/** Ask every worker loop to stop by clearing the running_ flag. */
void Proxy::Terminate() {
  LOG(INFO) << "Terminating...";
  running_.store(false);
}
/**
 * Start all worker threads and block until each of them has returned (which
 * happens after Terminate() clears running_).
 */
void Proxy::Run() {
  running_.store(true);
  LaunchThreads();
  for (auto worker = threadPool_.begin(); worker != threadPool_.end();
       ++worker) {
    LOG(INFO) << "Join " << worker->first;
    worker->second->join();
    LOG(INFO) << "Join Complete " << worker->first;
  }
  LOG(INFO) << "Run Terminated ";
}
/**
 * Release what the proxy owns: the thread objects, the UDP sockets, the
 * replica addresses and the recorded client addresses.
 *
 * The worker threads must already have been joined (Run() does that);
 * destroying a joinable std::thread terminates the process.
 */
Proxy::~Proxy() {
  for (auto& kv : threadPool_) {
    delete kv.second;
  }
  threadPool_.clear();

  // Close every socket created in CreateContext(); slots that never got a
  // valid descriptor hold -1 and are skipped.
  for (int fd : forwardFds_) {
    if (fd >= 0) close(fd);
  }
  for (int fd : requestReceiveFds_) {
    if (fd >= 0) close(fd);
  }
  for (int fd : replyFds_) {
    if (fd >= 0) close(fd);
  }

  // Each replica row is walked with its own size (the original used row 0's
  // size for every row). delete on a null pointer is a no-op.
  for (auto& shardAddrs : replicaAddrs_) {
    for (auto* addr : shardAddrs) {
      delete addr;
    }
    shardAddrs.clear();
  }

  // Clear Context (free memory)
  decltype(clientAddrs_)::Iterator clientIter(clientAddrs_);
  while (clientIter.isValid()) {
    delete clientIter.getValue();
    clientIter.next();
  }

  // Replies cached in committedReplyMap_ are not freed here.
  // NOTE(review): verify who owns those pointers in CheckQuorumTd before
  // enabling a cleanup loop for them.
}
/**
 * Create a non-blocking UDP socket.
 *
 * @param sip   IPv4 address to bind to; when empty the socket is left unbound.
 * @param sport Port to bind to (only used when sip is not empty).
 * @return The socket descriptor, or -1 on failure. On failure the descriptor
 *         is closed, so nothing leaks.
 */
int Proxy::CreateSocketFd(const std::string& sip, const int sport) {
  const int fd = socket(PF_INET, SOCK_DGRAM, 0);
  if (fd < 0) {
    LOG(ERROR) << "Receiver Fd fail ";
    return -1;
  }
  // Set Non-Blocking. F_GETFL can fail as well; OR-ing O_NONBLOCK into -1
  // would hand garbage flags to F_SETFL.
  const int flags = fcntl(fd, F_GETFL, 0);
  if (flags < 0 || fcntl(fd, F_SETFL, flags | O_NONBLOCK) < 0) {
    LOG(ERROR) << " Set NonBlocking Fail";
    close(fd);
    return -1;
  }
  if (!sip.empty()) {
    struct sockaddr_in addr;
    bzero(&addr, sizeof(addr));
    addr.sin_family = AF_INET;
    addr.sin_port = htons(sport);
    addr.sin_addr.s_addr = inet_addr(sip.c_str());
    // Bind socket to Address
    const int bindRet = bind(fd, (struct sockaddr*)&addr, sizeof(addr));
    if (bindRet != 0) {
      LOG(ERROR) << "bind error\t" << bindRet;
      close(fd);
      return -1;
    }
  }
  return fd;
}
/**
 * Spawn the worker threads and register them in threadPool_ under unique
 * names: one latency-bound thread, then one CheckQuorumTd and one
 * ForwardRequestsTd per configured shard. LogTd is currently not started.
 */
void Proxy::LaunchThreads() {
  const int shards = proxyConfig_.proxyShardNum;

  threadPool_["CalcLatencyBound"] =
      new std::thread(&Proxy::CalculateLatencyBoundTd, this);

  for (int shard = 0; shard < shards; ++shard) {
    const std::string name = "CheckQuorumTd-" + std::to_string(shard);
    threadPool_[name] = new std::thread(&Proxy::CheckQuorumTd, this, shard);
  }

  for (int shard = 0; shard < shards; ++shard) {
    const std::string name = "ForwardRequestsTd-" + std::to_string(shard);
    threadPool_[name] =
        new std::thread(&Proxy::ForwardRequestsTd, this, shard);
  }
}
void Proxy::CalculateLatencyBoundTd() {
std::pair owdSample;
std::vector replicaOWDs;
replicaOWDs.resize(proxyConfig_.replicaIps.size(),
proxyConfig_.replicaInitialOwd);
for (uint32_t i = 0; i < replicaOWDs.size(); i++) {
LOG(INFO) << "replicaOWD " << i << "\t" << replicaOWDs[i];
}
while (running_) {
while (owdQu_.try_dequeue(owdSample)) {
VLOG(1) << "replica=" << owdSample.first << "\towd=" << owdSample.second;
replicaOWDs[owdSample.first] = owdSample.second;
// Update latency bound
uint32_t estimatedOWD = 0;
for (uint32_t i = 0; i < replicaOWDs.size(); i++) {
if (estimatedOWD < replicaOWDs[i]) {
estimatedOWD = replicaOWDs[i];
}
}
if (estimatedOWD > maxOWD_) {
estimatedOWD = maxOWD_;
}
latencyBound_.store(estimatedOWD);
VLOG(1) << "Update bound " << latencyBound_;
}
usleep(5000);
}
}
/**
 * Worker loop that drains logQu_ into "Proxy-Stats-<proxyId>.csv" (header row
 * first, then one row per Log item) until running_ is cleared.
 *
 * Rows end with '\n' rather than std::endl so that the explicit flush every
 * 10000 rows is the only periodic flush; the stream is flushed once more when
 * it is destroyed at the end of the function.
 */
void Proxy::LogTd() {
  Log litem;
  std::ofstream ofs("Proxy-Stats-" + std::to_string(proxyConfig_.proxyId) +
                    ".csv");
  ofs << "ReplicaId,ClientId,RequestId,ClientTime,ProxyTime,"
         "ProxyEndProcessTime,RecvTime,Deadline,"
         "FastReplyTime,"
         "SlowReplyTime,"
         "ProxyRecvTime,CommitType"
      << std::endl;
  uint32_t logCnt = 0;
  while (running_) {
    if (!logQu_.try_dequeue(litem)) {
      // Nothing to write: back off briefly instead of spinning on the queue.
      usleep(1000);
      continue;
    }
    ofs << litem.ToString() << '\n';
    logCnt++;
    if (logCnt % 10000 == 0) {
      ofs.flush();
    }
  }
}
void Proxy::CheckQuorumTd(const int id) {
// ConcurrentMap& committedReply = committedReplyMap_[id];
std::unordered_map& committedReply = committedReplyMap_[id];
ConcurrentMap& logs = logMap_[id];
std::map> replyQuorum;
std::map uncommittedReply; // Key: logId, value: reqKey
uint32_t currentView = 0;
int sz = 0;
char buffer[UDP_BUFFER_SIZE];
MessageHeader* msgHdr = (MessageHeader*)(void*)buffer;
struct sockaddr_in recvAddr;
socklen_t sockLen = sizeof(recvAddr);
Reply reply;
Reply* committedAck = NULL;
uint32_t replyNum = 0;
uint64_t startTime, endTime;
std::vector& replicaSyncedPoint = replicaSyncedPoints_[id];
while (running_) {
if ((sz = recvfrom(forwardFds_[id], buffer, UDP_BUFFER_SIZE, 0,
(struct sockaddr*)(&recvAddr), &sockLen)) > 0) {
if ((uint32_t)sz < sizeof(MessageHeader) ||
(uint32_t)sz < msgHdr->msgLen + sizeof(MessageHeader)) {
continue;
}
if (reply.ParseFromArray(buffer + sizeof(MessageHeader),
msgHdr->msgLen)) {
uint64_t reqKey = CONCAT_UINT32(reply.clientid(), reply.reqid());
if (reply.owd() > 0) {
owdQu_.enqueue(
std::pair(reply.replicaid(), reply.owd()));
}
uint64_t syncPoint =
CONCAT_UINT32(reply.view(), reply.maxsyncedlogid());
if (replicaSyncedPoint[reply.replicaid()] < syncPoint) {
replicaSyncedPoint[reply.replicaid()] = syncPoint;
}
if (reply.clientid() == 0 && reply.reqid() == 0) {
// Dummy reply, just used to update
continue;
}
// committedAck = committedReply.get(reqKey);
// if (committedAck != NULL) {
// // already committed; ignore
// continue;
// }
auto iter = committedReply.find(reqKey);
if (iter != committedReply.end()) {
// already committed; ignore
continue;
}
if (reply.view() < currentView) {
LOG(INFO) << "Replied from old view";
continue;
}
if (currentView < reply.view()) {
// Replicas have upgraded to a new view
// Reset current state
currentView = reply.view();
uncommittedReply.clear();
replyQuorum.clear();
for (int i = 0; i < replicaNum_; i++) {
replicaSyncedPoint[i] = replicaSyncedPoint[reply.replicaid()];
}
currentView = reply.view();
}
// LOG(INFO) << reply.DebugString();
if (reply.replytype() == (uint32_t)MessageType::COMMIT_REPLY) {
committedAck = new Reply(reply);
// committedReply.assign(reqKey, committedAck);
} else if (replyQuorum[reqKey].find(reply.replicaid()) ==
replyQuorum[reqKey].end()) {
replyQuorum[reqKey][reply.replicaid()] = reply;
committedAck = isQuorumReady(replicaSyncedPoint, replyQuorum[reqKey]);
} else if (reply.view() > replyQuorum[reqKey].begin()->second.view()) {
// New view has come, clear existing replies for this request
uncommittedReply.clear();
replyQuorum[reqKey].clear();
replyQuorum[reqKey][reply.replicaid()] = reply;
for (int i = 0; i < replicaNum_; i++) {
replicaSyncedPoint[i] = replicaSyncedPoint[reply.replicaid()];
}
committedAck = isQuorumReady(replicaSyncedPoint, replyQuorum[reqKey]);
} else if (reply.view() == replyQuorum[reqKey].begin()->second.view()) {
const Reply& existedReply = replyQuorum[reqKey][reply.replicaid()];
if (existedReply.view() < reply.view()) {
replyQuorum[reqKey][reply.replicaid()] = reply;
} else if (existedReply.view() == reply.view() &&
existedReply.replytype() < reply.replytype()) {
// FAST_REPLY < SLOW_REPLY < COMMIT_REPLY
replyQuorum[reqKey][reply.replicaid()] = reply;
}
committedAck = isQuorumReady(replicaSyncedPoint, replyQuorum[reqKey]);
} // else: reply.view()< replyQuorum[reqKey].begin()->second.view(),
// ignore it
if (committedAck != NULL && committedAck->replytype() > 0) {
// Ack to client
struct sockaddr_in* clientAddr =
clientAddrs_.get(committedAck->clientid());
std::string replyMsg = committedAck->SerializeAsString();
msgHdr->msgType = MessageType::COMMIT_REPLY;
msgHdr->msgLen = replyMsg.length();
memcpy(buffer + sizeof(MessageHeader), replyMsg.c_str(),
replyMsg.length());
sendto(replyFds_[id], buffer,
replyMsg.length() + sizeof(MessageHeader), 0,
(struct sockaddr*)clientAddr, sizeof(sockaddr));
// Add to cache
// committedReply.assign(reqKey, committedAck);
committedReply[reqKey] = committedAck;
replyQuorum.erase(reqKey);
// Disable Log
// Log* litem = logs.get(reqKey);
// if (litem) {
// litem->proxyRecvTime_ = GetMicrosecondTimestamp();
// litem->commitType_ = committedAck->replytype();
// logQu_.enqueue(*litem);
// }
// Check whether some uncommittedReply can be committed
while ((!uncommittedReply.empty()) &&
uncommittedReply.begin()->first <= committedAck->logid()) {
Reply* ack = uncommittedReply.begin()->second;
ack->set_replytype(MessageType::COMMIT_REPLY);
if (uncommittedReply.begin()->first < committedAck->logid()) {
const Reply* ack = uncommittedReply.begin()->second;
struct sockaddr_in* clientAddr =
clientAddrs_.get(ack->clientid());
std::string replyMsg = ack->SerializeAsString();
msgHdr->msgType = MessageType::COMMIT_REPLY;
msgHdr->msgLen = replyMsg.length();
memcpy(buffer + sizeof(MessageHeader), replyMsg.c_str(),
replyMsg.length());
sendto(replyFds_[id], buffer,
replyMsg.length() + sizeof(MessageHeader), 0,
(struct sockaddr*)clientAddr, sizeof(sockaddr));
}
uint64_t reqKey = CONCAT_UINT32(ack->clientid(), ack->reqid());
// committedReply.assign(reqKey, ack);
committedReply[reqKey] = ack;
replyQuorum.erase(reqKey);
uncommittedReply.erase(uncommittedReply.begin());
delete ack;
}
// LOG(INFO) << "reqId=" << committedAck->reqid()
// << "\t type=" << committedAck->replytype();
// replyNum++;
// if (replyNum == 1) {
// startTime = GetMicrosecondTimestamp();
// } else if (replyNum % 100000 == 0) {
// endTime = GetMicrosecondTimestamp();
// float rate = 100000 / ((endTime - startTime) * 1e-6);
// LOG(INFO) << "id=" << id << "\t"
// << "replyNum=" << replyNum << "\t"
// << "rate = " << rate << "\t"
// << "uncommittedLen = " << uncommittedReply.size();
// startTime = endTime;
// }
} else if (committedAck != NULL && committedAck->replytype() == 0) {
// record in uncommittedRequests
if (committedAck->replicaid() == currentView % replicaNum_) {
// This is a leader's reply, cache it
if (uncommittedReply.find(committedAck->logid()) ==
uncommittedReply.end()) {
uncommittedReply[committedAck->logid()] = committedAck;
}
} else {
delete committedAck;
}
}
}
}
}
}
/**
 * Decide whether the replies collected for one request form a commit quorum.
 *
 * @param replicaSyncedPoint Per replica, CONCAT_UINT32(view, max synced log
 *        id) as last reported by that replica.
 * @param quorum Replies of the request, keyed by replica id. They are expected
 *        to belong to the same view.
 * @return NULL when the quorum is empty or holds no reply from the leader of
 *         that view. Otherwise a heap-allocated copy of the leader's reply
 *         (owned by the caller) whose replytype is
 *           - FAST_REPLY when at least fastQuorum_ replies are fast- or
 *             slow-satisfied,
 *           - SLOW_REPLY when at least f_ + 1 replies are slow-satisfied,
 *           - 0 when the request is not committed yet.
 *
 * A reply is fast-satisfied when it is a FAST_REPLY with the leader's view and
 * hash; it is slow-satisfied when its replica's sync point is in the leader's
 * view and has reached the leader's log id.
 */
Reply* Proxy::isQuorumReady(std::vector<uint64_t>& replicaSyncedPoint,
                            std::map<uint32_t, Reply>& quorum) {
  if (quorum.empty()) {
    return NULL;
  }
  // These replies are of the same view for sure (we have previously forbidden
  // inconsistency)
  const uint32_t view = quorum.begin()->second.view();
  const uint32_t leaderId = view % replicaNum_;
  const auto leaderIter = quorum.find(leaderId);
  if (leaderIter == quorum.end()) {
    return NULL;
  }
  const Reply& leaderReply = leaderIter->second;

  uint32_t fastOrSlowReplyNum = 0;  // slowReply can be used as fastReply
  uint32_t slowReplyNum = 0;        // But fastReply cannot be used as slowReply
  for (const auto& kv : quorum) {
    const bool fastSatisfied =
        (kv.second.replytype() == MessageType::FAST_REPLY &&
         kv.second.view() == leaderReply.view() &&
         kv.second.hash() == leaderReply.hash());
    // The replica id comes from the wire; guard the index.
    const bool slowSatisfied =
        (kv.first < replicaSyncedPoint.size() &&
         HIGH_32BIT(replicaSyncedPoint[kv.first]) == leaderReply.view() &&
         LOW_32BIT(replicaSyncedPoint[kv.first]) >= leaderReply.logid());
    if (fastSatisfied || slowSatisfied) {
      fastOrSlowReplyNum++;
    }
    if (slowSatisfied) {
      slowReplyNum++;
    }
  }

  Reply* committedReply = new Reply(leaderReply);
  if (fastOrSlowReplyNum >= (uint32_t)fastQuorum_) {
    // Fast Commit
    committedReply->set_replytype(MessageType::FAST_REPLY);
  } else if (slowReplyNum >= (uint32_t)f_ + 1) {
    // Slow Commit: Together with the leader reply, it forms the simple quorum
    // of f+1
    committedReply->set_replytype(MessageType::SLOW_REPLY);
  } else {
    // Uncommitted: the caller recognizes this case by replytype() == 0, so the
    // leader's own reply type must not leak through.
    committedReply->set_replytype(0);
  }
  return committedReply;
}
void Proxy::ForwardRequestsTd(const int id) {
// ConcurrentMap& committedReply = committedReplyMap_[id];
ConcurrentMap& logs = logMap_[id];
char buffer[UDP_BUFFER_SIZE];
MessageHeader* msgHdr = (MessageHeader*)(void*)buffer;
int sz = -1;
struct sockaddr_in receiverAddr;
socklen_t len = sizeof(receiverAddr);
Request request;
uint32_t forwardCnt = 0;
uint64_t startTime, endTime;
while (running_) {
if ((sz = recvfrom(requestReceiveFds_[id], buffer, UDP_BUFFER_SIZE, 0,
(struct sockaddr*)&receiverAddr, &len)) > 0) {
if ((uint32_t)sz < sizeof(MessageHeader) ||
(uint32_t)sz < msgHdr->msgLen + sizeof(MessageHeader)) {
continue;
}
if (msgHdr->msgType == MessageType::CLIENT_REQUEST &&
request.ParseFromArray(buffer + sizeof(MessageHeader),
msgHdr->msgLen)) {
uint64_t reqKey = CONCAT_UINT32(request.clientid(), request.reqid());
request.set_bound(latencyBound_);
request.set_proxyid(proxyIds_[id]);
request.set_sendtime(GetMicrosecondTimestamp());
std::string msg = request.SerializeAsString();
msgHdr->msgType = MessageType::CLIENT_REQUEST;
msgHdr->msgLen = msg.length();
memcpy(buffer + sizeof(MessageHeader), msg.c_str(), msg.length());
if (clientAddrs_.get(request.clientid()) == NULL) {
struct sockaddr_in* addr = new sockaddr_in(receiverAddr);
clientAddrs_.assign(request.clientid(), addr);
}
// Send to every replica
for (int i = 0; i < replicaNum_; i++) {
// uint32_t generateProxyId = (uint32_t)(proxyIds_[id] >> 32u);
// struct sockaddr_in* replicaAddr =
// replicaAddrs_[i][generateProxyId % replicaAddrs_[i].size()];
struct sockaddr_in* replicaAddr =
replicaAddrs_[i][proxyIds_[id] % replicaAddrs_[i].size()];
sendto(forwardFds_[id], buffer,
msgHdr->msgLen + sizeof(MessageHeader), 0,
(struct sockaddr*)replicaAddr, sizeof(sockaddr_in));
}
// Log* litem = new Log();
// litem->clientId_ = request.clientid();
// litem->reqId_ = request.reqid();
// litem->clientTime_ = request.clienttime();
// litem->proxyTime_ = request.sendtime();
// litem->deadline_ = request.sendtime() + request.bound();
// logs.assign(reqKey, litem);
// litem->proxyEndProcessTime_ = GetMicrosecondTimestamp();
// LOG(INFO) << "id=" << id << "\t"
// << "cid=" << request.clientid() << "\t" << request.reqid();
// forwardCnt++;
// if (forwardCnt == 1) {
// startTime = GetMicrosecondTimestamp();
// } else if (forwardCnt % 100 == 0) {
// endTime = GetMicrosecondTimestamp();
// float rate = 100 / ((endTime - startTime) * 1e-6);
// LOG(INFO) << "Forward-Id=" << id << "\t"
// << "count =" << forwardCnt << "\t"
// << "rate=" << rate << " req/sec"
// << "\t"
// << "req is <" << request.clientid() << ","
// << request.reqid() << ">";
// startTime = endTime;
// }
}
}
}
}
void Proxy::CreateContext() {
running_ = true;
int shardNum = proxyConfig_.proxyShardNum;
uint32_t proxyId = proxyConfig_.proxyId;
forwardFds_.resize(shardNum, -1);
requestReceiveFds_.resize(shardNum, -1);
replyFds_.resize(shardNum, -1);
proxyIds_.resize(shardNum, proxyId);
latencyBound_ = proxyConfig_.replicaInitialOwd;
maxOWD_ = proxyConfig_.proxyMaxOwd;
for (int i = 0; i < shardNum; i++) {
forwardFds_[i] = CreateSocketFd(proxyConfig_.proxyIp,
proxyConfig_.proxyReplyPortBase + i);
requestReceiveFds_[i] = CreateSocketFd(
proxyConfig_.proxyIp, proxyConfig_.proxyRequestPortBase + i);
replyFds_[i] = CreateSocketFd("", -1);
proxyIds_[i] = ((proxyIds_[i] << 32) | (uint32_t)i);
}
committedReplyMap_.resize(shardNum);
logMap_.resize(shardNum);
replicaNum_ = proxyConfig_.replicaIps.size();
assert(replicaNum_ % 2 == 1);
f_ = replicaNum_ / 2;
replicaSyncedPoints_.resize(shardNum);
for (int i = 0; i < shardNum; i++) {
replicaSyncedPoints_[i].assign(replicaNum_, CONCURRENT_MAP_START_INDEX);
}
fastQuorum_ = (f_ % 2 == 1) ? (f_ + (f_ + 1) / 2 + 1) : (f_ + f_ / 2 + 1);
replicaAddrs_.resize(replicaNum_);
for (int i = 0; i < replicaNum_; i++) {
std::string replicaIP = proxyConfig_.replicaIps[i];
for (int j = 0; j < proxyConfig_.replicaReceiverShards; j++) {
struct sockaddr_in* addr = new sockaddr_in();
bzero(addr, sizeof(struct sockaddr_in));
addr->sin_family = AF_INET;
addr->sin_port = htons(proxyConfig_.replicaReceiverPort + j);
addr->sin_addr.s_addr = inet_addr(replicaIP.c_str());
replicaAddrs_[i].push_back(addr);
}
}
}
} // namespace nezha
================================================
FILE: proxy/proxy.h
================================================
#include
#include
#include "lib/utils.h"
#include "proto/nezha_proto.pb.h"
#include "proxy_config.h"
namespace nezha {
using namespace nezha::proto;
/**
* Refer to proxy_run.cc, the runnable program only needs to instantiate a
* Proxy object with a configuration file. Then it calls Run() method to run
* and calls Terminate() method to stop
*/
class Proxy {
private:
/** All the configuration parameters for this proxy are included in
* proxyConfig_*/
ProxyConfig proxyConfig_;
/** Each thread is given a unique name (key) */
std::map threadPool_;
/** Launch all the threads, these threads are mainly categorized into three
* classes:
* (1) ForwardRequestsTd, which receives client requests and
* multicast to replicas;
* (2) CheckQuorumTd, which receives replica replies and
* check whether the corresponding request has been committed (use
* isQuorumReady), if so, send a reply to the client;
* (3) CalculateLatencyBoundTd, which caluldates the latency bound
*
* (1) and (2) handles most workload and is parallelized, and the parallism
* degree is decided by the parameter defined in proxyConfig_ (i.e.,
* shard-num).
*
* (1) and (2) are paired, i.e., we launch equal number of
* ForwardRequestsTds and CheckQuorumTds. The requests multicast by
* ForwardRequestsTd-i will be tracked and quorum-checked by CheckQuorumTd-i
*/
void LaunchThreads();
void ForwardRequestsTd(const int id = -1);
void CheckQuorumTd(const int id = -1);
void CalculateLatencyBoundTd();
/** LogTd is just used to collect some performance stats. It is not necessary
* in the release version */
void LogTd();
/** Create/Initialize all the necessary variables */
void CreateContext();
/** Check whether a quorum has been formed for the request to be committed.
* If the request has been committed, it returns the reply message, which will
* be delievered to the client; otherwise, it returns NULL
*/
Reply* isQuorumReady(std::vector& repliedSyncPoint,
std::map& quorum);
/** Tools function: given ip and port, create a socket fd. If ip is not empty,
* the socket will be binded to the */
int CreateSocketFd(const std::string& ip = "", const int port = -1);
/** Flag to Run/Terminate threads */
std::atomic running_;
/** Each CheckQuorumTd thread uses the socket fd in replyFds_, based on its
* id, to send reply to clients
*/
std::vector replyFds_;
/** Each ForwardRequestsTd thread uses the socket fd in forwardFds_, based on
* its id, to multicast requests to replicas
*/
std::vector forwardFds_;
/** Each ForwardRequestsTd thread uses the socket fd in requestReceiveFds_,
* based on its id, to receive requests from clients
*/
std::vector requestReceiveFds_;
/** We create a unique id for each ForwardRequestsTd, so that replicas can
* derive which CheckQuorumTd should receive the reply messages */
std::vector proxyIds_;
/** CalculateLatencyBoundTd updates latencyBound_ and concurrently
* ForwardRequestsTds read it and included in request messages */
std::atomic latencyBound_;
/** Upper bound of the estimated latencyBound_, used to clamp the bound,
* details in ``Adapative Latency Bound`` para of Sec 4 of our paper */
uint32_t maxOWD_;
/** CheckQuorumTd threads pass samples to
* CalculateLatencyBoundTd */
ConcurrentQueue> owdQu_; //
int replicaNum_;
int f_; /** replicaNum_ =2f_+1 */
int fastQuorum_; /** fastQuorum_ = f_+ceiling(f_/2)+1 */
/** Just used to collect logs, can be deleted in the release version*/
struct Log {
uint32_t replicaId_;
uint32_t clientId_;
uint32_t reqId_;
uint64_t clientTime_;
uint64_t proxyTime_;
uint64_t proxyEndProcessTime_;
uint64_t recvTime_;
uint64_t deadline_;
uint64_t fastReplyTime_;
uint64_t slowReplyTime_;
uint64_t proxyRecvTime_;
uint32_t commitType_;
Log(uint32_t rid = 0, uint32_t cId = 0, uint32_t reqId = 0,
uint64_t ctime = 0, uint64_t ptime = 0, uint64_t pedtime = 0,
uint64_t rtime = 0, uint64_t ddl = 0, uint64_t fttime = 0,
uint64_t swtime = 0, uint64_t prcvt = 0, uint32_t cmtt = 0)
: replicaId_(rid),
clientId_(cId),
reqId_(reqId),
clientTime_(ctime),
proxyTime_(ptime),
recvTime_(rtime),
deadline_(ddl),
fastReplyTime_(fttime),
slowReplyTime_(swtime),
proxyRecvTime_(prcvt),
commitType_(cmtt) {}
std::string ToString() {
return std::to_string(replicaId_) + "," + std::to_string(clientId_) +
"," + std::to_string(reqId_) + "," + std::to_string(clientTime_) +
"," + std::to_string(proxyTime_) + "," +
std::to_string(proxyEndProcessTime_) + "," +
std::to_string(recvTime_) + "," + std::to_string(deadline_) + "," +
std::to_string(fastReplyTime_) + "," +
std::to_string(slowReplyTime_) + "," +
std::to_string(proxyRecvTime_) + "," + std::to_string(commitType_);
}
};
ConcurrentQueue logQu_;
/** Vector of replica's addresses
* Since replicas can have multiple receiver shards, we use a two-dimensional
* vector.
*
* replicaAddrs_[i] records the addresses of replica-i, which can receive
* requests replicaAddrs_[i][j] is the address of the jth receiver shard of
* replica-i.
*/
std::vector> replicaAddrs_;
/**
* After ForwardRequestTd receives client request, it records the address of
* the client, so that later the correspoinding CheckQuorumTd can know which
* address should recieve the commit reply.
*/
ConcurrentMap clientAddrs_;
/**
* As an optimization, proxies also mantain a cache to record the commit reply
* messages for those already-commited requests. In this way, when clients
* retry the request which has already been committed, the proxy can direct
* resend the reply, instead of adding additional burden to the replicas
*/
std::vector> committedReplyMap_;
std::vector> sendTimeMap_;
std::vector> logMap_;
public:
/** Proxy accept a config file, which contains all the necessary information
* to instantiate the object, then it can call Run method
* */
Proxy(const std::string& configFile = "../configs/nezha-proxy-config.yaml");
~Proxy();
void Run();
void Terminate();
/** Tentative */
std::vector> replicaSyncedPoints_;
};
} // namespace nezha
================================================
FILE: proxy/proxy_config.h
================================================
#include
#include
#include
#include
#include
struct ProxyConfig {
int proxyId;
std::string proxyIp;
int proxyShardNum;
uint32_t proxyMaxOwd;
int proxyRequestPortBase;
int proxyReplyPortBase;
std::vector replicaIps;
uint32_t replicaInitialOwd;
int replicaReceiverPort;
int replicaReceiverShards;
// Parses yaml file configFilename and fills in fields of ProxyConfig
// accordingly. Returns an error message or "" if there are no errors.
std::string parseConfig(std::string configFilename) {
YAML::Node config;
try {
config = YAML::LoadFile(configFilename);
} catch (const YAML::BadFile& e) {
return "Error loading config file:" + e.msg + ".";
}
LOG(INFO) << "Using config:\n " << config;
std::string key; // Keep track of current key for better error messages
try {
key = "replica-ips";
for (uint32_t i = 0; i < config[key].size(); i++) {
replicaIps.push_back(config[key][i].as());
}
key = "replica-receiver-shards";
replicaReceiverShards = config[key].as();
key = "replica-initial-owd";
replicaInitialOwd = config[key].as();
key = "replica-receiver-port";
replicaReceiverPort = config[key].as();
key = "proxy-id";
proxyId = config[key].as();
key = "proxy-ip";
proxyIp = config[key].as();
key = "proxy-shard-num";
proxyShardNum = config[key].as();
key = "proxy-max-owd";
proxyMaxOwd = config[key].as();
key = "proxy-request-port-base";
proxyRequestPortBase = config[key].as();
key = "proxy-reply-port-base";
proxyReplyPortBase = config[key].as();
return "";
} catch (const YAML::BadConversion& e) {
if (config[key]) {
return "Error parsing config field " + key + ": " + e.msg + ".";
} else {
return "Error parsing config field " + key + ": key not found.";
}
} catch (const std::exception& e) {
return "Error parsing config field " + key + ": " + e.what() + ".";
}
}
};
================================================
FILE: proxy/proxy_run.cc
================================================
#include "proxy/proxy.h"
// --config: path of the YAML file handed to the nezha::Proxy constructor.
DEFINE_string(config, "nezhav2/config/nezha-proxy-config-0.yaml", "The config file for the proxy");
// The single proxy instance; it is global so that the SIGINT handler below can
// reach it. It stays NULL until main() has constructed the proxy.
nezha::Proxy* proxy = NULL;
/**
 * SIGINT handler: ask the proxy to stop its worker threads.
 *
 * @param para Signal number (unused).
 *
 * The signal can arrive while no proxy object exists, so the global pointer is
 * checked before it is used.
 */
void Terminate(int para) {
  (void)para;
  if (proxy != NULL) {
    proxy->Terminate();
  }
}
/**
 * Entry point of the proxy: parse the command line, set up logging to stderr,
 * build the proxy from --config and run it until SIGINT asks it to stop.
 *
 * The SIGINT handler is installed only while a valid proxy object exists:
 * after construction and until just before destruction. Outside that window
 * the default action (terminating the process) applies.
 */
int main(int argc, char* argv[]) {
  gflags::ParseCommandLineFlags(&argc, &argv, true);
  google::InitGoogleLogging(argv[0]);
  FLAGS_logtostderr = 1;

  proxy = new nezha::Proxy(FLAGS_config);
  signal(SIGINT, Terminate);
  proxy->Run();

  // Detach the handler before the object it uses goes away.
  signal(SIGINT, SIG_DFL);
  delete proxy;
  proxy = NULL;
  return 0;
}
================================================
FILE: replica/BUILD
================================================
# Header-only library for replica_config.h (the replica's YAML-backed
# configuration).
cc_library(
name = "replica_config",
hdrs = ["replica_config.h"],
deps = [
"@com_github_jbeder_yaml_cpp//:yaml-cpp",
],
)
# The nezha::Replica class (replica.h / replica.cc) and its third-party
# dependencies (junction, libev, boost uuid).
cc_library(
name = "replica_class",
srcs = ["replica.cc"],
hdrs = [
"replica.h",
],
deps = [
":replica_config",
"//proto:nezha_cc_proto",
"//lib:utils",
"@com_github_preshing_junction//:libjunction",
"@com_github_enki_libev//:libev",
"@boost//:uuid",
],
)
# Replica executable built from replica_run.cc.
cc_binary(
name = "nezha_replica",
srcs = ["replica_run.cc"],
deps = [
":replica_class",
],
)
================================================
FILE: replica/replica.cc
================================================
#include "replica/replica.h"
namespace nezha {
// Debug-only assertion macro. Uncomment the define below to enable it: with
// GJK_DEBUG defined ASSERT(x) forwards to assert(x); otherwise it expands to
// an empty block, so x is not evaluated at all.
// #define GJK_DEBUG
#ifdef GJK_DEBUG
#define ASSERT(x) assert(x)
#else
#define ASSERT(x) \
{}
#endif
// Construct a replica from the YAML file at configFile.
//
// Steps, in order: allocate and initialize the per-proxy replied-sync-point
// array, create the head entries of the synced and unsynced log lists, load
// the configuration (a parse error is logged and exits the process with code
// 1), pick the initial status from isRecovering (RECOVERING or NORMAL) and
// finally build the runtime context with CreateContext().
Replica::Replica(const std::string& configFile, bool isRecovering)
: viewId_(0), lastNormalView_(0) {
// One slot per proxy, each starting just below the first valid log index.
repliedSyncPoint_ = new std::atomic[maxProxyNum_];
for (uint32_t i = 0; i < maxProxyNum_; i++) {
repliedSyncPoint_[i] = CONCURRENT_MAP_START_INDEX - 1;
}
LOG(INFO) << maxProxyNum_ << " proxy replied sync point has been initialized";
lastAskMissedIndexTime_ = 0;
lastAskMissedRequestTime_ = 0;
// Head entries of the two log lists: log id one below the first valid index,
// zero deadline and zero request key.
syncedLogEntryHead_ = new LogEntry();
syncedLogEntryHead_->logId = CONCURRENT_MAP_START_INDEX - 1;
syncedLogEntryHead_->body.deadline = 0;
syncedLogEntryHead_->body.reqKey = 0;
unSyncedLogEntryHead_ = new LogEntry();
unSyncedLogEntryHead_->logId = CONCURRENT_MAP_START_INDEX - 1;
unSyncedLogEntryHead_->body.deadline = 0;
unSyncedLogEntryHead_->body.reqKey = 0;
// Load Config
std::string error = replicaConfig_.parseConfig(configFile);
if (error != "") {
LOG(ERROR) << "Error loading replica config. " << error << " Exiting";
exit(1);
}
if (isRecovering) {
status_ = ReplicaStatus::RECOVERING;
LOG(INFO) << "Recovering ...";
} else {
status_ = ReplicaStatus::NORMAL;
}
LOG(INFO) << "Replica Status " << status_;
CreateContext();
LOG(INFO) << "viewId_=" << viewId_ << "\treplicaId=" << replicaId_
<< "\treplicaNum=" << replicaNum_ << "\tkeyNum=" << keyNum_;
}
/**
 * Mark the replica as TERMINATED and destroy the thread objects kept in
 * threadPool_.
 */
Replica::~Replica() {
  status_ = ReplicaStatus::TERMINATED;
  for (auto worker = threadPool_.begin(); worker != threadPool_.end();
       ++worker) {
    delete worker->second;
    VLOG(2) << "Deleted\t" << worker->first;
  }
  // TODO: A more elegant way is to reclaim or dump all logs before exit
  // For now, it is fine because all the memory is freed after the process is
  // terminated
}
/**
 * Run the replica on the calling (master) thread: register the master
 * context, arm the timers that match the current status, launch the worker
 * threads, run the master event loop until it is broken and finally join all
 * worker threads.
 */
void Replica::Run() {
  // Master thread run
  masterContext_->Register(endPointType_);
  auto& masterEndpoint = masterContext_->endPoint_;

  if (status_ == ReplicaStatus::RECOVERING) {
    masterEndpoint->RegisterTimer(crashVectorRequestTimer_);
  } else if (status_ == ReplicaStatus::NORMAL) {
    // Only followers watch the leader's heartbeat.
    if (!AmLeader()) {
      masterEndpoint->RegisterTimer(heartbeatCheckTimer_);
    }
    masterEndpoint->RegisterTimer(periodicSyncTimer_);
  }

  // Launch worker threads (based on config)
  LaunchThreads();

  masterEndpoint->LoopRun();
  VLOG(2) << "Break LoopRun";

  // Wait until all threads return
  for (auto worker = threadPool_.begin(); worker != threadPool_.end();
       ++worker) {
    VLOG(2) << "Joining " << worker->first;
    worker->second->join();
    VLOG(2) << "Join Complete \t" << worker->first;
  }
}
/**
 * Stop the replica: set the status to TERMINATED and notify all waiters on
 * waitVar_, at least once and then repeatedly for as long as
 * activeWorkerNum_ is still above zero.
 */
void Replica::Terminate() {
  for (;;) {
    status_ = ReplicaStatus::TERMINATED;
    waitVar_.notify_all();
    if (!(activeWorkerNum_ > 0)) {
      break;
    }
  }
}
// Builds the replica's runtime state from replicaConfig_: log-list cursors,
// the receiving endpoints together with their ReceiverContext objects, the
// sending endpoints, peer addresses, record/reply queues, crash-vector slots,
// timers and the garbage-collection markers.
// The timers created in the second half of this function are only
// constructed here; nothing in this function registers them on an endpoint.
void Replica::CreateContext() {
endPointType_ = replicaConfig_.endpointType;
replicaId_ = replicaConfig_.replicaId;
replicaNum_ = replicaConfig_.replicaIps.size();
keyNum_ = replicaConfig_.keyNum;
// One <deadline, reqKey> pair per key, all starting at <0, 0>
lastReleasedEntryByKeys_.assign(keyNum_, {0ul, 0ul});
// Since ConcurrentMap reserves 0 and 1, log-id starts from 2
// So these variables are initialized as 2-1=1
maxSyncedLogEntry_ = syncedLogEntryHead_;
maxUnSyncedLogEntry_ = unSyncedLogEntryHead_;
minUnSyncedLogEntry_ = unSyncedLogEntryHead_;
// Per-key cursors start empty (NULL means "no entry for this key yet")
maxSyncedLogEntryByKey_.assign(keyNum_, NULL);
maxUnSyncedLogEntryByKey_.assign(keyNum_, NULL);
minUnSyncedLogEntryByKey_.assign(keyNum_, NULL);
committedLogId_ = CONCURRENT_MAP_START_INDEX - 1;
toCommitLogId_ = CONCURRENT_MAP_START_INDEX - 1;
// Create master endpoints and context
std::string ip = replicaConfig_.replicaIps[replicaId_.load()];
int port = replicaConfig_.masterPort;
int monitorPeriodMs = replicaConfig_.monitorPeriodMs;
Endpoint* masterEP = CreateEndpoint(endPointType_, ip, port, true);
auto masterCallBack = [](MessageHeader* msgHeader, char* msgBuffer,
Address* sender, void* ctx) {
((Replica*)ctx)->ReceiveMasterMessage(msgHeader, msgBuffer);
};
// Register a timer to monitor replica status
Timer* masterMonitorTimer = new Timer(
[](void* ctx, void* receiverEP) {
if (((Replica*)ctx)->status_ == ReplicaStatus::TERMINATED) {
// Master thread will only break its loop when status comes to
// TERMINATED
((Endpoint*)receiverEP)->LoopBreak();
}
},
monitorPeriodMs, this);
masterContext_ =
new ReceiverContext(masterEP, this, masterCallBack, masterMonitorTimer);
LOG(INFO) << "Master Created";
// Create request-receiver endpoints and context
// Shard i listens on receiverPort + i
requestContext_.resize(replicaConfig_.receiverShards);
for (int i = 0; i < replicaConfig_.receiverShards; i++) {
int port = replicaConfig_.receiverPort + i;
Endpoint* requestEP = CreateEndpoint(endPointType_, ip, port);
// Register a request handler to this endpoint
auto requestHandlerFunc = [](MessageHeader* msgHeader, char* msgBuffer,
Address* sender, void* ctx) {
((Replica*)ctx)->ReceiveClientRequest(msgHeader, msgBuffer, sender);
};
// Register a timer to monitor replica status
// Unlike the master, this loop is broken on any status other than NORMAL
Timer* requestEPMonitorTimer = new Timer(
[](void* ctx, void* receiverEP) {
if (((Replica*)ctx)->status_ != ReplicaStatus::NORMAL) {
((Endpoint*)receiverEP)->LoopBreak();
}
},
monitorPeriodMs, this);
requestContext_[i] = new ReceiverContext(
requestEP, this, requestHandlerFunc, requestEPMonitorTimer);
}
LOG(INFO) << "requestContext_ Created";
// (Leader) Use these endpoints to broadcast indices to followers
// NOTE(review): these are always UDPSocketEndpoint, whereas every other
// endpoint here goes through CreateEndpoint(endPointType_) -- confirm this is
// intentional.
for (int i = 0; i < replicaConfig_.indexSyncShards; i++) {
indexSender_.push_back(new UDPSocketEndpoint());
}
indexAcker_ = CreateEndpoint(endPointType_);
indexRequester_ = CreateEndpoint(endPointType_);
reqRequester_ = CreateEndpoint(endPointType_);
// Destination addresses of every replica (this one included), one entry per
// replica for each of the four service ports
for (uint32_t i = 0; i < replicaNum_; i++) {
std::string ip = replicaConfig_.replicaIps[i];
int indexPort = replicaConfig_.indexSyncPort;
indexReceiver_.push_back(new Address(ip, indexPort));
int indexAskPort = replicaConfig_.indexAskPort;
indexAskReceiver_.push_back(new Address(ip, indexAskPort));
int requestAskPort = replicaConfig_.requestAskPort;
requestAskReceiver_.push_back(new Address(ip, requestAskPort));
int masterPort = replicaConfig_.masterPort;
masterReceiver_.push_back(new Address(ip, masterPort));
}
// (Followers:) Create index-sync endpoint to receive indices
port = replicaConfig_.indexSyncPort;
Endpoint* idxSyncEP = CreateEndpoint(endPointType_, ip, port);
// Register a msg handler to this endpoint to handle index sync messages
auto idxHandleFunc = [](MessageHeader* msgHeader, char* msgBuffer,
Address* sender, void* ctx) {
((Replica*)ctx)->ReceiveIndexSyncMessage(msgHeader, msgBuffer);
};
// Register a timer to monitor replica status
Timer* idxSyncMonitorTimer = new Timer(
[](void* ctx, void* receiverEP) {
if (((Replica*)ctx)->status_ != ReplicaStatus::NORMAL) {
((Endpoint*)receiverEP)->LoopBreak();
}
},
monitorPeriodMs, this);
indexSyncContext_ =
new ReceiverContext(idxSyncEP, this, idxHandleFunc, idxSyncMonitorTimer);
LOG(INFO) << "indexSyncContext_ Created";
// Create an endpoint to handle others' requests for missed index
port = replicaConfig_.indexAskPort;
Endpoint* missedIdxEP = CreateEndpoint(endPointType_, ip, port);
// Register message handler
auto missedIdxHandleFunc = [](MessageHeader* msgHeader, char* msgBuffer,
Address* sender, void* ctx) {
((Replica*)ctx)->ReceiveAskMissedIdx(msgHeader, msgBuffer);
};
// Register a timer to monitor replica status
Timer* missedIdxAckMonitorTimer = new Timer(
[](void* ctx, void* receiverEP) {
if (((Replica*)ctx)->status_ != ReplicaStatus::NORMAL) {
((Endpoint*)receiverEP)->LoopBreak();
}
},
monitorPeriodMs, this);
missedIndexAckContext_ = new ReceiverContext(
missedIdxEP, this, missedIdxHandleFunc, missedIdxAckMonitorTimer);
LOG(INFO) << "missedIndexAckContext_ Created";
// Create an endpoint to handle others' requests for missed req
port = replicaConfig_.requestAskPort;
Endpoint* missedReqAckEP = CreateEndpoint(endPointType_, ip, port);
// Register message handler
auto missedReqAckHandleFunc = [](MessageHeader* msgHeader, char* msgBuffer,
Address* sender, void* ctx) {
((Replica*)ctx)->ReceiveAskMissedReq(msgHeader, msgBuffer);
};
// Register a timer to monitor replica status
Timer* missedReqAckMonitorTimer = new Timer(
[](void* ctx, void* receiverEP) {
if (((Replica*)ctx)->status_ != ReplicaStatus::NORMAL) {
((Endpoint*)receiverEP)->LoopBreak();
}
},
monitorPeriodMs, this);
missedReqAckContext_ = new ReceiverContext(
missedReqAckEP, this, missedReqAckHandleFunc, missedReqAckMonitorTimer);
LOG(INFO) << "missedReqAckContext_ Created";
// Create Record Qus and Maps
recordMap_.resize(replicaConfig_.recordShards);
recordQu_.resize(replicaConfig_.recordShards);
// Create track entry for trackThread
// Every track shard starts from the current tail of the synced log list
trackedEntry_.assign(replicaConfig_.trackShards, maxSyncedLogEntry_);
// Create reply endpoints
int replyShardNum = replicaConfig_.replyShards;
for (int i = 0; i < replyShardNum; i++) {
fastReplySender_.push_back(CreateEndpoint(endPointType_));
slowReplySender_.push_back(CreateEndpoint(endPointType_));
}
// Create reply queues (one queue per fast/slow reply thread)
fastReplyQu_.resize(replyShardNum);
slowReplyQu_.resize(replyShardNum);
// Create CrashVector Context
// The initial crash vector is all zeros (one counter per replica) and is
// stored under its own version_ in crashVector_
std::vector cvVec(replicaNum_, 0);
CrashVectorStruct* cv = new CrashVectorStruct(cvVec, 2);
crashVector_.assign(cv->version_, cv);
/** The related threads using crash vectors are:
* (1) master (1 thread)
* (2) FastReplyThread(s) (replyShardNum threads) */
// NOTE(review): LaunchThreads passes cvId = i + replyShardNum + 1 to
// IndexSendThread, which is outside [0, crashVectorVecSize_) -- confirm that
// thread never indexes crashVectorInUse_ with its cvId.
crashVectorVecSize_ = 1 + replyShardNum;
crashVectorInUse_ = new std::atomic[crashVectorVecSize_];
for (uint32_t i = 0; i < crashVectorVecSize_; i++) {
crashVectorInUse_[i] = cv;
}
// Create other useful timers
heartbeatCheckTimer_ = new Timer(
[](void* ctx, void* receiverEP) {
// Followers use this timer to check leader's heartbeat
((Replica*)ctx)->CheckHeartBeat();
},
monitorPeriodMs, this);
indexAskTimer_ = new Timer(
[](void* ctx, void* receiverEP) { ((Replica*)ctx)->AskMissedIndex(); },
replicaConfig_.indexAskPeriodMs, this);
roundRobinIndexAskIdx_ = 0;
// Initially, no missed indices, so we make first > second
missedIndices_ = {1, 0};
requestAskTimer_ = new Timer(
[](void* ctx, void* receiverEP) { ((Replica*)ctx)->AskMissedRequest(); },
replicaConfig_.requestAskPeriodMs, this);
roundRobinRequestAskIdx_ = 0;
missedReqKeys_.clear();
viewChangeTimer_ = new Timer(
[](void* ctx, void* receiverEP) {
((Replica*)ctx)->BroadcastViewChange();
},
replicaConfig_.viewChangePeriodMs, this);
roundRobinProcessIdx_ = 0;
periodicSyncTimer_ = new Timer(
[](void* ctx, void* receiverEP) {
((Replica*)ctx)->SendSyncStatusReport();
},
replicaConfig_.syncReportPeriodMs, this);
// Batch sizes used by the state-transfer / index-sync code paths
requestTrasnferBatch_ = replicaConfig_.requestTransferBatch;
indexTransferBatch_ = replicaConfig_.indexTransferBatch;
requestKeyTransferBatch_ = replicaConfig_.requestKeyTransferBatch;
stateTransferTimer_ = new Timer(
[](void* ctx, void* receiverEP) {
((Replica*)ctx)->SendStateTransferRequest();
},
replicaConfig_.stateTransferPeriodMs, this);
stateTransferTimeout_ = replicaConfig_.stateTransferTimeoutMs;
crashVectorRequestTimer_ = new Timer(
[](void* ctx, void* receiverEP) {
((Replica*)ctx)->BroadcastCrashVectorRequest();
},
replicaConfig_.crashVectorRequestPeriodMs, this);
recoveryRequestTimer_ = new Timer(
[](void* ctx, void* receiverEP) {
((Replica*)ctx)->BroadcastRecoveryRequest();
},
replicaConfig_.recoveryRequestPeriodMs, this);
// Parameters of the one-way-delay estimator (see OWDCalcThread)
movingPercentile_ = replicaConfig_.movingPercentile;
slidingWindowLen_ = replicaConfig_.owdEstimationWindow;
// Signal variable for garbage collection (of followers)
reclaimTimeout_ = replicaConfig_.reclaimTimeoutMs;
// replyShardNum + 1 slots; FastReplyThread writes the slot matching its id
safeToClearUnSyncedLogId_ = new std::atomic[replyShardNum + 1];
safeToClearLateBufferLogId_ = CONCURRENT_MAP_START_INDEX - 1;
for (int i = 0; i <= replyShardNum; i++) {
safeToClearUnSyncedLogId_[i] = CONCURRENT_MAP_START_INDEX - 1;
}
prepareToClearLateBufferLogId_ = CONCURRENT_MAP_START_INDEX - 1;
prepareToClearUnSyncedLogId_ = CONCURRENT_MAP_START_INDEX - 1;
}
void Replica::ResetContext() {
// Clear queues
for (uint32_t i = 0; i < fastReplyQu_.size(); i++) {
LogEntry* entry;
while (fastReplyQu_[i].try_dequeue(entry)) {
}
while (slowReplyQu_[i].try_dequeue(entry)) {
}
// Don't worry about memory leakage, the memory pointed by these in-queue
// pointers have already been cleaned or will be cleaned according to their
// Conucurrent maps
}
LogEntry* entry;
while (processQu_.try_dequeue(entry)) {
delete entry;
}
for (uint32_t i = 0; i < recordQu_.size(); i++) {
RequestBody* rb;
while (recordQu_[i].try_dequeue(rb)) {
delete rb;
}
}
// TODO: Clear LateBuffer
// Clear Early Buffer
while (earlyBuffer_.empty() == false) {
LogEntry* entry = earlyBuffer_.begin()->second;
delete entry;
earlyBuffer_.erase(earlyBuffer_.begin());
}
// Reset lastReleasedEntryByKeys_, no need to care about UnSyncedLogs, because
// they are all cleared
for (uint32_t key = 0; key < keyNum_; key++) {
if (maxSyncedLogEntryByKey_[key]) {
lastReleasedEntryByKeys_[key] = {
maxSyncedLogEntryByKey_[key]->body.deadline,
maxSyncedLogEntryByKey_[key]->body.reqKey};
} else {
lastReleasedEntryByKeys_[key] = {0ul, 0ul};
}
}
// Clear UnSyncedLogs
minUnSyncedLogEntry_ = unSyncedLogEntryHead_;
maxUnSyncedLogEntry_ = unSyncedLogEntryHead_;
minUnSyncedLogEntryByKey_.clear();
maxUnSyncedLogEntryByKey_.clear();
minUnSyncedLogEntryByKey_.assign(keyNum_, NULL);
maxUnSyncedLogEntryByKey_.assign(keyNum_, NULL);
// Reset Index-Sync related stuff
roundRobinIndexAskIdx_ = 0;
missedIndices_ = {1, 0};
roundRobinRequestAskIdx_ = 0;
missedReqKeys_.clear();
roundRobinProcessIdx_ = 0;
pendingIndexSync_.clear();
// Reset stateTransfer related stuff
stateTransferIndices_.clear();
viewChangeSet_.clear();
crashVectorReplySet_.clear();
recoveryReplySet_.clear();
syncStatusSet_.clear();
// Reset trackedEntry
trackedEntry_.assign(trackedEntry_.size(), maxSyncedLogEntry_);
// Reset OWD-Calc Related stuff
slidingWindow_.clear();
owdSampleNum_.clear();
// Reset Master's timers
// No need to worry about other timers: worker thread will unregister their
// timers and msg handlers during LoopBreak
masterContext_->endPoint_->UnRegisterAllTimers();
masterContext_->endPoint_->RegisterTimer(masterContext_->monitorTimer_);
if (!AmLeader()) {
// Start checking leader's heartbeat from now on
lastHeartBeatTime_ = GetMicrosecondTimestamp();
masterContext_->endPoint_->RegisterTimer(heartbeatCheckTimer_);
}
masterContext_->endPoint_->RegisterTimer(periodicSyncTimer_);
// Reset signal variable for garbage collection (of followers)
safeToClearLateBufferLogId_ = CONCURRENT_MAP_START_INDEX - 1;
for (uint32_t i = 0; i <= fastReplyQu_.size(); i++) {
// The number of such counters is number of FastReplyThread_ + 1 (IndexRecv)
safeToClearUnSyncedLogId_[i] = CONCURRENT_MAP_START_INDEX - 1;
}
prepareToClearLateBufferLogId_ = CONCURRENT_MAP_START_INDEX - 1;
prepareToClearUnSyncedLogId_ = CONCURRENT_MAP_START_INDEX - 1;
}
// Spawns every worker thread of the replica and stores each std::thread in
// threadPool_ under a descriptive name. totalWorkerNum_ is bumped once per
// spawned thread; activeWorkerNum_ is reset here and maintained by the
// workers themselves.
void Replica::LaunchThreads() {
  activeWorkerNum_ = 0;  // Dynamic variable, used as semaphore
  totalWorkerNum_ = 0;   // Static variable to count number of workers

  // Request receivers
  for (int shard = 0; shard < replicaConfig_.receiverShards; ++shard) {
    totalWorkerNum_++;
    std::thread* worker = new std::thread(&Replica::ReceiveThread, this, shard);
    const std::string name = "ReceiveThread-" + std::to_string(shard);
    threadPool_[name] = worker;
    LOG(INFO) << "Launched " << name << "\t" << worker->native_handle();
  }

  // Request recorders
  for (int shard = 0; shard < replicaConfig_.recordShards; ++shard) {
    totalWorkerNum_++;
    std::thread* worker = new std::thread(&Replica::RecordThread, this, shard);
    const std::string name = "RecordThread-" + std::to_string(shard);
    threadPool_[name] = worker;
    LOG(INFO) << "Launched " << name << "\t" << worker->native_handle();
  }

  // Request processors: exactly one is supported
  if (replicaConfig_.processShards != 1) {
    LOG(ERROR) << "ProcessThread parallelization is not supported. "
                  "replicaConfig_->processShards must be 1.";
    exit(1);
  }
  for (int shard = 0; shard < replicaConfig_.processShards; ++shard) {
    totalWorkerNum_++;
    std::thread* worker =
        new std::thread(&Replica::ProcessThread, this, shard);
    const std::string name = "ProcessThread-" + std::to_string(shard);
    threadPool_[name] = worker;
    LOG(INFO) << "Launched " << name << "\t" << worker->native_handle();
  }

  // Fast repliers: shard i is handed crash-vector slot i + 1
  const int replyShardNum = replicaConfig_.replyShards;
  for (int shard = 0; shard < replyShardNum; ++shard) {
    totalWorkerNum_++;
    std::thread* worker =
        new std::thread(&Replica::FastReplyThread, this, shard, shard + 1);
    const std::string name = "FastReplyThread-" + std::to_string(shard);
    threadPool_[name] = worker;
    LOG(INFO) << "Launched " << name << "\t" << worker->native_handle();
  }

  // Slow repliers
  for (int shard = 0; shard < replyShardNum; ++shard) {
    totalWorkerNum_++;
    std::thread* worker =
        new std::thread(&Replica::SlowReplyThread, this, shard);
    const std::string name = "SlowReplyThread-" + std::to_string(shard);
    threadPool_[name] = worker;
    LOG(INFO) << "Launched " << name << "\t" << worker->native_handle();
  }

  // Trackers
  for (int shard = 0; shard < replicaConfig_.trackShards; ++shard) {
    totalWorkerNum_++;
    std::thread* worker = new std::thread(&Replica::TrackThread, this, shard);
    const std::string name = "TrackThread-" + std::to_string(shard);
    threadPool_[name] = worker;
    LOG(INFO) << "Launched " << name << "\t" << worker->native_handle();
  }

  // Index senders: shard i is handed cvId i + replyShardNum + 1
  for (int shard = 0; shard < replicaConfig_.indexSyncShards; ++shard) {
    totalWorkerNum_++;
    std::thread* worker = new std::thread(&Replica::IndexSendThread, this,
                                          shard, shard + replyShardNum + 1);
    const std::string name = "IndexSendThread-" + std::to_string(shard);
    threadPool_[name] = worker;
    LOG(INFO) << "Launched " << name << "\t" << worker->native_handle();
    if (!AmLeader()) {
      // A follower only needs one sync thread
      break;
    }
  }

  // Singleton workers
  totalWorkerNum_++;
  std::thread* indexRecv = new std::thread(&Replica::IndexRecvThread, this);
  threadPool_["IndexRecvThread"] = indexRecv;
  LOG(INFO) << "Launched IndexRecvThread\t" << indexRecv->native_handle();

  totalWorkerNum_++;
  std::thread* indexProcess =
      new std::thread(&Replica::IndexProcessThread, this);
  threadPool_["IndexProcessThread"] = indexProcess;
  LOG(INFO) << "Launched IndexProcessThread\t"
            << indexProcess->native_handle();

  totalWorkerNum_++;
  std::thread* missedIndexAck =
      new std::thread(&Replica::MissedIndexAckThread, this);
  threadPool_["MissedIndexAckThread"] = missedIndexAck;
  LOG(INFO) << "Launched MissedIndexAckThread\t"
            << missedIndexAck->native_handle();

  totalWorkerNum_++;
  std::thread* missedReqAck =
      new std::thread(&Replica::MissedReqAckThread, this);
  threadPool_["MissedReqAckThread"] = missedReqAck;
  LOG(INFO) << "Launched MissedReqAckThread\t"
            << missedReqAck->native_handle();

  // GarbageCollectThread is currently disabled (not launched, not counted).

  totalWorkerNum_++;
  std::thread* owdCalc = new std::thread(&Replica::OWDCalcThread, this);
  threadPool_["OWDCalcThread"] = owdCalc;
  LOG(INFO) << "Launch OWDCalcThread " << owdCalc->native_handle();

  // The LogHash debugging thread is currently disabled as well.

  LOG(INFO) << "Master Thread " << pthread_self();
  LOG(INFO) << "totalWorkerNum_=" << totalWorkerNum_;
}
// Handler invoked by a request-receiver endpoint for every incoming message.
//
// msgHdr    header of the received message; only CLIENT_REQUEST is accepted
// msgBuffer serialized Request, msgHdr->msgLen bytes long
// sender    address the message came from; remembered as the reply address
//           of the proxy the first time that proxy id is seen
//
// A valid request contributes one OWD sample to owdQu_ (when the receive time
// is later than the send time) and is turned into a RequestBody that is
// handed to the record queue selected by reqKey.
void Replica::ReceiveClientRequest(MessageHeader* msgHdr, char* msgBuffer,
                                   Address* sender) {
  if (msgHdr->msgType != MessageType::CLIENT_REQUEST) {
    LOG(WARNING) << "Invalid Message Type " << (uint32_t)(msgHdr->msgType);
    return;
  }
  Request request;
  if (!request.ParseFromArray(msgBuffer, msgHdr->msgLen)) {
    LOG(WARNING) << "Parse request fail";
    return;
  }

  // Collect OWD sample. The sample is computed from the very timestamp that
  // was compared against sendtime, so the subtraction cannot wrap around even
  // if the clock is adjusted between two reads.
  const uint64_t recvTime = GetMicrosecondTimestamp();
  if (recvTime > request.sendtime()) {
    owdQu_.enqueue(std::pair(request.proxyid(), recvTime - request.sendtime()));
  }

  if (!proxyAddressMap_.get(request.proxyid())) {
    /** When one proxy sends the request, it needs to specify a proper
     * *unique* proxyid related to one specific receiver thread on the
     * replica, so that this replica's different receiver threads will not
     * insert the same entry concurrently (otherwise, it may cause memory
     * leakage).
     *
     * In our proxy implementation, each proxy machine has a unique id with
     * multiple shards. The machine-id concatenated with the shard-id becomes
     * a unique proxy-id, which is taken modulo replica-shard-num to choose
     * the replica receiver it sends to.
     **/
    Address* addr = new Address(*sender);
    proxyAddressMap_.assign(request.proxyid(), addr);
  }

  const uint64_t reqKey = CONCAT_UINT32(request.clientid(), request.reqid());
  const uint64_t deadline = request.sendtime() + request.bound();
  // NOTE(review): request.key() is taken from the wire as-is and later used
  // (as body.opKey) to index keyNum_-sized vectors -- confirm it is validated
  // somewhere before that.
  RequestBody* rb =
      new RequestBody(deadline, reqKey, request.key(), request.proxyid(),
                      request.command(), request.iswrite());
  const uint32_t quId = reqKey % recordQu_.size();
  recordQu_[quId].enqueue(rb);
}
// Parks the calling worker until the replica status equals targetStatus or
// becomes TERMINATED. Returns immediately when the status already matches.
// While parked, the worker removes itself from activeWorkerNum_; it adds
// itself back inside the wait predicate at the moment the predicate succeeds.
void Replica::BlockWhenStatusIsNot(char targetStatus) {
  if (status_ == targetStatus) {
    return;
  }
  activeWorkerNum_.fetch_sub(1);
  std::unique_lock waitLock(waitMutext_);
  waitVar_.wait(waitLock, [this, targetStatus] {
    if (status_ != ReplicaStatus::TERMINATED && status_ != targetStatus) {
      // Keep waiting
      return false;
    }
    // Unblock: count this worker as active again
    activeWorkerNum_.fetch_add(1);
    return true;
  });
}
void Replica::OWDCalcThread() {
activeWorkerNum_.fetch_add(1);
std::pair owdSample;
// uint32_t logCnt = 0;
while (status_ != ReplicaStatus::TERMINATED) {
BlockWhenStatusIsNot(ReplicaStatus::NORMAL);
while (owdQu_.try_dequeue(owdSample)) {
uint64_t proxyId = owdSample.first;
uint32_t owd = owdSample.second;
owdSampleNum_[proxyId]++;
if (slidingWindow_[proxyId].size() < slidingWindowLen_) {
slidingWindow_[proxyId].push_back(owd);
} else {
slidingWindow_[proxyId][owdSampleNum_[proxyId] % slidingWindowLen_] =
owd;
}
if (owdSampleNum_[proxyId] >= slidingWindowLen_) {
std::vector tmpSamples(slidingWindow_[proxyId]);
sort(tmpSamples.begin(), tmpSamples.end());
uint32_t movingEstimate =
tmpSamples[slidingWindowLen_ * movingPercentile_];
owdMap_.assign(proxyId, movingEstimate);
}
}
// reduce CPU cost
nanosleep((const struct timespec[]){{0, 1000000L}}, NULL);
}
uint32_t preVal = activeWorkerNum_.fetch_sub(1);
LOG(INFO) << "OWDCalcThread Terminated: " << preVal - 1
<< " worker remaining";
}
// Worker loop of request-receiver shard `id`.
// Each pass waits for NORMAL status, (re-)registers the shard's receiver
// context and then runs the endpoint's event loop until something breaks it.
// The thread leaves once the replica is TERMINATED.
void Replica::ReceiveThread(int id) {
  activeWorkerNum_.fetch_add(1);
  for (; status_ != ReplicaStatus::TERMINATED;) {
    BlockWhenStatusIsNot(ReplicaStatus::NORMAL);
    auto& receiverCtx = requestContext_[id];
    receiverCtx->Register(endPointType_);
    receiverCtx->endPoint_->LoopRun();
  }
  const uint32_t previousActive = activeWorkerNum_.fetch_sub(1);
  LOG(INFO) << "ReceiveThread Terminated:" << previousActive - 1
            << " worker remaining";
}
void Replica::RecordThread(int id) {
activeWorkerNum_.fetch_add(1);
RequestBody* rb;
// uint64_t sta, ed, cnt;
// cnt = 0;
while (status_ != ReplicaStatus::TERMINATED) {
BlockWhenStatusIsNot(ReplicaStatus::NORMAL);
if (recordQu_[id].try_dequeue(rb)) {
// cnt++;
// if (cnt == 1) {
// sta = GetMicrosecondTimestamp();
// }
// if (cnt % 100000 == 0) {
// ed = GetMicrosecondTimestamp();
// float rate = 100000.0 / ((ed - sta) * 1e-6);
// sta = ed;
// LOG(INFO) << "id=" << id << " record rate = " << rate << "\t"
// << "recordQuLen=" << recordQu_[id].size_approx() << "\t"
// << "processQuLen=" << processQu_.size_approx() << "\t"
// << "gap sample =" << ed - rb->deadline
// << " \t deadline=" << rb->deadline;
// }
/** The map is sharded by reqKey */
LogEntry* duplicate = recordMap_[id].get(rb->reqKey);
if (duplicate == NULL) {
SHA_HASH dummy;
LogEntry* newEntry = new LogEntry(*rb, dummy, dummy);
recordMap_[id].assign(rb->reqKey, newEntry);
processQu_.enqueue(newEntry);
} else {
// Duplicate requests
processQu_.enqueue(duplicate);
}
delete rb;
}
}
uint32_t preVal = activeWorkerNum_.fetch_sub(1);
LOG(INFO) << "RecordThread-" << id << " Terminated: " << preVal - 1
<< " worker remaining";
}
void Replica::TrackThread(int id) {
activeWorkerNum_.fetch_add(1);
while (status_ != ReplicaStatus::TERMINATED) {
BlockWhenStatusIsNot(ReplicaStatus::NORMAL);
if (trackedEntry_[id]->next) {
LogEntry* next = trackedEntry_[id]->next;
// LOG(INFO) << "next logId = " << next->logId;
if (next->logId % trackedEntry_.size() == (uint32_t)id) {
if (trackedEntry_[id]->logId >= CONCURRENT_MAP_START_INDEX) {
uint32_t a = trackedEntry_[id]->logId;
uint32_t b = next->logId;
if (a + trackedEntry_.size() != b) {
LOG(ERROR) << "myId = " << trackedEntry_[id]->logId << "\t"
<< "sz = " << trackedEntry_.size() << "\t"
<< "next=" << next->logId << "\t"
<< trackedEntry_[id]->logId + trackedEntry_.size()
<< "\t"
<< (trackedEntry_[id]->logId + trackedEntry_.size() !=
next->logId)
<< "\t"
<< "a=" << a << "\t"
<< "b=" << b;
}
ASSERT(trackedEntry_[id]->logId + trackedEntry_.size() ==
next->logId);
}
syncedLogEntryByLogId_.assign(next->logId, next);
syncedLogEntryByReqKey_.assign(next->body.reqKey, next);
}
trackedEntry_[id] = next;
}
if (status_ == ReplicaStatus::TERMINATED) {
LOG(INFO) << "Track Thread terminate ";
}
}
uint32_t preVal = activeWorkerNum_.fetch_sub(1);
LOG(INFO) << "TrackThread-" << id << " Terminated: " << preVal - 1
<< " worker remaining";
}
// Worker loop of the (single) process thread.
//
// Each pass does two things:
//  1. takes at most one entry from processQu_ and dispatches on its status
//     (new entries go to the early buffer or late buffer, already-processed
//     ones are re-queued for a fast or slow reply);
//  2. releases every early-buffer entry whose deadline has passed, in
//     <deadline, reqKey> order, by calling ProcessRequest on it.
//
// id is accepted for symmetry with the other workers and is not used.
void Replica::ProcessThread(int id) {
  activeWorkerNum_.fetch_add(1);
  LogEntry* entry;
  while (status_ != ReplicaStatus::TERMINATED) {
    BlockWhenStatusIsNot(ReplicaStatus::NORMAL);
    bool amLeader = AmLeader();
    if (processQu_.try_dequeue(entry)) {
      if (entry->status == EntryStatus::INITIAL) {
        std::pair earlyBufferRank(entry->body.deadline, entry->body.reqKey);
        if (earlyBufferRank > lastReleasedEntryByKeys_[entry->body.opKey]) {
          earlyBuffer_[earlyBufferRank] = entry;
          entry->status = EntryStatus::IN_PROCESS;
        } else if (amLeader) {
          // The entry is not ranked after the last released entry of its
          // key, so it cannot enter the early buffer as-is. The leader
          // rewrites its deadline to just after that entry and accepts it.
          entry->body.deadline =
              lastReleasedEntryByKeys_[entry->body.opKey].first + 1;
          earlyBufferRank.first = entry->body.deadline;
          earlyBuffer_[earlyBufferRank] = entry;
          entry->status = EntryStatus::IN_PROCESS;
        } else {
          // Followers leave it in late buffer
          entry->status = EntryStatus::IN_LATEBUFFER;
        }
      } else if (entry->status == EntryStatus::IN_PROCESS ||
                 entry->status == EntryStatus::IN_LATEBUFFER) {
        // Duplicate of a request that is still in flight: nothing to do.
        // NOTE(review): this also skips the early-buffer polling below for
        // this pass -- confirm that is intended.
        continue;
      } else if (entry->status == EntryStatus::PROCESSED) {
        uint32_t quId = (entry->body.reqKey) % fastReplyQu_.size();
        fastReplyQu_[quId].enqueue(entry);
      } else if (entry->status == EntryStatus::TO_SLOW_REPLY) {
        uint32_t quId = (entry->body.reqKey) % slowReplyQu_.size();
        slowReplyQu_[quId].enqueue(entry);
      } else {
        LOG(WARNING) << "Unexpected Entry Status " << (int)(entry->status);
      }
    }
    // Polling early-buffer
    uint64_t nowTime = GetMicrosecondTimestamp();
    // This while loop is safe because there is only one processThread.
    // Parallelization of this thread is not supported.
    while (!earlyBuffer_.empty()) {
      LogEntry* nextEntry = earlyBuffer_.begin()->second;
      if (nowTime < nextEntry->body.deadline) {
        break;
      }
      if (nextEntry->body.isWrite) {
        // Only writes advance the last released rank of their key
        lastReleasedEntryByKeys_[nextEntry->body.opKey] =
            earlyBuffer_.begin()->first;
      }
      ProcessRequest(nextEntry, amLeader, true, amLeader);
      earlyBuffer_.erase(earlyBuffer_.begin());
    }
  }
  uint32_t preVal = activeWorkerNum_.fetch_sub(1);
  LOG(INFO) << "ProcessThread Terminated: " << preVal - 1
            << " worker remaining";
}
// Appends `entry` to the tail of the synced or the unsynced log list and
// fills in its derived fields (hashes, log id, links to the neighbouring
// entries of the same key).
//
// entry       the entry to append
// isSyncedReq true: append to the synced log; false: to the unsynced log
// sendReply   true: also queue the entry for a fast reply
// canExecute  the request is executed only if this and isSyncedReq are both
//             true; otherwise the result is the empty string
void Replica::ProcessRequest(LogEntry* entry, const bool isSyncedReq,
                             const bool sendReply, const bool canExecute) {
  RequestBody& body = entry->body;

  // Read requests do not contribute to the hash
  entry->entryHash =
      body.isWrite ? CalculateHash(body.deadline, body.reqKey) : SHA_HASH();
  entry->logHash = entry->entryHash;

  // Pick the tail pointers of the list this entry is appended to
  auto& tailByKey =
      isSyncedReq ? maxSyncedLogEntryByKey_ : maxUnSyncedLogEntryByKey_;
  auto& tail = isSyncedReq ? maxSyncedLogEntry_ : maxUnSyncedLogEntry_;

  // Previous entry on the same key, and the closest previous write on it
  LogEntry* sameKeyPrev = tailByKey[body.opKey];
  entry->prevNonCommutative = sameKeyPrev;
  if (sameKeyPrev) {
    entry->prevNonCommutativeWrite =
        sameKeyPrev->body.isWrite ? sameKeyPrev
                                  : sameKeyPrev->prevNonCommutativeWrite;
  }
  entry->prev = tail;

  if (isSyncedReq && canExecute) {
    entry->result = ApplicationExecute(body);
  } else {
    entry->result = "";
  }

  // Fold in the accumulated hash of the previous write on this key
  if (entry->prevNonCommutativeWrite) {
    entry->logHash.XOR(entry->prevNonCommutativeWrite->logHash);
  }

  ASSERT(entry->prev != NULL);
  entry->logId = entry->prev->logId + 1;
  entry->status = EntryStatus::PROCESSED;

  // Forward links from the predecessors on the same key
  if (entry->prevNonCommutative) {
    entry->prevNonCommutative->nextNonCommutative = entry;
  }
  if (entry->prevNonCommutativeWrite && body.isWrite) {
    entry->prevNonCommutativeWrite->nextNonCommutativeWrite = entry;
  }
  // First unsynced entry of this key
  if (!isSyncedReq && minUnSyncedLogEntryByKey_[body.opKey] == NULL) {
    minUnSyncedLogEntryByKey_[body.opKey] = entry;
  }

  // Link the entry into the list, then advance the tails
  entry->prev->next = entry;
  tailByKey[body.opKey] = entry;
  tail = entry;

  if (sendReply) {
    const uint32_t quId = (entry->body.reqKey) % fastReplyQu_.size();
    fastReplyQu_[quId].enqueue(entry);
  }
}
// Worker loop of fast-reply shard `id`.
//
// id   index of the queue (fastReplyQu_), sender (fastReplySender_) and
//      safeToClearUnSyncedLogId_ slot owned by this thread
// cvId index of this thread's slot in crashVectorInUse_
//
// For every entry taken from fastReplyQu_[id] a FAST_REPLY is built and sent
// to the proxy that issued the request. The reply hash is the entry's log
// hash XORed with the hash of the crash vector in use; on followers it is
// additionally adjusted around the sync-point (see the comments below).
// A single Reply object is reused for all replies, so a field that a branch
// does not set keeps the value written by an earlier reply.
void Replica::FastReplyThread(int id, int cvId) {
activeWorkerNum_.fetch_add(1);
Reply reply;
reply.set_replytype(MessageType::FAST_REPLY);
reply.set_replicaid(replicaId_);
CrashVectorStruct* cv = crashVectorInUse_[cvId];
// NOTE(review): replyNum is only incremented; the code that read it is
// commented out.
uint32_t replyNum = 0;
// uint64_t startTime, endTime;
while (status_ != ReplicaStatus::TERMINATED) {
BlockWhenStatusIsNot(ReplicaStatus::NORMAL);
bool amLeader = AmLeader();
// Publish, for this thread's slot, the log id the thread has caught up to
safeToClearUnSyncedLogId_[id].store(prepareToClearUnSyncedLogId_.load());
// Before encoding crashVector into hash, check whether the crashVector
// (cv) is the freshest one
CrashVectorStruct* masterCV = crashVectorInUse_[0].load();
if (cv->version_ < masterCV->version_) {
// My crash vector is stale, update it
crashVectorInUse_[cvId] = masterCV;
cv = masterCV;
}
LogEntry* entry = NULL;
if (fastReplyQu_[id].try_dequeue(entry)) {
reply.set_iswrite(entry->body.isWrite);
reply.set_opkey(entry->body.opKey);
replyNum++;
// if (replyNum % 500000 == 0) {
// LOG(INFO) << id << "QuLen=" << fastReplyQu_[id].size_approx();
// }
Address* addr = proxyAddressMap_.get(entry->body.proxyId);
if (!addr) {
// The replica cannot find the address to send reply
// This can happen in very trivial edge cases, e.g.,
// Step 1: This replica misses the entry
// Step 2: The other replica gives this replica the missing entry
// Step 3: This replica has not received any entries from that proxy,
// so it does not have any addr info
// Step 4: This replica wants to send reply for this entry
LOG(ERROR) << "Cannot find the address of the proxy "
<< HIGH_32BIT(entry->body.proxyId) << "-"
<< LOW_32BIT(entry->body.proxyId);
continue;
}
reply.set_view(viewId_);
reply.set_clientid(HIGH_32BIT(entry->body.reqKey));
reply.set_reqid(LOW_32BIT(entry->body.reqKey));
reply.set_result(entry->result);
// If the owdMap_ does not have the proxyId (i.e. the owd for this
// proxyId has not been estimated), it will return 0 (0 happens to be
// the dummy value of protobuf, and the proxy will not consider it as an
// estimated owd)
reply.set_owd(owdMap_.get(entry->body.proxyId));
SHA_HASH hash(entry->logHash);
hash.XOR(cv->cvHash_);
if (amLeader) {
// Leader's logic is very easy: after XORing the crashVector and the
// log entry hash together, it can directly reply
reply.set_hash(hash.hash, SHA_DIGEST_LENGTH);
reply.set_logid(entry->logId);
reply.set_maxsyncedlogid(maxSyncedLogEntry_.load()->logId);
uint32_t proxyMachineId = HIGH_32BIT(entry->body.proxyId);
repliedSyncPoint_[proxyMachineId] = reply.maxsyncedlogid();
fastReplySender_[id]->SendMsgTo(*addr, reply, MessageType::FAST_REPLY);
// replyLogQu_.enqueue(reply);
// LOG(INFO) << "Leader reply=" << reply.reqid() << "\t"
// << "opKey=" << entry->opKey << "\t"
// << "hash=" << hash.toString();
} else {
// But follower's hash is a bit complicated, because it needs to
// consider both synced entries and unsynced entries, i.e. We need to
// (1) eliminate the part to the left of sync-point and (2) use the
// remaining part (to the right of sync-point) to XOR the part that
// has already been synced
//
// Let's first get the boundary, i.e. minUnSyncedLogId_ and
// maxSyncedLogId_. maxSynced is always updated earlier than
// minUnSynced, so we first get minUnSynced, and then get maxSynced;
// this ensures minUnSynced is no fresher than maxSynced. By contrast,
// if we get the two variables in the reverse order, then we cannot be
// sure which variable is fresher, that can lead to the missing of
// some entries during hash calculation
//
// NOTE(review): only the leader branch above calls set_logid() with
// the entry's log id; the follower replies below keep whatever logid
// the reused Reply object last held -- confirm this is intended.
LogEntry* unsyncedEntry = minUnSyncedLogEntryByKey_[entry->body.opKey];
LogEntry* syncedEntry = maxSyncedLogEntryByKey_[entry->body.opKey];
if (syncedEntry && syncedEntry->body.isWrite == false) {
// Only Write matters
syncedEntry = syncedEntry->prevNonCommutativeWrite;
assert(syncedEntry == NULL || syncedEntry->body.isWrite);
}
if (syncedEntry == NULL) {
// The index sync process may have not been started, or may have not
// caught up; or the unsynced logs have been reclaimed by
// GarbageCollectionThread (we have advanced
// safeToClearUnSyncedLogId_). We cannot decide the sync-point, so
// we directly reply with the XORed hash (similar to the leader)
reply.set_hash(hash.hash, SHA_DIGEST_LENGTH);
reply.set_maxsyncedlogid(maxSyncedLogEntry_.load()->logId);
uint32_t proxyMachineId = HIGH_32BIT(entry->body.proxyId);
repliedSyncPoint_[proxyMachineId] = reply.maxsyncedlogid();
fastReplySender_[id]->SendMsgTo(*addr, reply,
MessageType::FAST_REPLY);
// replyLogQu_.enqueue(reply);
} else {
// The follower already gets some synced non-commutative logs (via
// index sync process)
// Log entries up to syncedEntry are all synced
// syncedEntry->hash represents them
if (entry->LessOrEqual(*syncedEntry)) {
// No need to send fast replies, because this entry has already
// been covered by index sync process, just give it a dummy reply,
// which includes the max-synced-log-id
// NOTE(review): the dummy reply does not call set_hash(), and
// repliedSyncPoint_ is not updated after sending it -- confirm
// both are intended.
uint32_t proxyMachineId = HIGH_32BIT(entry->body.proxyId);
if (repliedSyncPoint_[proxyMachineId] <
maxSyncedLogEntry_.load()->logId) {
reply.set_clientid(0);
reply.set_reqid(0);
reply.set_logid(0);
reply.set_maxsyncedlogid(maxSyncedLogEntry_.load()->logId);
fastReplySender_[id]->SendMsgTo(*addr, reply,
MessageType::FAST_REPLY);
}
} else {
// Beyond syncedEntry, we need to find the boundary in the unsynced
// logs
// TODO: Check the following
// Since unsyncedLogId is no fresher (maybe older) than syncedLogId,
// then unsyncedEntry may have already been surpassed by
// syncedEntry, we need to remove the (potential) overlap
// NOTE(review): unsyncedEntry is dereferenced without a NULL check;
// confirm minUnSyncedLogEntryByKey_[opKey] is always set for
// entries that reach this branch.
// NOTE(review): both branches of the isWrite test below are
// identical.
while (unsyncedEntry->LessOrEqual(*syncedEntry)) {
if (unsyncedEntry->body.isWrite) {
if (unsyncedEntry->nextNonCommutative) {
unsyncedEntry = unsyncedEntry->nextNonCommutative;
} else {
break;
}
} else {
if (unsyncedEntry->nextNonCommutative) {
unsyncedEntry = unsyncedEntry->nextNonCommutative;
} else {
break;
}
}
}
// LogStruct log;
// log.originalHash = hash.toString();
// hash encodes all the (unsynced) entries up to entry
hash.XOR(unsyncedEntry->logHash); // Remove all previous hash
// before unsyncedEntry [included]
// log.unsynced = unsyncedEntry;
// log.addback = false;
if (syncedEntry->LessThan(*unsyncedEntry)) {
// add itself back (read request is 0)
hash.XOR(unsyncedEntry->entryHash);
// log.addback = true;
}
// Now hash only encodes [unsyncedEntry, entry]
// Let's add the synced part
// log.synced = syncedEntry;
hash.XOR(syncedEntry->logHash);
// log.finalE = entry;
// entryQu_.enqueue(log);
reply.set_hash(hash.hash, SHA_DIGEST_LENGTH);
reply.set_maxsyncedlogid(maxSyncedLogEntry_.load()->logId);
uint32_t proxyMachineId = HIGH_32BIT(entry->body.proxyId);
repliedSyncPoint_[proxyMachineId] = reply.maxsyncedlogid();
fastReplySender_[id]->SendMsgTo(*addr, reply,
MessageType::FAST_REPLY);
// replyLogQu_.enqueue(reply);
}
}
}
}
}
uint32_t preVal = activeWorkerNum_.fetch_sub(1);
LOG(INFO) << "Fast Reply Terminated " << preVal - 1 << " worker remaining";
}
void Replica::SlowReplyThread(int id) {
activeWorkerNum_.fetch_add(1);
Reply reply;
reply.set_replicaid(replicaId_);
reply.set_hash("");
// uint32_t replyNum = 0;
// uint64_t startTime, endTime;
while (status_ != ReplicaStatus::TERMINATED) {
BlockWhenStatusIsNot(ReplicaStatus::NORMAL);
if (AmLeader()) {
// Leader does not send slow replies
nanosleep((const struct timespec[]){{0, 1000000L}}, NULL);
continue;
}
LogEntry* entry = NULL;
if (slowReplyQu_[id].try_dequeue(entry)) {
uint32_t logId = entry->logId;
reply.set_view(viewId_);
reply.set_clientid((entry->body.reqKey) >> 32);
reply.set_reqid((uint32_t)(entry->body.reqKey));
// Optimize: SLOW_REPLY => COMMIT_REPLY
if (logId <= committedLogId_) {
reply.set_replytype(MessageType::COMMIT_REPLY);
reply.set_result(entry->result);
} else {
reply.set_replytype(MessageType::SLOW_REPLY);
reply.set_result("");
}
reply.set_owd(owdMap_.get(entry->body.proxyId));
reply.set_maxsyncedlogid(maxSyncedLogEntry_.load()->logId);
Address* addr = proxyAddressMap_.get(entry->body.proxyId);
if (addr) {
slowReplySender_[id]->SendMsgTo(*addr, reply, MessageType::SLOW_REPLY);
}
// replyNum++;
// if (replyNum == 1) {
// startTime = GetMicrosecondTimestamp();
// } else if (replyNum % 100000 == 0) {
// endTime = GetMicrosecondTimestamp();
// float rate = 100000 / ((endTime - startTime) * 1e-6);
// LOG(INFO) << "id=" << id << "\t Slow Reply Rate=" << rate
// << "\t QuLen=" << slowReplyQu_[id].size_approx() << "\t"
// << "pendingIndexSync_ qu =" << pendingIndexSync_.size();
// startTime = endTime;
// }
}
}
uint32_t preVal = activeWorkerNum_.fetch_sub(1);
LOG(INFO) << "SlowReplyThread Terminated " << preVal - 1
<< " worker remaining ";
}
// Leader-side worker that periodically broadcasts IndexSync messages to all
// other replicas through indexSender_[id].
//
// Each round covers the log ids after the last entry this thread has already
// sent, capped by indexTransferBatch_ and by the current maxSyncedLogEntry_.
// When there is nothing new, logidbegin ends up larger than logidend and the
// message carries no deadlines/reqKeys, i.e. it only serves as a heartbeat.
//
// @param id    index of the sender endpoint (indexSender_) used by this thread
// @param cvId  not used inside this function
void Replica::IndexSendThread(int id, int cvId) {
  activeWorkerNum_.fetch_add(1);
  LogEntry* lastSyncedEntry = syncedLogEntryHead_;
  IndexSync indexSyncMsg;
  const uint32_t syncPeriod = replicaConfig_.indexSyncPeriodUs;
  // Split the period into seconds and nanoseconds: nanosleep rejects a
  // tv_nsec outside [0, 999999999], and (syncPeriod * 1000) would also
  // overflow 32 bits for large periods.
  struct timespec sleepIntval;
  sleepIntval.tv_sec = syncPeriod / 1000000;
  sleepIntval.tv_nsec = (syncPeriod % 1000000) * 1000L;
  while (status_ != ReplicaStatus::TERMINATED) {
    BlockWhenStatusIsNot(ReplicaStatus::NORMAL);
    if (!AmLeader()) {
      // Although this replica is not leader currently,
      // we still keep this thread. When it becomes the leader
      // we can immediately use the thread instead of launching extra threads
      // (slowly)
      nanosleep((const struct timespec[]){{0, 1000000L}}, NULL);
      continue;
    }
    if (maxSyncedLogEntry_ == NULL) {
      continue;
    }
    // (1) Leader has some indices to sync
    // (2) There is nothing to send, but we still send an indexSync msg every
    // sync period (to serve as leader's heartbeat)
    indexSyncMsg.set_view(viewId_);
    indexSyncMsg.set_logidbegin(lastSyncedEntry->logId + 1);
    uint32_t logEnd = maxSyncedLogEntry_.load()->logId;
    logEnd = std::min(indexSyncMsg.logidbegin() + indexTransferBatch_, logEnd);
    indexSyncMsg.set_logidend(logEnd);
    indexSyncMsg.clear_deadlines();
    indexSyncMsg.clear_reqkeys();
    // Walk the synced log list and append the (deadline, reqKey) of every
    // entry in [logidbegin, logidend]
    for (uint32_t i = indexSyncMsg.logidbegin(); i <= indexSyncMsg.logidend();
         i++) {
      LogEntry* entry = lastSyncedEntry->next;
      ASSERT(entry != NULL);
      ASSERT(entry->logId == i);
      indexSyncMsg.add_deadlines(entry->body.deadline);
      indexSyncMsg.add_reqkeys(entry->body.reqKey);
      lastSyncedEntry = entry;
    }
    indexSyncMsg.set_sendtime(GetMicrosecondTimestamp());
    // Send to all followers
    for (uint32_t r = 0; r < replicaNum_; r++) {
      if (r != replicaId_) {
        indexSender_[id]->SendMsgTo(*(indexReceiver_[r]), indexSyncMsg,
                                    MessageType::SYNC_INDEX);
      }
    }
    nanosleep(&sleepIntval, NULL);
  }
  uint32_t preVal = activeWorkerNum_.fetch_sub(1);
  LOG(INFO) << "IndexSendThread Terminated " << preVal - 1
            << " worker remaining";
}
// Thread body that drives the index-sync endpoint: until the replica is
// TERMINATED it (re)registers indexSyncContext_ and runs the endpoint's loop.
void Replica::IndexRecvThread() {
  activeWorkerNum_.fetch_add(1);
  for (;;) {
    if (status_ == ReplicaStatus::TERMINATED) {
      break;
    }
    BlockWhenStatusIsNot(ReplicaStatus::NORMAL);
    indexSyncContext_->Register(endPointType_);
    indexSyncContext_->endPoint_->LoopRun();
  }
  const uint32_t workersBefore = activeWorkerNum_.fetch_sub(1);
  LOG(INFO) << "IndexRecvThread Terminated " << workersBefore - 1
            << " worker remaining";
}
// Handler for messages arriving on the index-sync endpoint.
// It first publishes the reclaim points this thread promises not to touch any
// more, then hands a heap-allocated copy of the header and payload over to
// indexQu_. The copies are released by the consumer of indexQu_.
void Replica::ReceiveIndexSyncMessage(MessageHeader* msgHdr, char* msgBuffer) {
  // Promise to the GarbageCollectThread, that I will not use the data before
  // safeToClearLateBufferLogId_ and safeToClearUnSyncedLogId_, so that
  // GarbageCollectThread can safely reclaim them.
  // This thread uses the slot right after the fast-reply shards.
  safeToClearLateBufferLogId_.store(prepareToClearLateBufferLogId_.load());
  const auto mySlot = fastReplyQu_.size();
  safeToClearUnSyncedLogId_[mySlot].store(prepareToClearUnSyncedLogId_.load());

  // The incoming buffers are not retained, so duplicate them before enqueueing
  MessageHeader* hdrCopy = new MessageHeader(msgHdr->msgType, msgHdr->msgLen);
  char* payloadCopy = new char[msgHdr->msgLen];
  memcpy(payloadCopy, msgBuffer, msgHdr->msgLen);
  indexQu_.enqueue({hdrCopy, payloadCopy});
}
void Replica::IndexProcessThread() {
activeWorkerNum_.fetch_add(1);
std::pair ele;
while (status_ != ReplicaStatus::TERMINATED) {
BlockWhenStatusIsNot(ReplicaStatus::NORMAL);
while (indexQu_.try_dequeue(ele)) {
MessageHeader* msgHdr = ele.first;
char* msgBuffer = ele.second;
if (msgHdr->msgType == MessageType::SYNC_INDEX) {
IndexSync idxSyncMsg;
if (idxSyncMsg.ParseFromArray(msgBuffer, msgHdr->msgLen)) {
if (!CheckView(idxSyncMsg.view(), false)) {
delete msgHdr;
delete[] msgBuffer;
break;
}
lastHeartBeatTime_ = GetMicrosecondTimestamp();
if (idxSyncMsg.logidbegin() > idxSyncMsg.logidend()) {
// Pure heart beat
continue;
}
if (idxSyncMsg.logidend() > maxSyncedLogEntry_.load()->logId) {
std::pair key(idxSyncMsg.logidbegin(),
idxSyncMsg.logidend());
pendingIndexSync_[key] = idxSyncMsg;
}
// Process pendingIndexSync, if any
while (!pendingIndexSync_.empty()) {
if (ProcessIndexSync(pendingIndexSync_.begin()->second)) {
pendingIndexSync_.erase(pendingIndexSync_.begin());
} else {
break;
}
}
}
} else if (msgHdr->msgType == MessageType::MISSED_REQ) {
MissedReq missedReqMsg;
if (missedReqMsg.ParseFromArray(msgBuffer, msgHdr->msgLen)) {
for (int i = 0; i < missedReqMsg.reqs().size(); i++) {
const RequestBodyMsg& rbMsg = missedReqMsg.reqs(i);
if (missedReqKeys_.find(rbMsg.reqkey()) != missedReqKeys_.end()) {
RequestBody* rb = new RequestBody(
rbMsg.deadline(), rbMsg.reqkey(), rbMsg.key(),
rbMsg.proxyid(), rbMsg.command(), rbMsg.iswrite());
// We must handle it to ProcessThread instead of processing it
// here, to avoid data race (and further memroy leakage), although
// it is a trivial possibility
uint32_t quId = rbMsg.reqkey() % recordQu_.size();
recordQu_[quId].enqueue(rb);
missedReqKeys_.erase(rbMsg.reqkey());
fetchTime_.push_back(GetMicrosecondTimestamp() -
askTimebyReqKey_[rbMsg.reqkey()]);
askTimebyReqKey_.erase(rbMsg.reqkey());
}
}
}
} else {
LOG(WARNING) << "Unexpected msg type " << (int)(msgHdr->msgType);
}
delete msgHdr;
delete[] msgBuffer;
}
}
uint32_t preVal = activeWorkerNum_.fetch_sub(1);
LOG(INFO) << "IndexProcessThread Terminated: " << preVal - 1
<< " worker remaining";
}
// Applies one IndexSync message to the synced log of this replica.
//
// Returns true when the message needs no further handling (it is either
// entirely covered by the current max synced log id, or all of its new indices
// were appended). Returns false when the message cannot be completed yet:
// either earlier indices are missing (AskMissedIndex is triggered) or some of
// the referenced requests are unknown locally (AskMissedRequest is triggered).
bool Replica::ProcessIndexSync(const IndexSync& idxSyncMsg) {
uint32_t maxSyncedLogId = maxSyncedLogEntry_.load()->logId;
if (idxSyncMsg.logidend() <= maxSyncedLogId) {
// This idxSyncMsg is useless
return true;
}
if (idxSyncMsg.logidbegin() > maxSyncedLogId + 1) {
// Missing some indices: remember the gap [maxSyncedLogId + 1, logidbegin - 1]
// and ask for it
missedIndices_ = {maxSyncedLogId + 1, idxSyncMsg.logidbegin() - 1};
AskMissedIndex();
return false;
}
// Coming here means, no index is missing
if (indexSyncContext_->endPoint_->isTimerRegistered(indexAskTimer_)) {
indexSyncContext_->endPoint_->UnRegisterTimer(indexAskTimer_);
}
// Only the part of the message beyond the current max synced log id is new
for (uint32_t logId = maxSyncedLogId + 1; logId <= idxSyncMsg.logidend();
logId++) {
// NOTE(review): offset is not checked against reqkeys_size() /
// deadlines_size(); this relies on the sender filling one pair per log id
uint32_t offset = logId - idxSyncMsg.logidbegin();
uint64_t reqKey = idxSyncMsg.reqkeys(offset);
uint64_t deadline = idxSyncMsg.deadlines(offset);
uint32_t quId = reqKey % recordMap_.size();
LogEntry* entry = recordMap_[quId].get(reqKey);
// NOTE(review): once a request is found missing, every later reqKey of this
// message also goes into missedReqKeys_ (else branch below), even if its
// entry exists locally -- confirm this is intended
if (entry && missedReqKeys_.empty()) {
// myHash is the hash of this entry alone (left at its default for reads);
// hash additionally accumulates the logHash of the previous
// non-commutative write, if there is one
SHA_HASH myHash;
SHA_HASH hash;
if (entry->body.isWrite) {
myHash = CalculateHash(deadline, reqKey);
hash = myHash;
}
// Find the latest synced entry with the same opKey and, from it, the latest
// synced write with the same opKey
LogEntry* prevNonCommutative = maxSyncedLogEntryByKey_[entry->body.opKey];
LogEntry* prevNonCommutativeWrite = NULL;
if (prevNonCommutative) {
if (prevNonCommutative->body.isWrite) {
prevNonCommutativeWrite = prevNonCommutative;
} else {
prevNonCommutativeWrite = prevNonCommutative->prevNonCommutativeWrite;
}
}
assert(prevNonCommutativeWrite == NULL ||
prevNonCommutativeWrite->body.isWrite);
if (prevNonCommutativeWrite) {
// This request has some pre non-commutative ones
// In that way, XOR the previous accumulated hash
hash.XOR(prevNonCommutativeWrite->logHash);
}
// Create the synced entry and link it behind the current tail of the synced
// log as well as into the per-key chains
LogEntry* newEntry =
new LogEntry(entry->body, myHash, hash, prevNonCommutative, NULL,
prevNonCommutativeWrite, NULL, maxSyncedLogEntry_, NULL);
newEntry->status = EntryStatus::TO_SLOW_REPLY;
newEntry->logId = logId;
ASSERT(logId == maxSyncedLogEntry_.load()->logId + 1);
maxSyncedLogEntry_.load()->next = newEntry;
if (prevNonCommutative) {
prevNonCommutative->nextNonCommutative = newEntry;
}
if (newEntry->body.isWrite && prevNonCommutativeWrite) {
prevNonCommutativeWrite->nextNonCommutativeWrite = newEntry;
}
// uint32_t prevMaxLogId = maxSyncedLogEntry_.load()->logId;
// The new entry becomes the tail only after it has been linked via ->next
maxSyncedLogEntry_ = newEntry;
ASSERT(maxSyncedLogEntry_.load()->logId == logId);
// NOTE(review): prevMaxLogId is only declared in the commented-out line
// above, so this compiles only while ASSERT discards its argument
ASSERT(prevMaxLogId + 1 == logId);
maxSyncedLogEntryByKey_[newEntry->body.opKey] = newEntry;
// Hand the new entry to the slow-reply worker selected by its reqKey
uint32_t quId = (newEntry->body.reqKey) % slowReplyQu_.size();
slowReplyQu_[quId].enqueue(newEntry);
ASSERT(newEntry->prev->logId + 1 == newEntry->logId);
// TODO: Think about the order above
// Chunk UnSynced logs
if (minUnSyncedLogEntryByKey_[newEntry->body.opKey]) {
// Try to advance minUnSyncedLogIdByKey_[opKey]: skip the unsynced entries
// that are LessOrEqual to the entry just synced, following the write chain
// for writes and the general non-commutative chain for reads
LogEntry* unSyncedEntry =
minUnSyncedLogEntryByKey_[newEntry->body.opKey];
while (unSyncedEntry->LessOrEqual(*entry)) {
if (unSyncedEntry->body.isWrite) {
if (unSyncedEntry->nextNonCommutativeWrite) {
unSyncedEntry = unSyncedEntry->nextNonCommutativeWrite;
} else {
break;
}
} else {
if (unSyncedEntry->nextNonCommutative) {
unSyncedEntry = unSyncedEntry->nextNonCommutative;
} else {
break;
}
}
}
minUnSyncedLogEntryByKey_[newEntry->body.opKey] = unSyncedEntry;
}
} else {
// The request body is not available (or an earlier one was missing):
// remember the key so that it gets fetched from another replica
missedReqKeys_.insert(reqKey);
}
}
if (missedReqKeys_.empty()) {
return true;
} else {
// Some requests are missing; the caller keeps the message pending
AskMissedRequest();
return false;
}
}
// Thread body that drives the endpoint answering missed-index requests: until
// the replica is TERMINATED it (re)registers missedIndexAckContext_ and runs
// the endpoint's loop.
void Replica::MissedIndexAckThread() {
  activeWorkerNum_.fetch_add(1);
  for (;;) {
    if (status_ == ReplicaStatus::TERMINATED) {
      break;
    }
    BlockWhenStatusIsNot(ReplicaStatus::NORMAL);
    missedIndexAckContext_->Register(endPointType_);
    missedIndexAckContext_->endPoint_->LoopRun();
  }
  const uint32_t workersBefore = activeWorkerNum_.fetch_sub(1);
  LOG(INFO) << "MissedIndexAckThread Terminated " << workersBefore - 1
            << " worker remaining";
}
// Handler for MISSED_INDEX_ASK: answers another replica that asked for the
// indices [logidbegin, logidend] it is missing.
//
// The requested range is clipped to this replica's max synced log id and is
// answered with one IndexSync message per indexTransferBatch_ log ids, sent to
// the asking replica's index receiver. Messages of another type, or messages
// that fail to parse, are ignored.
void Replica::ReceiveAskMissedIdx(MessageHeader* msgHdr, char* msgBuffer) {
  AskIndex askIndex;
  if (msgHdr->msgType == MessageType::MISSED_INDEX_ASK &&
      askIndex.ParseFromArray(msgBuffer, msgHdr->msgLen)) {
    uint32_t logBegin = askIndex.logidbegin();
    uint32_t logEnd =
        std::min(maxSyncedLogEntry_.load()->logId, askIndex.logidend());
    for (uint32_t i = logBegin; i <= logEnd; i += indexTransferBatch_) {
      IndexSync indexSyncMsg;
      indexSyncMsg.set_view(viewId_);
      indexSyncMsg.set_logidbegin(i);
      uint32_t end = std::min(i + indexTransferBatch_ - 1, logEnd);
      indexSyncMsg.set_logidend(end);
      uint32_t logid = i;
      LogEntry* entryStart = syncedLogEntryByLogId_.get(logid);
      if (!entryStart) {
        // Since the update of syncedLogEntryByLogId_ may lag a bit behind
        // maxSyncedLogEntry_. entryStart may be NULL. In that case, we
        // terminate here
        break;
      }
      ASSERT(entryStart->logId == logid);
      // Stop at the end of the list as well: when `end` is the newest synced
      // entry its `next` is still NULL, which must not be dereferenced.
      while (entryStart != NULL && entryStart->logId <= end) {
        indexSyncMsg.add_deadlines(entryStart->body.deadline);
        indexSyncMsg.add_reqkeys(entryStart->body.reqKey);
        entryStart = entryStart->next;
      }
      indexAcker_->SendMsgTo(*(indexReceiver_[askIndex.replicaid()]),
                             indexSyncMsg, MessageType::SYNC_INDEX);
    }
  }
}
// Thread body that drives the endpoint answering missed-request asks: until
// the replica is TERMINATED it (re)registers missedReqAckContext_ and runs the
// endpoint's loop.
void Replica::MissedReqAckThread() {
  activeWorkerNum_.fetch_add(1);
  for (;;) {
    if (status_ == ReplicaStatus::TERMINATED) {
      break;
    }
    BlockWhenStatusIsNot(ReplicaStatus::NORMAL);
    missedReqAckContext_->Register(endPointType_);
    missedReqAckContext_->endPoint_->LoopRun();
  }
  const uint32_t workersBefore = activeWorkerNum_.fetch_sub(1);
  LOG(INFO) << "MissedReqAckThread Terminated " << workersBefore - 1
            << " worker remaining";
}
// Handler for MISSED_REQ_ASK: looks up every requested reqKey in recordMap_
// and returns the request bodies that are known locally to the asking replica.
// The answer is split into MissedReq messages of at most requestTrasnferBatch_
// requests each; keys that are unknown here are simply left out.
void Replica::ReceiveAskMissedReq(MessageHeader* msgHdr, char* msgBuffer) {
  AskReq askReqMsg;
  const bool validAsk = msgHdr->msgType == MessageType::MISSED_REQ_ASK &&
                        askReqMsg.ParseFromArray(msgBuffer, msgHdr->msgLen);
  if (!validAsk) {
    return;
  }
  MissedReq missedReqMsg;
  missedReqMsg.set_replicaid(this->replicaId_);
  // Ships whatever has been collected so far to the asking replica
  auto sendCollected = [&]() {
    missedReqAckContext_->endPoint_->SendMsgTo(
        *(indexReceiver_[askReqMsg.replicaid()]), missedReqMsg,
        MessageType::MISSED_REQ);
  };
  for (int idx = 0; idx < askReqMsg.missedreqkeys_size(); idx++) {
    const uint64_t reqKey = askReqMsg.missedreqkeys(idx);
    const uint32_t quId = reqKey % recordMap_.size();
    LogEntry* entry = recordMap_[quId].get(reqKey);
    if (entry) {
      RequestBodyToMessage(entry->body, missedReqMsg.add_reqs());
    }
    // Flush a full batch and start collecting the next one
    if ((uint32_t)(missedReqMsg.reqs_size()) >= requestTrasnferBatch_) {
      sendCollected();
      missedReqMsg.clear_reqs();
    }
  }
  if (missedReqMsg.reqs_size() > 0) {
    // This ack is useful because it really contains some missed requests,
    // so send it
    sendCollected();
  }
}
// Copies every field of the in-memory RequestBody `rb` into the protobuf
// message `rbMsg` (note that rb.opKey is stored in the `key` field).
void Replica::RequestBodyToMessage(const RequestBody& rb,
                                   RequestBodyMsg* rbMsg) {
  // Identification of the request
  rbMsg->set_reqkey(rb.reqKey);
  rbMsg->set_proxyid(rb.proxyId);
  rbMsg->set_deadline(rb.deadline);
  // The operation itself
  rbMsg->set_key(rb.opKey);
  rbMsg->set_iswrite(rb.isWrite);
  rbMsg->set_command(rb.command);
}
// Asks another replica for the indices recorded in missedIndices_.
// Nothing is sent when the recorded range is empty or when the previous ask
// happened less than 50 microseconds ago. The target replica is chosen
// round-robin, and the cursor is advanced so that it skips this replica.
void Replica::AskMissedIndex() {
  const bool emptyRange = missedIndices_.first > missedIndices_.second;
  if (emptyRange) {
    return;
  }
  const uint64_t now = GetMicrosecondTimestamp();
  if (now < lastAskMissedIndexTime_ + 50) {
    // Asked very recently, do not flood the other replicas
    return;
  }
  AskIndex askIndexMsg;
  askIndexMsg.set_replicaid(this->replicaId_);
  askIndexMsg.set_logidbegin(missedIndices_.first);
  askIndexMsg.set_logidend(missedIndices_.second);
  // Do not ask leader every time, rotate over the replicas to avoid
  // leader bottleneck
  indexRequester_->SendMsgTo(
      *(indexAskReceiver_[roundRobinIndexAskIdx_ % replicaNum_]), askIndexMsg,
      MessageType::MISSED_INDEX_ASK);
  // Move the cursor to the next replica, stepping over myself
  roundRobinIndexAskIdx_++;
  const bool pointsToMyself =
      (roundRobinIndexAskIdx_ % replicaNum_ == replicaId_);
  if (pointsToMyself) {
    roundRobinIndexAskIdx_++;
  }
  lastAskMissedIndexTime_ = GetMicrosecondTimestamp();
}
// Asks other replicas for the request bodies listed in missedReqKeys_.
//
// The keys are packed into AskReq messages of at most requestKeyTransferBatch_
// keys. Full batches are sent round-robin over the replicas (skipping this
// one); the remaining partial batch is sent to the replica at index
// viewId_ % replicaNum_. The ask time of every key is recorded in
// askTimebyReqKey_.
//
// Asks are rate limited to one round per 50 microseconds, based on the time of
// the previous *request* ask (lastAskMissedRequestTime_). The limiter
// previously read lastAskMissedIndexTime_, which is maintained by
// AskMissedIndex() and is unrelated to this function.
void Replica::AskMissedRequest() {
  if (missedReqKeys_.empty()) {
    // no need to start timer
    return;
  }
  uint64_t nowTime = GetMicrosecondTimestamp();
  if (lastAskMissedRequestTime_ + 50 > nowTime) {
    return;
  }
  AskReq askReqMsg;
  askReqMsg.set_replicaid(this->replicaId_);
  for (const uint64_t& reqKey : missedReqKeys_) {
    askReqMsg.add_missedreqkeys(reqKey);
    if ((uint32_t)(askReqMsg.missedreqkeys_size()) >=
        requestKeyTransferBatch_) {
      // A full batch: send it to the next replica in round-robin order
      reqRequester_->SendMsgTo(
          *(requestAskReceiver_[roundRobinRequestAskIdx_ % replicaNum_]),
          askReqMsg, MessageType::MISSED_REQ_ASK);
      roundRobinRequestAskIdx_++;
      if (roundRobinRequestAskIdx_ % replicaNum_ == replicaId_) {
        roundRobinRequestAskIdx_++;
      }
      askReqMsg.clear_missedreqkeys();
    }
    askTimebyReqKey_[reqKey] = GetMicrosecondTimestamp();
  }
  if (askReqMsg.missedreqkeys_size() > 0) {
    // The remaining partial batch goes to the replica at viewId_ % replicaNum_
    reqRequester_->SendMsgTo(*(requestAskReceiver_[viewId_ % replicaNum_]),
                             askReqMsg, MessageType::MISSED_REQ_ASK);
    roundRobinRequestAskIdx_++;
    if (roundRobinRequestAskIdx_ % replicaNum_ == replicaId_) {
      roundRobinRequestAskIdx_++;
    }
  }
  // missedReqKeys_ was not empty, so at least one ask went out in this round.
  // Record the time unconditionally so that the limiter also works when the
  // keys exactly filled the batches and no partial batch was left.
  lastAskMissedRequestTime_ = GetMicrosecondTimestamp();
}
// Background worker that keeps running the three reclaim steps in a row until
// the replica is TERMINATED.
void Replica::GarbageCollectThread() {
  activeWorkerNum_.fetch_add(1);
  for (;;) {
    if (status_ == ReplicaStatus::TERMINATED) {
      break;
    }
    BlockWhenStatusIsNot(ReplicaStatus::NORMAL);
    // Step 1: reclaim stale crashVector
    ReclaimStaleCrashVector();
    // Step 2: reclaim (unsynced) stale logs
    ReclaimStaleLogs();
    // Step 3: check LateBuffer and UnSyncedLog items and try to advance
    // prepareToClearLateBufferLogId_ and prepareToClearUnSyncedLogId_
    PrepareNextReclaim();
  }
  const uint32_t workersBefore = activeWorkerNum_.fetch_sub(1);
  LOG(INFO) << "GarbageCollectThread Terminated " << workersBefore - 1
            << " worker remaining";
}
void Replica::ReclaimStaleCrashVector() {
uint32_t masterCVVersion = crashVectorInUse_[0].load()->version_;
while (cvVersionToClear_ <= masterCVVersion) {
bool canDelete = true;
for (uint32_t i = 0; i < crashVectorVecSize_; i++) {
if (crashVectorInUse_[i].load()->version_ <= cvVersionToClear_) {
canDelete = false;
break;
}
}
if (canDelete) {
CrashVectorStruct* cvToClear = crashVector_.get(cvVersionToClear_);
crashVector_.erase(cvVersionToClear_);
delete cvToClear;
cvVersionToClear_++;
} else {
break;
}
}
}
// Computes the log ids up to which unsynced entries and late-buffer entries
// could be reclaimed. The reclaim steps themselves are not implemented yet, so
// the computed values are currently not used.
void Replica::ReclaimStaleLogs() {
  // The safe point for unsynced entries is the smallest value promised by any
  // shard (fast-reply shards plus one extra slot), capped by the prepared id
  uint32_t safePoint = prepareToClearUnSyncedLogId_;
  for (uint32_t shardIdx = 0; shardIdx < fastReplyQu_.size() + 1; shardIdx++) {
    const uint32_t shardPromise = safeToClearUnSyncedLogId_[shardIdx].load();
    if (shardPromise < safePoint) {
      safePoint = shardPromise;
    }
  }
  // Reclaim UnSynced Entries
  // Reclaim Entries in late-buffer
  safePoint = safeToClearLateBufferLogId_;
}
// Placeholder called by GarbageCollectThread after the reclaim steps.
// The body is empty, i.e. this step currently does nothing.
void Replica::PrepareNextReclaim() {}
// Periodic check run on the master endpoint.
// - TERMINATED: break the master endpoint's loop.
// - Leader: nothing to check.
// - Follower that is no longer NORMAL, or that has not heard from the leader
//   for more than replicaConfig_.heartbeatThresholdMs: initiate a view change
//   to viewId_ + 1.
void Replica::CheckHeartBeat() {
  if (status_ == ReplicaStatus::TERMINATED) {
    masterContext_->endPoint_->LoopBreak();
    return;
  }
  if (AmLeader()) {
    return;
  }
  if (status_ != ReplicaStatus::NORMAL) {
    // Some worker threads have detected viewchange and switch status_ to
    // VIEWCHANGE. But workers have no privilege to increment viewId_ and
    // initiate view change process, so the master will do that
    VLOG(2) << "InitiateViewChange-10";
    InitiateViewChange(viewId_ + 1);
    return;
  }
  // The threshold is configured in milliseconds, timestamps are microseconds
  const uint64_t now = GetMicrosecondTimestamp();
  const uint64_t silenceLimit = replicaConfig_.heartbeatThresholdMs * 1000;
  const bool leaderSilent = (lastHeartBeatTime_ + silenceLimit < now);
  if (leaderSilent) {
    // I haven't heard from the leader for too long, it probably has died
    // Before start view change, clear context
    VLOG(2) << "InitiateViewChange-1";
    InitiateViewChange(viewId_ + 1);
  }
}
// Dispatcher for all messages arriving on the master endpoint.
// The payload is parsed into the protobuf type that matches msgHdr->msgType
// and handed to the corresponding Process* method. A payload that fails to
// parse is dropped silently; an unknown message type is reported with a
// warning.
//
// @param msgHdr     header carrying the message type and the payload length
// @param msgBuffer  payload of msgHdr->msgLen bytes
void Replica::ReceiveMasterMessage(MessageHeader* msgHdr, char* msgBuffer) {
  VLOG(4) << "msgType " << (uint32_t)(msgHdr->msgType);
  if (msgHdr->msgType == MessageType::VIEWCHANGE_REQ) {
    ViewChangeRequest viewChangeReq;
    if (viewChangeReq.ParseFromArray(msgBuffer, msgHdr->msgLen)) {
      ProcessViewChangeReq(viewChangeReq);
    }
  } else if (msgHdr->msgType == MessageType::VIEWCHANGE_MSG) {
    ViewChange viewChangeMsg;
    if (viewChangeMsg.ParseFromArray(msgBuffer, msgHdr->msgLen)) {
      ProcessViewChange(viewChangeMsg);
    }
  } else if (msgHdr->msgType == MessageType::STATE_TRANSFER_REQUEST) {
    StateTransferRequest stateTransferReq;
    if (stateTransferReq.ParseFromArray(msgBuffer, msgHdr->msgLen)) {
      ProcessStateTransferRequest(stateTransferReq);
    }
  } else if (msgHdr->msgType == MessageType::STATE_TRANSFER_REPLY) {
    StateTransferReply stateTransferRep;
    if (stateTransferRep.ParseFromArray(msgBuffer, msgHdr->msgLen)) {
      ProcessStateTransferReply(stateTransferRep);
    }
  } else if (msgHdr->msgType == MessageType::START_VIEW) {
    StartView startView;
    if (startView.ParseFromArray(msgBuffer, msgHdr->msgLen)) {
      ProcessStartView(startView);
    }
  } else if (msgHdr->msgType == MessageType::CRASH_VECTOR_REQUEST) {
    CrashVectorRequest request;
    if (request.ParseFromArray(msgBuffer, msgHdr->msgLen)) {
      ProcessCrashVectorRequest(request);
    }
  } else if (msgHdr->msgType == MessageType::CRASH_VECTOR_REPLY) {
    CrashVectorReply reply;
    if (reply.ParseFromArray(msgBuffer, msgHdr->msgLen)) {
      VLOG(2) << "CrashVectorReply = " << reply.DebugString();
      ProcessCrashVectorReply(reply);
    }
  } else if (msgHdr->msgType == MessageType::RECOVERY_REQUEST) {
    RecoveryRequest request;
    if (request.ParseFromArray(msgBuffer, msgHdr->msgLen)) {
      ProcessRecoveryRequest(request);
    }
  } else if (msgHdr->msgType == MessageType::RECOVERY_REPLY) {
    RecoveryReply reply;
    if (reply.ParseFromArray(msgBuffer, msgHdr->msgLen)) {
      ProcessRecoveryReply(reply);
    }
  } else if (msgHdr->msgType == MessageType::SYNC_STATUS_REPORT) {
    SyncStatusReport report;
    if (report.ParseFromArray(msgBuffer, msgHdr->msgLen)) {
      ProcessSyncStatusReport(report);
    }
  } else if (msgHdr->msgType == MessageType::COMMIT_INSTRUCTION) {
    CommitInstruction commit;
    if (commit.ParseFromArray(msgBuffer, msgHdr->msgLen)) {
      ProcessCommitInstruction(commit);
    }
  } else {
    // Report the type from the header. The first payload byte is not the
    // message type (and does not even exist for an empty payload).
    LOG(WARNING) << "Unexpected message type " << (int)(msgHdr->msgType);
  }
}
void Replica::SendViewChangeRequest(const int toReplicaId) {
ViewChangeRequest viewChangeReq;
viewChangeReq.set_view(viewId_);
viewChangeReq.set_replicaid(replicaId_);
CrashVectorStruct* cv = crashVectorInUse_[0].load();
viewChangeReq.mutable_cv()->Add(cv->cv_.begin(), cv->cv_.end());
if (toReplicaId < 0) {
// send to all
for (uint32_t i = 0; i < replicaNum_; i++) {
if (i != replicaId_) {
// no need to send to myself
masterContext_->endPoint_->SendMsgTo(
*(masterReceiver_[i]), viewChangeReq, MessageType::VIEWCHANGE_REQ);
}
}
} else {
masterContext_->endPoint_->SendMsgTo(*(masterReceiver_[toReplicaId]),
viewChangeReq,
MessageType::VIEWCHANGE_REQ);
}
}
void Replica::SendViewChange() {
if (AmLeader()) {
// I am the leader of this new view, no need to send to myself
return;
}
ViewChange viewChangeMsg;
viewChangeMsg.set_view(viewId_);
viewChangeMsg.set_replicaid(replicaId_);
CrashVectorStruct* cv = crashVectorInUse_[0].load();
viewChangeMsg.mutable_cv()->Add(cv->cv_.begin(), cv->cv_.end());
viewChangeMsg.set_syncpoint(maxSyncedLogEntry_.load()->logId);
if (filteredUnSyncedEntries_.size() > 1) {
viewChangeMsg.set_unsynclogbegin(1);
viewChangeMsg.set_unsynclogend(filteredUnSyncedEntries_.size() - 1);
} else {
viewChangeMsg.set_unsynclogbegin(0);
viewChangeMsg.set_unsynclogend(0);
}
viewChangeMsg.set_lastnormalview(lastNormalView_);
masterContext_->endPoint_->SendMsgTo(
*(masterReceiver_[viewId_ % replicaNum_]), viewChangeMsg,
MessageType::VIEWCHANGE_MSG);
}
// Switches this replica into VIEWCHANGE status, targeting `view`.
//
// A target older than the current view is rejected with an error log, and the
// call is a no-op when the replica is already in VIEWCHANGE for `view`.
// Otherwise the function waits for all workers to stop, completes the synced
// log lookup maps, collects the unsynced entries that have not been synced
// into filteredUnSyncedEntries_, adopts `view` as viewId_ and finally
// registers viewChangeTimer_ on the master endpoint.
//
// @param view  the view to move to (must not be smaller than viewId_)
void Replica::InitiateViewChange(const uint32_t view) {
if (viewId_ > view) {
LOG(ERROR) << "Invalid view change initiation currentView=" << viewId_
<< "\ttargetView=" << view;
return;
}
if (viewId_ == view && status_ == ReplicaStatus::VIEWCHANGE) {
// Already in viewchange
return;
}
status_ = ReplicaStatus::VIEWCHANGE;
LOG(INFO) << "status =" << (int)status_ << "\t"
<< " view=" << viewId_ << "\t"
<< " targeting view=" << view;
// Wait until every worker stop (polling activeWorkerNum_ every 1ms)
while (activeWorkerNum_ > 0) {
usleep(1000);
}
/** Since the update of syncedLogEntryByReqKey_ and syncedLogEntryByLogId_
* may have not been completed when they encounter view change, let's first
* complete (flush) them */
// Start from the tracked entry with the smallest logId
// NOTE(review): trackedEntry_[0] is read without checking that trackedEntry_
// is non-empty -- confirm it always holds at least one element
LogEntry* minTrackedEntry = trackedEntry_[0];
for (uint32_t i = 0; i < trackedEntry_.size(); i++) {
if (minTrackedEntry->logId > trackedEntry_[i]->logId) {
minTrackedEntry = trackedEntry_[i];
}
}
// Follow the next pointers to the end of the list and register every entry
// that is not yet present in syncedLogEntryByLogId_ in both lookup maps
while (minTrackedEntry->next) {
LogEntry* next = minTrackedEntry->next;
if (syncedLogEntryByLogId_.get(next->logId) == NULL) {
syncedLogEntryByLogId_.assign(next->logId, next);
syncedLogEntryByReqKey_.assign(next->body.reqKey, next);
}
minTrackedEntry = next;
}
// All trackers now point to the last entry of the list
trackedEntry_.assign(trackedEntry_.size(), minTrackedEntry);
// NOTE(review): minUnSyncedLogEntry_ is dereferenced without a NULL check
LogEntry* entryStart = minUnSyncedLogEntry_;
if (entryStart->logId < CONCURRENT_MAP_START_INDEX) {
// This is dummy, move to its next;
entryStart = entryStart->next;
}
filteredUnSyncedEntries_.clear();
filteredUnSyncedEntries_.resize(
1); // Reserve 1 slot as dummy value [because 0 has special use]
// Keep only the unsynced entries whose reqKey is unknown to
// syncedLogEntryByReqKey_
while (entryStart) {
LogEntry* entry = syncedLogEntryByReqKey_.get(entryStart->body.reqKey);
if (!entry) {
// Has not been synced
filteredUnSyncedEntries_.push_back(entryStart);
}
entryStart = entryStart->next;
}
viewId_ = view;
// Unregister all timers, except the monitorTimer (so as the master thread
// can break when status=Terminated)
masterContext_->endPoint_->UnRegisterAllTimers();
masterContext_->endPoint_->RegisterTimer(masterContext_->monitorTimer_);
LOG(INFO) << "Monitor Timer Registered "
<< "viewId=" << viewId_ << "\t"
<< "maxSyncedLogId=" << maxSyncedLogEntry_.load()->logId << "\t"
<< "committedLogId=" << committedLogId_ << "\t"
<< "filteredUnSyncedEntries_.size()="
<< filteredUnSyncedEntries_.size() << "\t"
<< "currentTime=" << GetMicrosecondTimestamp() << "\t";
// Launch viewChange timer
masterContext_->endPoint_->RegisterTimer(viewChangeTimer_);
}
// While the replica is not back in NORMAL status, (re)sends the view change
// messages; once it is NORMAL again, viewChangeTimer_ is unregistered instead.
void Replica::BroadcastViewChange() {
  if (status_ != ReplicaStatus::NORMAL) {
    // Broadcast VIEW-CHANGE-REQ to all replicas
    SendViewChangeRequest(-1);
    // Send VIEW-CHANGE to the leader in this view
    SendViewChange();
    return;
  }
  // Back to normal: can stop the timer
  masterContext_->endPoint_->UnRegisterTimer(viewChangeTimer_);
}
void Replica::SendStartView(const int toReplicaId) {
StartView startView;
startView.set_replicaid(replicaId_);
startView.set_view(viewId_);
CrashVectorStruct* cv = crashVectorInUse_[0];
startView.mutable_cv()->Add(cv->cv_.begin(), cv->cv_.end());
// startView.set_syncedlogid(maxSyncedLogId_);
startView.set_syncedlogid(maxSyncedLogEntry_.load()->logId);
if (toReplicaId >= 0) {
// send to one
masterContext_->endPoint_->SendMsgTo(*(masterReceiver_[toReplicaId]),
startView, MessageType::START_VIEW);
} else {
// send to all
for (uint32_t i = 0; i < replicaNum_; i++) {
if (i == replicaId_) {
// No need to send to self
continue;
}
masterContext_->endPoint_->SendMsgTo(*(masterReceiver_[i]), startView,
MessageType::START_VIEW);
VLOG(2) << "Send StartView to " << i << "\t"
<< masterReceiver_[i]->GetIPAsString() << ":"
<< masterReceiver_[i]->GetPortAsInt();
}
}
}
void Replica::SendSyncStatusReport() {
SyncStatusReport report;
report.set_view(viewId_);
report.set_replicaid(replicaId_);
CrashVectorStruct* cv = crashVectorInUse_[0].load();
report.mutable_cv()->Add(cv->cv_.begin(), cv->cv_.end());
// report.set_syncedlogid(maxSyncedLogId_);
report.set_syncedlogid(maxSyncedLogEntry_.load()->logId);
if (AmLeader()) {
// leader directly process its own report
ProcessSyncStatusReport(report);
} else {
// send to leader
masterContext_->endPoint_->SendMsgTo(
*(masterReceiver_[viewId_ % replicaNum_]), report,
MessageType::SYNC_STATUS_REPORT);
}
}
void Replica::SendCommit() {
CommitInstruction commit;
commit.set_view(viewId_);
commit.set_replicaid(replicaId_);
CrashVectorStruct* cv = crashVectorInUse_[0].load();
commit.mutable_cv()->Add(cv->cv_.begin(), cv->cv_.end());
commit.set_committedlogid(committedLogId_);
// LOG(INFO) << "commit " << commit.DebugString();
for (uint32_t i = 0; i < replicaNum_; i++) {
if (i != replicaId_) {
masterContext_->endPoint_->SendMsgTo(*(masterReceiver_[i]), commit,
MessageType::COMMIT_INSTRUCTION);
}
}
}
// Handles a ViewChangeRequest from another replica.
// Requests are ignored while RECOVERING or when the crash vector check fails.
// A request for a newer view makes this replica initiate the view change;
// otherwise the sender is answered with StartView (when this replica is
// NORMAL) or this replica sends its own ViewChange message.
void Replica::ProcessViewChangeReq(const ViewChangeRequest& viewChangeReq) {
  if (status_ == ReplicaStatus::RECOVERING) {
    // Recovering replicas do not participate in view change
    return;
  }
  if (!CheckCV(viewChangeReq.replicaid(), viewChangeReq.cv())) {
    // stray message
    return;
  }
  if (Aggregated(viewChangeReq.cv())) {
    // If cv is updated, then it is likely that some messages in
    // viewChangeSet_ become stray, so remove them
    for (uint32_t rid = 0; rid < replicaNum_; rid++) {
      auto found = viewChangeSet_.find(rid);
      if (found == viewChangeSet_.end()) {
        continue;
      }
      if (CheckCV(rid, found->second.cv())) {
        continue;
      }
      viewChangeSet_.erase(rid);
    }
  }
  if (viewChangeReq.view() > viewId_) {
    VLOG(2) << "InitiateViewChange-2";
    InitiateViewChange(viewChangeReq.view());
    return;
  }
  // The request does not target a newer view
  if (status_ == ReplicaStatus::NORMAL) {
    SendStartView(viewChangeReq.replicaid());
  } else {
    SendViewChange();
  }
}
// Handles a ViewChange message from another replica.
//
// - RECOVERING replicas and messages failing the crash vector check: ignored.
// - NORMAL: a newer view triggers a view change here, otherwise the sender is
//   answered with StartView.
// - VIEWCHANGE: a newer view re-targets the view change; an older view is
//   answered with a ViewChangeRequest; a message for the current view is
//   collected in viewChangeSet_. Once replicaNum_ / 2 valid messages from
//   other replicas are collected, this replica adds its own ViewChange,
//   stops viewChangeTimer_ and starts TransferSyncedLog().
void Replica::ProcessViewChange(const ViewChange& viewChange) {
  if (status_ == ReplicaStatus::RECOVERING) {
    // Recovering replicas do not participate in view change
    return;
  }
  if (!CheckCV(viewChange.replicaid(), viewChange.cv())) {
    // stray message
    LOG(WARNING) << "Stray Message";
    return;
  }
  Aggregated(viewChange.cv());

  if (status_ == ReplicaStatus::NORMAL) {
    if (viewChange.view() > viewId_) {
      VLOG(2) << "InitiateViewChange-3";
      InitiateViewChange(viewChange.view());
    } else {
      // The sender lags behind
      SendStartView(viewChange.replicaid());
    }
    return;
  }
  if (status_ != ReplicaStatus::VIEWCHANGE) {
    LOG(WARNING) << "Unexpected Status " << status_;
    return;
  }

  // From here on this replica is in VIEWCHANGE status
  if (viewChange.view() > viewId_) {
    VLOG(2) << "InitiateViewChange-4";
    InitiateViewChange(viewChange.view());
    return;
  }
  if (viewChange.view() < viewId_) {
    SendViewChangeRequest(viewChange.replicaid());
    return;
  }
  // viewChange.view() == viewId_
  if (viewChangeSet_.size() >= replicaNum_ / 2 + 1) {
    // We have got enough valid viewchange messages, no need for this one
    return;
  }
  ASSERT(AmLeader());
  viewChangeSet_[viewChange.replicaid()] = viewChange;
  VLOG(3) << "viewChangeSet Size=" << viewChangeSet_.size();
  // If cv is updated, then it is likely that some messages in
  // viewChangeSet_ become stray, so remove them
  for (uint32_t rid = 0; rid < replicaNum_; rid++) {
    auto found = viewChangeSet_.find(rid);
    if (found != viewChangeSet_.end() && (!CheckCV(rid, found->second.cv()))) {
      viewChangeSet_.erase(rid);
    }
  }
  if (viewChangeSet_.size() < replicaNum_ / 2) {
    // Still waiting for more ViewChange messages
    return;
  }
  ASSERT(viewChangeSet_.find(replicaId_) == viewChangeSet_.end());
  // Got f viewChange
  // Plus myself, got f+1 viewChange messages
  ViewChange ownViewChange;
  CrashVectorStruct* masterCV = crashVectorInUse_[0].load();
  ownViewChange.mutable_cv()->Add(masterCV->cv_.begin(), masterCV->cv_.end());
  ownViewChange.set_view(viewId_);
  ownViewChange.set_replicaid(replicaId_);
  ownViewChange.set_syncpoint(maxSyncedLogEntry_.load()->logId);
  // Slot 0 of filteredUnSyncedEntries_ is a dummy: usable range is
  // [1, size - 1], and (0, 0) stands for "no unsynced entries"
  if (filteredUnSyncedEntries_.size() > 1) {
    ownViewChange.set_unsynclogbegin(1);
    ownViewChange.set_unsynclogend(filteredUnSyncedEntries_.size() - 1);
  } else {
    ownViewChange.set_unsynclogbegin(0);
    ownViewChange.set_unsynclogend(0);
  }
  ownViewChange.set_lastnormalview(lastNormalView_);
  viewChangeSet_[replicaId_] = ownViewChange;
  // Has got enough viewChange messages, stop viewChangeTimer
  masterContext_->endPoint_->UnRegisterTimer(viewChangeTimer_);
  TransferSyncedLog();
}
// First stage of building the new view's log: decides which synced
// entries have to be fetched and from which replica.
//
// Among the collected ViewChange messages, the replicas with the largest last
// normal view are considered, and the one reporting the largest sync point is
// chosen as the source. If something has to be fetched, the state transfer
// timer is started and TransferUnSyncedLog() runs as its completion callback;
// otherwise TransferUnSyncedLog() is called right away.
void Replica::TransferSyncedLog() {
  uint32_t largestNormalView = lastNormalView_;
  const uint32_t maxSyncedLogId = maxSyncedLogEntry_.load()->logId;
  uint32_t largestSyncPoint = maxSyncedLogId;
  uint32_t targetReplicaId = replicaId_;
  transferSyncedEntry_ = true;

  // Pass 1: the most recent normal view anyone has seen
  for (auto& kv : viewChangeSet_) {
    const auto& vc = kv.second;
    if (vc.lastnormalview() > largestNormalView) {
      largestNormalView = vc.lastnormalview();
    }
  }
  // Pass 2: among the replicas of that view, the one with the longest synced
  // log (if it is longer than mine)
  for (auto& kv : viewChangeSet_) {
    const auto& vc = kv.second;
    const bool fromLatestView = (vc.lastnormalview() == largestNormalView);
    if (fromLatestView && largestSyncPoint < vc.syncpoint()) {
      largestSyncPoint = vc.syncpoint();
      targetReplicaId = vc.replicaid();
    }
  }
  stateTransferIndices_.clear();
  VLOG(3) << "maxSyncedLogId_=" << maxSyncedLogId << "\t"
          << "largestSyncPoint=" << largestSyncPoint << "\t"
          << "largestNormalView = " << largestNormalView << "\t"
          << "lastNormalView_=" << lastNormalView_;
  // Directly copy the synced entries
  if (largestNormalView != lastNormalView_) {
    // My log comes from an older view: fetch everything after the committed
    // prefix
    stateTransferIndices_[targetReplicaId] = {committedLogId_ + 1,
                                              largestSyncPoint};
  } else if (maxSyncedLogId < largestSyncPoint) {
    // Same view, but the target has more synced entries: fetch only the tail
    stateTransferIndices_[targetReplicaId] = {maxSyncedLogId + 1,
                                              largestSyncPoint};
  }
  // Else: no need to do state transfer, because this replica has all synced
  // entries

  if (stateTransferIndices_.empty()) {
    // Directly go to the second stage: transfer unsynced log
    TransferUnSyncedLog();
    return;
  }
  // Start state transfer
  // After this state transfer has been completed, continue to execute the
  // callback (TransferUnSyncedLog)
  stateTransferCallback_ = std::bind(&Replica::TransferUnSyncedLog, this);
  stateTransferTerminateTime_ =
      GetMicrosecondTimestamp() + stateTransferTimeout_ * 1000;
  stateTransferTerminateCallback_ =
      std::bind(&Replica::RollbackToViewChange, this);
  LOG(INFO) << "Start state transfer targetReplica " << targetReplicaId
            << "\t"
            << "seg=" << stateTransferIndices_[targetReplicaId].first << "\t"
            << stateTransferIndices_[targetReplicaId].second;
  // Start the state transfer timer
  masterContext_->endPoint_->RegisterTimer(stateTransferTimer_);
}
// Second stage of building the new view's log: requests the unsynced
// entries reported in the collected ViewChange messages.
//
// Only replicas whose last normal view equals the largest one contribute. If
// nothing has to be fetched the replica enters the new view immediately;
// otherwise the state transfer timer is started, with MergeUnSyncedLog() as
// completion callback and RollbackToViewChange() as timeout callback.
void Replica::TransferUnSyncedLog() {
  transferSyncedEntry_ = false;
  uint32_t largestNormalView = lastNormalView_;
  for (auto& kv : viewChangeSet_) {
    const auto& vc = kv.second;
    if (vc.lastnormalview() > largestNormalView) {
      largestNormalView = vc.lastnormalview();
    }
  }
  VLOG(3) << "TransferUnSyncedLog largestNormalView=" << largestNormalView;

  stateTransferIndices_.clear();
  for (auto& kv : viewChangeSet_) {
    const auto& vc = kv.second;
    // Skip a replica when
    // (1) its last normal view is stale: its unsynced logs do not contribute
    //     to committed logs
    // (2) it is myself: no need to transfer log entries from self
    // (3) it reported (0, 0): it has no unsynced logs
    const bool skip =
        vc.lastnormalview() < largestNormalView || kv.first == replicaId_ ||
        (vc.unsynclogbegin() == 0 && vc.unsynclogend() == 0);
    if (skip) {
      continue;
    }
    // request transfer of the filteredUnSyncedRequests vec
    stateTransferIndices_[kv.first] = {vc.unsynclogbegin(),
                                       vc.unsynclogend()};
  }
  if (stateTransferIndices_.empty()) {
    // No need to do state transfer for unsynced logs
    // Directly go to new view
    EnterNewView();
    return;
  }
  // After this state transfer is completed, this replica will merge the
  // fetched entries and then enter the new view
  stateTransferCallback_ = std::bind(&Replica::MergeUnSyncedLog, this);
  stateTransferTerminateTime_ =
      GetMicrosecondTimestamp() + stateTransferTimeout_ * 1000;
  stateTransferTerminateCallback_ =
      std::bind(&Replica::RollbackToViewChange, this);
  masterContext_->endPoint_->RegisterTimer(stateTransferTimer_);
}
// Completion step of the unsynced-log transfer: merges the requests collected
// in requestsToMerge_ into the synced log and then enters the new view.
//
// A request is merged only if it was counted at least `quorum` times, where
// quorum = ceil(f / 2) + 1 and f = replicaNum_ / 2. A request that already
// exists in syncedLogEntryByReqKey_ is deleted instead of being merged twice.
void Replica::MergeUnSyncedLog() {
  int f = replicaNum_ / 2;
  int quorum = (f % 2 == 0) ? (f / 2 + 1) : (f / 2 + 2);
  for (auto& kv : requestsToMerge_) {
    uint64_t reqKey = kv.first.second;
    LogEntry* entry = kv.second.first;
    int count = kv.second.second;
    // NOTE(review): entries below the quorum are neither merged nor deleted
    // before requestsToMerge_ is cleared. If this map owns its entries (as the
    // delete below suggests), they leak -- confirm ownership before freeing.
    if (count >= quorum) {
      if (syncedLogEntryByReqKey_.get(reqKey)) {
        // at-most once
        delete entry;
        continue;
      }
      ProcessRequest(entry, true, false, true);
      syncedLogEntryByReqKey_.assign(reqKey, entry);
      syncedLogEntryByLogId_.assign(entry->logId, entry);
    }
  }
  requestsToMerge_.clear();
  EnterNewView();
}
void Replica::EnterNewView() {
LOG(INFO) << "Enter New View " << viewId_
<< " maxSyncedLog =" << maxSyncedLogEntry_.load()->logId << "\t"
<< GetMicrosecondTimestamp();
// Leader sends StartView to all the others
if (AmLeader()) {
SendStartView(-1);
} // Else: followers directly start
status_ = ReplicaStatus::NORMAL;
lastNormalView_.store(viewId_);
// Update crashVector, all synced with master
CrashVectorStruct* masterCV = crashVectorInUse_[0].load();
for (uint32_t i = 1; i < crashVectorVecSize_; i++) {
crashVectorInUse_[i] = masterCV;
}
crashVector_.assign(masterCV->version_, masterCV);
// More lightweight than CreateContext
ResetContext();
// Notify the blocking workers until all workers become active
while (activeWorkerNum_ < totalWorkerNum_) {
waitVar_.notify_all();
usleep(1000);
}
LOG(INFO) << "View=" << viewId_
<< " Recovered worker number:" << activeWorkerNum_;
}
void Replica::SendStateTransferRequest() {
if (GetMicrosecondTimestamp() >= stateTransferTerminateTime_) {
// If statetransfer cannot be completed within a certain amount of time,
// rollback to view change
masterContext_->endPoint_->UnRegisterTimer(stateTransferTimer_);
LOG(INFO)
<< "The state transfer takes too long, roll back to previous step ";
stateTransferTerminateCallback_();
return;
}
StateTransferRequest request;
request.set_view(viewId_);
request.set_issynced(transferSyncedEntry_);
request.set_replicaid(replicaId_);
for (auto& stateTransferInfo : stateTransferIndices_) {
// Do not request too many entries at one time, otherwise, UDP packet
// cannot handle that
uint32_t targetReplica = stateTransferInfo.first;
uint32_t logBegin = stateTransferInfo.second.first;
uint32_t logEnd = stateTransferInfo.second.second;
request.set_logbegin(logBegin);
if (logBegin + requestTrasnferBatch_ <= logEnd) {
request.set_logend(logBegin + requestTrasnferBatch_);
} else {
request.set_logend(logEnd);
}
VLOG(3) << "I am asking stateTransferRequest from " << targetReplica << "\t"
<< request.logbegin() << "\t" << request.logend() << "\t"
<< "\tisSynced=" << request.issynced();
masterContext_->endPoint_->SendMsgTo(*(masterReceiver_[targetReplica]),
request,
MessageType::STATE_TRANSFER_REQUEST);
}
}
void Replica::ProcessStateTransferRequest(
const StateTransferRequest& stateTransferRequest) {
VLOG(3) << "stateTransferRequest from Replica-"
<< stateTransferRequest.replicaid() << "\t||"
<< stateTransferRequest.logbegin() << "\t"
<< stateTransferRequest.logend() << "\tisSynced "
<< stateTransferRequest.issynced()
<< " view=" << stateTransferRequest.view();
if (stateTransferRequest.view() != viewId_) {
if (stateTransferRequest.view() > viewId_) {
VLOG(2) << "InitiateViewChange-5";
InitiateViewChange(stateTransferRequest.view());
}
return;
}
StateTransferReply reply;
CrashVectorStruct* cv = crashVectorInUse_[0].load();
const Address* requesterAddr =
masterReceiver_[stateTransferRequest.replicaid()];
reply.set_replicaid(replicaId_);
reply.set_view(viewId_);
reply.mutable_cv()->Add(cv->cv_.begin(), cv->cv_.end());
reply.set_issynced(stateTransferRequest.issynced());
if (reply.issynced()) {
reply.set_logbegin(stateTransferRequest.logbegin());
ASSERT(maxSyncedLogEntry_.load()->logId >= stateTransferRequest.logend());
for (uint32_t j = stateTransferRequest.logbegin();
j <= stateTransferRequest.logend(); j++) {
LogEntry* entry = syncedLogEntryByLogId_.get(j);
if (entry) {
RequestBodyToMessage(entry->body, reply.add_reqs());
reply.set_logend(j);
} else {
LOG(WARNING) << "Maybe just due to lag "
<< stateTransferRequest.logend() << ">" << reply.logend();
break;
}
}
VLOG(3) << "State Reply " << reply.logbegin() << "--" << reply.logend();
} else {
reply.set_logbegin(stateTransferRequest.logbegin());
reply.set_logend(stateTransferRequest.logend());
ASSERT(filteredUnSyncedEntries_.size() > reply.logend());
for (uint32_t j = reply.logbegin(); j <= reply.logend(); j++) {
LogEntry* entry = filteredUnSyncedEntries_[j];
ASSERT(entry != NULL);
RequestBodyToMessage(entry->body, reply.add_reqs());
}
VLOG(3) << "Give " << reply.logbegin() << "-" << reply.logend();
}
if (reply.reqs_size() > 0) {
masterContext_->endPoint_->SendMsgTo(*requesterAddr, reply,
MessageType::STATE_TRANSFER_REPLY);
}
}
void Replica::ProcessStateTransferReply(
const StateTransferReply& stateTransferReply) {
VLOG(3) << "Receive some state " << stateTransferReply.logbegin() << "--"
<< stateTransferReply.logend()
<< " view=" << stateTransferReply.view() << "--- "
<< transferSyncedEntry_ << "==" << stateTransferReply.issynced();
if (status_ == ReplicaStatus::NORMAL) {
// Normal replicas do not need state transfer
return;
}
if (!CheckCV(stateTransferReply.replicaid(), stateTransferReply.cv())) {
return;
} else {
Aggregated(stateTransferReply.cv());
}
if (!(masterContext_->endPoint_->isTimerRegistered(stateTransferTimer_))) {
// We are not doing state transfer, so ignore this message
return;
}
if (stateTransferReply.view() < viewId_) {
// Old view: ignore
return;
} else if (stateTransferReply.view() > viewId_) {
masterContext_->endPoint_->UnRegisterTimer(stateTransferTimer_);
if (status_ == ReplicaStatus::RECOVERING) {
// This state transfer is useless, stop it and restart recovery request
masterContext_->endPoint_->RegisterTimer(recoveryRequestTimer_);
} else if (status_ == ReplicaStatus::VIEWCHANGE) {
VLOG(2) << "InitiateViewChange-6";
InitiateViewChange(stateTransferReply.view());
} else {
LOG(ERROR) << "Unknown replica status " << (uint32_t)status_;
}
return;
}
// Else: Same view
if (transferSyncedEntry_ != stateTransferReply.issynced()) {
return;
}
const auto& iter = stateTransferIndices_.find(stateTransferReply.replicaid());
if (iter == stateTransferIndices_.end() ||
stateTransferReply.logend() < iter->second.first) {
// We do not need these log entries
return;
}
// So long as the state transfer is making progress, we should give it more
// time instead of early termination
// Only if the state transfer has not made progress within
// stateTransferTimeout_. then we terminate it and rollback to some previous
// function
stateTransferTerminateTime_ =
GetMicrosecondTimestamp() + +stateTransferTimeout_ * 1000;
SHA_HASH dummy;
if (stateTransferReply.issynced()) {
// This is the state-transfer for synced requests
for (uint32_t i = iter->second.first; i <= stateTransferReply.logend();
i++) {
const RequestBodyMsg& rbMsg =
stateTransferReply.reqs(i - iter->second.first);
LogEntry* entry = new LogEntry(
rbMsg.deadline(), rbMsg.reqkey(), rbMsg.key(), rbMsg.proxyid(),
rbMsg.command(), rbMsg.iswrite(), dummy, dummy);
ProcessRequest(entry, true, false, false);
// LOG(INFO) << "Processed " << entry->logId << "\t"
// << maxSyncedLogEntry_.load()->logId;
// Register
if (syncedLogEntryByReqKey_.get(entry->body.reqKey) == NULL) {
syncedLogEntryByReqKey_.assign(entry->body.reqKey, entry);
syncedLogEntryByLogId_.assign(entry->logId, entry);
if (entry->logId > CONCURRENT_MAP_START_INDEX) {
ASSERT(syncedLogEntryByLogId_.get(entry->logId - 1) != NULL);
ASSERT(syncedLogEntryByLogId_.get(entry->logId - 1) == entry->prev);
}
}
}
} else {
// This is the state-transfer for unsynced request (log merge)
for (int i = 0; i < stateTransferReply.reqs_size(); i++) {
const RequestBodyMsg& rbMsg = stateTransferReply.reqs(i);
std::pair key(rbMsg.deadline(), rbMsg.reqkey());
if (requestsToMerge_.find(key) != requestsToMerge_.end()) {
LogEntry* entry = new LogEntry(
rbMsg.deadline(), rbMsg.reqkey(), rbMsg.key(), rbMsg.proxyid(),
rbMsg.command(), rbMsg.iswrite(), dummy, dummy);
requestsToMerge_[key] = {entry, 1};
} else {
requestsToMerge_[key].second++;
}
}
}
iter->second.first = stateTransferReply.logend() + 1;
VLOG(2) << "Transfer Synced? " << stateTransferReply.issynced() << "\t"
<< " In Progress: " << iter->first << ":" << iter->second.first << "-"
<< iter->second.second;
uint32_t remainingPercent =
stateTransferIndicesRef_[stateTransferReply.replicaid()].second;
if (remainingPercent > 10) {
uint32_t previousGap =
stateTransferIndicesRef_[stateTransferReply.replicaid()].first;
uint32_t remainingGap = iter->second.second - iter->second.first;
if (remainingGap * 100 / previousGap < remainingPercent) {
LOG(INFO) << "State Tranfer from Replica "
<< stateTransferReply.replicaid() << "\t" << remainingPercent
<< "\% of progress (i.e., " << remainingGap
<< " logs) remaining\t"
<< "Current committedLogId_=" << committedLogId_
<< "\tmaxSyncedLogId=" << maxSyncedLogEntry_.load()->logId;
;
stateTransferIndicesRef_[stateTransferReply.replicaid()].second -= 10;
}
}
if (iter->second.first > iter->second.second) {
// We have completed the state transfer for this target replica
stateTransferIndices_.erase(iter->first);
}
if (stateTransferIndices_.empty()) {
// This state transfer is completed, unregister the timer
masterContext_->endPoint_->UnRegisterTimer(stateTransferTimer_);
stateTransferIndices_.clear();
stateTransferIndicesRef_.clear();
// If we have a callback, then call it
if (stateTransferCallback_) {
stateTransferCallback_();
}
}
}
void Replica::RewindSyncedLogTo(uint32_t rewindPoint) {
LOG(INFO) << "maxSyncedLogId_=" << maxSyncedLogEntry_.load()->logId << "\t"
<< "rewindPoint=" << rewindPoint;
LogEntry* entryStart = maxSyncedLogEntry_;
while (entryStart->logId > rewindPoint) {
LogEntry* entryToDel = entryStart;
if (entryToDel->prevNonCommutative) {
entryToDel->prevNonCommutative->nextNonCommutative = NULL;
}
if (entryToDel->prev) {
entryToDel->prev->next = NULL;
}
ASSERT(entryStart->prev != NULL);
syncedLogEntryByReqKey_.erase(entryToDel->body.reqKey);
syncedLogEntryByLogId_.erase(entryToDel->logId);
entryStart = entryStart->prev;
delete entryToDel;
}
entryStart->next = NULL;
entryStart->nextNonCommutative = NULL;
maxSyncedLogEntry_ = entryStart;
trackedEntry_.assign(trackedEntry_.size(), maxSyncedLogEntry_);
}
void Replica::ProcessStartView(const StartView& startView) {
VLOG(3) << startView.DebugString();
if (!CheckCV(startView.replicaid(), startView.cv())) {
return;
} else {
Aggregated(startView.cv());
}
if (status_ == ReplicaStatus::VIEWCHANGE) {
if (startView.view() > viewId_) {
VLOG(2) << "InitiateViewChange-7";
InitiateViewChange(startView.view());
} else if (startView.view() == viewId_) {
if (committedLogId_ < startView.syncedlogid()) {
// Start StateTransfer
if (masterContext_->endPoint_->isTimerRegistered(stateTransferTimer_)) {
// LOG(INFO) << "StateTransfer In Progress:"
// << stateTransferIndices_[startView.replicaid()].first
// << "--"
// << stateTransferIndices_[startView.replicaid()].second;
return;
}
RewindSyncedLogTo(committedLogId_);
stateTransferIndices_.clear();
stateTransferIndicesRef_[startView.replicaid()] = {committedLogId_ + 1,
100};
stateTransferIndices_[startView.replicaid()] = {
committedLogId_ + 1, startView.syncedlogid()};
stateTransferIndicesRef_[startView.replicaid()] = {committedLogId_ + 1,
100};
stateTransferCallback_ = std::bind(&Replica::EnterNewView, this);
stateTransferTerminateTime_ =
GetMicrosecondTimestamp() + stateTransferTimeout_ * 1000;
stateTransferTerminateCallback_ =
std::bind(&Replica::RollbackToViewChange, this);
transferSyncedEntry_ = true;
masterContext_->endPoint_->RegisterTimer(stateTransferTimer_);
} else {
RewindSyncedLogTo(committedLogId_);
EnterNewView();
}
} // else: startView.view() viewId_) {
VLOG(2) << "InitiateViewChange-8";
InitiateViewChange(startView.view());
} else if (startView.view() < viewId_) {
// My view is fresher
SendStartView(startView.replicaid());
}
// Else: We are in the same view and this replica is normal, no need
// startView
}
// If status == RECOVERING, it does not participate in view change
}
void Replica::BroadcastCrashVectorRequest() {
CrashVectorRequest request;
boost::uuids::random_generator generator;
boost::uuids::uuid uuid = generator();
nonce_ = boost::uuids::to_string(uuid);
request.set_nonce(nonce_);
request.set_replicaid(replicaId_);
crashVectorReplySet_.clear();
for (uint32_t i = 0; i < replicaNum_; i++) {
if (i == replicaId_) {
continue;
}
LOG(INFO) << "Ask CrashVector to Replica " << i;
masterContext_->endPoint_->SendMsgTo(*(masterReceiver_[i]), request,
MessageType::CRASH_VECTOR_REQUEST);
}
}
void Replica::BroadcastRecoveryRequest() {
RecoveryRequest request;
CrashVectorStruct* cv = crashVectorInUse_[0].load();
request.mutable_cv()->Add(cv->cv_.begin(), cv->cv_.end());
request.set_replicaid(replicaId_);
for (uint32_t i = 0; i < replicaNum_; i++) {
if (i == replicaId_) {
continue;
}
masterContext_->endPoint_->SendMsgTo(*(masterReceiver_[i]), request,
MessageType::RECOVERY_REQUEST);
}
}
void Replica::ProcessCrashVectorRequest(const CrashVectorRequest& request) {
if (status_ != ReplicaStatus::NORMAL) {
return;
}
CrashVectorReply reply;
reply.set_nonce(request.nonce());
reply.set_replicaid(replicaId_);
CrashVectorStruct* cv = crashVectorInUse_[0].load();
reply.mutable_cv()->Add(cv->cv_.begin(), cv->cv_.end());
masterContext_->endPoint_->SendMsgTo(*(masterReceiver_[request.replicaid()]),
reply, MessageType::CRASH_VECTOR_REPLY);
}
void Replica::ProcessCrashVectorReply(const CrashVectorReply& reply) {
if (status_ != ReplicaStatus::RECOVERING) {
LOG(INFO) << "nolong Recovering " << status_;
return;
}
if (nonce_ != reply.nonce()) {
LOG(INFO) << "nonce inconistent " << crashVectorReplySet_.size();
return;
}
if (masterContext_->endPoint_->isTimerRegistered(crashVectorRequestTimer_) ==
false) {
// We no longer request crash vectors
LOG(INFO) << "no longer register crashVectorRequest "
<< crashVectorReplySet_.size();
return;
}
crashVectorReplySet_[reply.replicaid()] = reply;
if (crashVectorReplySet_.size() >= replicaNum_ / 2 + 1) {
// Got enough quorum
CrashVectorStruct* oldCV = crashVectorInUse_[0].load();
CrashVectorStruct* newCV = new CrashVectorStruct(*oldCV);
newCV->version_++;
for (const auto& kv : crashVectorReplySet_) {
for (uint32_t i = 0; i < replicaNum_; i++) {
if (kv.second.cv(i) > newCV->cv_[i]) {
newCV->cv_[i] = kv.second.cv(i);
}
}
}
// Increment self counter
newCV->cv_[replicaId_]++;
crashVector_.assign(newCV->version_, newCV);
for (uint32_t i = 0; i < crashVectorVecSize_; i++) {
crashVectorInUse_[i] = newCV;
}
masterContext_->endPoint_->UnRegisterTimer(crashVectorRequestTimer_);
crashVectorReplySet_.clear();
// Start Recovery Request
masterContext_->endPoint_->RegisterTimer(recoveryRequestTimer_);
}
}
void Replica::ProcessRecoveryRequest(const RecoveryRequest& request) {
if (status_ != ReplicaStatus::NORMAL) {
return;
}
if (!CheckCV(request.replicaid(), request.cv())) {
return;
} else {
Aggregated(request.cv());
}
RecoveryReply reply;
CrashVectorStruct* cv = crashVectorInUse_[0].load();
reply.set_replicaid(replicaId_);
reply.set_view(viewId_);
reply.mutable_cv()->Add(cv->cv_.begin(), cv->cv_.end());
reply.set_syncedlogid(maxSyncedLogEntry_.load()->logId);
masterContext_->endPoint_->SendMsgTo(*(masterReceiver_[request.replicaid()]),
reply, MessageType::RECOVERY_REPLY);
}
// Collect RecoveryReply messages. Once replicaNum_ / 2 + 1 replies are stored,
// adopt the highest view among them and either
//   - restart the recovery requests, if this replica would be the leader of
//     that view, or
//   - fetch the synced log entries [CONCURRENT_MAP_START_INDEX, syncedLogId]
//     from that view's leader and then enter the view, or
//   - enter the view directly when there is nothing to fetch.
// syncedLogId is the value reported by the (first) reply carrying the highest
// view.
void Replica::ProcessRecoveryReply(const RecoveryReply& reply) {
  if (!CheckCV(reply.replicaid(), reply.cv())) {
    return;
  } else {
    if (Aggregated(reply.cv())) {
      // If cv is updated, then it is likely that some messages in
      // recoveryReplySet_ become stray, so remove them
      for (uint32_t i = 0; i < replicaNum_; i++) {
        auto iter = recoveryReplySet_.find(i);
        if (iter != recoveryReplySet_.end() &&
            (!CheckCV(i, iter->second.cv()))) {
          recoveryReplySet_.erase(i);
        }
      }
    }
  }

  if (masterContext_->endPoint_->isTimerRegistered(recoveryRequestTimer_) ==
      false) {
    // We no longer request recovery reply
    return;
  }

  recoveryReplySet_[reply.replicaid()] = reply;

  if (recoveryReplySet_.size() >= replicaNum_ / 2 + 1) {
    // Got enough quorum
    masterContext_->endPoint_->UnRegisterTimer(recoveryRequestTimer_);

    uint32_t maxView = 0;
    uint32_t syncedLogId = 0;
    // View 0 is a valid view, so it cannot double as the "nothing seen yet"
    // marker: always take the first reply, then only strictly newer views.
    // Otherwise syncedLogId would stay 0 when every reply is in view 0 and the
    // log transfer below would be skipped.
    bool seenReply = false;
    for (const auto& kv : recoveryReplySet_) {
      if (!seenReply || kv.second.view() > maxView) {
        maxView = kv.second.view();
        syncedLogId = kv.second.syncedlogid();
        seenReply = true;
      }
    }

    // Get the maxView, launch state transfer with the corresponding leader
    viewId_ = maxView;
    recoveryReplySet_.clear();
    LOG(INFO) << "Replica intends to enter View " << viewId_
              << " after recovery; the number of logs to recover is:"
              << syncedLogId;

    if (AmLeader()) {
      LOG(INFO) << "The recovered replica will become the leader in this view, "
                   "skip it!";
      // If the recoverying replica happens to be the leader of the new view,
      // don't participate. Wait until the healthy replicas elect a new leader
      usleep(1000);  // sleep some time and restart the recovery process
      masterContext_->endPoint_->RegisterTimer(recoveryRequestTimer_);
    } else {
      // Launch state transfer for synced log entries
      stateTransferIndices_.clear();
      if (syncedLogId >= CONCURRENT_MAP_START_INDEX) {
        // There are some synced log entries that should be transferred
        transferSyncedEntry_ = true;
        stateTransferIndices_[maxView % replicaNum_] = {
            CONCURRENT_MAP_START_INDEX, syncedLogId};
        stateTransferIndicesRef_[maxView % replicaNum_] = {
            syncedLogId - CONCURRENT_MAP_START_INDEX + 1, 100};
        LOG(INFO) << "Recover Logs from " << CONCURRENT_MAP_START_INDEX
                  << "\t to\t" << syncedLogId;
        stateTransferCallback_ = std::bind(&Replica::EnterNewView, this);
        stateTransferTerminateTime_ =
            GetMicrosecondTimestamp() + stateTransferTimeout_ * 1000;
        stateTransferTerminateCallback_ =
            std::bind(&Replica::RollbackToRecovery, this);
        masterContext_->endPoint_->RegisterTimer(stateTransferTimer_);
      } else {
        // No log entries to recover, directly enter new view
        EnterNewView();
      }
    }
  }
}
// Record a SyncStatusReport from a replica. When reports from
// replicaNum_ / 2 + 1 replicas are stored, the smallest reported synced log id
// becomes the new committedLogId_ (if it is not behind the current one) and a
// commit is sent out.
//
// Reports failing the crash-vector check or the view check are dropped. For
// each replica only the report with the largest syncedlogid is kept.
void Replica::ProcessSyncStatusReport(const SyncStatusReport& report) {
  if (!CheckCV(report.replicaid(), report.cv())) {
    // Stray message
    return;
  } else {
    if (Aggregated(report.cv())) {
      // The crash vector was updated, so previously stored reports may have
      // become stray. Check the report of EVERY replica (the bound must be
      // replicaNum_, not this replica's own id).
      for (uint32_t i = 0; i < replicaNum_; i++) {
        auto iter = syncStatusSet_.find(i);
        if (iter != syncStatusSet_.end() && (!CheckCV(i, iter->second.cv()))) {
          syncStatusSet_.erase(i);
        }
      }
    }
  }

  if (!CheckView(report.view())) {
    return;
  }

  auto iter = syncStatusSet_.find(report.replicaid());
  if (iter == syncStatusSet_.end() ||
      iter->second.syncedlogid() < report.syncedlogid()) {
    syncStatusSet_[report.replicaid()] = report;
  }

  if (syncStatusSet_.size() >= replicaNum_ / 2 + 1) {
    // Smallest synced log id among the stored reports
    uint32_t minLogId = UINT32_MAX;
    for (const auto& kv : syncStatusSet_) {
      if (minLogId > kv.second.syncedlogid()) {
        minLogId = kv.second.syncedlogid();
      }
    }
    if (minLogId >= committedLogId_) {
      committedLogId_ = minLogId;
      SendCommit();
    }
  }
}
// Handle a CommitInstruction: refresh the heartbeat time, remember the
// commit point announced in the message, and execute synced log entries in
// order until committedLogId_ reaches min(toCommitLogId_, latest synced id).
// Messages failing the crash-vector check or the view check are dropped.
void Replica::ProcessCommitInstruction(const CommitInstruction& commit) {
  if (!CheckCV(commit.replicaid(), commit.cv())) {
    return;
  } else {
    Aggregated(commit.cv());
  }
  if (!CheckView(commit.view())) {
    return;
  }
  // A valid commit instruction counts as a heartbeat
  lastHeartBeatTime_ = GetMicrosecondTimestamp();
  // LOG(INFO) << "commit " << commit.DebugString();
  // Buggy: should compare with syncedLogId, to see whether log is missing
  if (commit.committedlogid() > committedLogId_) {
    // Don't assign committedLogId_ directly, because this replica may have
    // not get enough synced logs
    toCommitLogId_ = commit.committedlogid();
    // LOG(INFO) << "committedLogId_=" << committedLogId_;
  }
  // Never commit beyond what has been synced locally
  uint32_t nextCommitId = maxSyncedLogEntry_.load()->logId;
  if (toCommitLogId_ < nextCommitId) {
    nextCommitId = toCommitLogId_;
  }
  while (committedLogId_ < nextCommitId) {
    if (committedLogId_ < CONCURRENT_MAP_START_INDEX) {
      // Ids below CONCURRENT_MAP_START_INDEX are skipped without a lookup
      committedLogId_++;
      continue;
    }
    // The tracked log id is read BEFORE the map lookup and the pre-read value
    // is the one used in the check below.
    // NOTE(review): presumably this ordering guards against the tracked entry
    // advancing concurrently between the lookup and the check -- confirm.
    uint32_t preFetchTrackedLogId = trackedEntry_[0]->logId;
    LogEntry* entry = syncedLogEntryByLogId_.get(committedLogId_);
    if (entry == NULL) {
      if (committedLogId_ <= preFetchTrackedLogId) {
        // The entry is missing from the map although the tracked id has
        // already reached it: dump every unrecorded id and terminate
        LOG(INFO) << "committedLogId_=" << committedLogId_ << "\t"
                  << "maxSyncedLogId_=" << maxSyncedLogEntry_.load()->logId
                  << "\ttrackedLogId =" << preFetchTrackedLogId;
        for (uint32_t i = CONCURRENT_MAP_START_INDEX;
             i <= trackedEntry_[0]->logId; i++) {
          if (syncedLogEntryByLogId_.get(i) == NULL) {
            LOG(INFO) << "log " << i << " not recorded";
          }
        }
        LOG(ERROR) << "abnormal exit";
        // NOTE(review): the process exits with status 0 on this error path
        exit(0);
      }
      if (viewId_ == 1) {
        LOG(INFO) << "committedLogId_=" << committedLogId_ << "\t"
                  << "maxSyncedLogId_=" << maxSyncedLogEntry_.load()->logId
                  << "\t"
                  << "\ttrackedLogId =" << trackedEntry_[0]->logId;
      }
      // Entry not recorded yet: stop here and retry on a later instruction
      break;
    }
    ASSERT(entry != NULL);
    entry->result = ApplicationExecute(entry->body);
    committedLogId_++;
    // if (committedLogId_ % 1000 == 0) {
    //   LOG(INFO) << "committedLogId_=" << committedLogId_ << "\t"
    //             << "maxSyncedLogId_=" << maxSyncedLogEntry_.load()->logId;
    // }
  }
}
// Compare the view of an incoming message with viewId_.
// Returns true only when the two are equal. An older view is simply rejected.
// For a newer view the message is rejected as well and, in addition:
//   - with isMaster set, a view change is initiated unless this replica is
//     RECOVERING;
//   - otherwise status_ is switched to VIEWCHANGE and the master thread is
//     left to handle the situation.
bool Replica::CheckView(const uint32_t view, const bool isMaster) {
  if (view < viewId_) {
    // old message
    return false;
  }
  if (!(view > viewId_)) {
    // same view
    return true;
  }

  // The message comes from a newer view
  if (!isMaster) {
    status_ = ReplicaStatus::VIEWCHANGE;
    return false;
  }
  if (status_ != ReplicaStatus::RECOVERING) {
    // Recovering replicas do not participate in view change
    VLOG(2) << "InitiateViewChange-9: " << view
            << "\t currentView=" << viewId_ << "\t"
            << "td=" << pthread_self();
    InitiateViewChange(view);
  }
  return false;
}
// Returns true when the sender's own counter in the received crash vector is
// at least as large as the one recorded for that sender in the master's
// current crash vector.
bool Replica::CheckCV(const uint32_t senderId,
                      const google::protobuf::RepeatedField& cv) {
  CrashVectorStruct* const localCV = crashVectorInUse_[0].load();
  const bool senderIsFresh = !(cv.at(senderId) < localCV->cv_[senderId]);
  return senderIsFresh;
}
// Merge the received crash vector into the master's one (element-wise max).
// Returns true when at least one element was raised; in that case a new
// CrashVectorStruct with the next version is installed in slot 0 and, while
// the replica is NORMAL, the call waits until slots 1..fastReplyQu_.size()
// have caught up with that version.
bool Replica::Aggregated(const google::protobuf::RepeatedField& cv) {
  CrashVectorStruct* const currentCV = crashVectorInUse_[0].load();
  std::vector merged(currentCV->cv_);

  bool raised = false;
  for (uint32_t idx = 0; idx < replicaNum_; idx++) {
    if (merged[idx] < cv.at(idx)) {
      // The incoming vector is fresher for this replica
      merged[idx] = cv.at(idx);
      raised = true;
    }
  }
  if (!raised) {
    return false;
  }

  CrashVectorStruct* const freshCV =
      new CrashVectorStruct(merged, currentCV->version_ + 1);
  crashVector_.assign(freshCV->version_, freshCV);
  crashVectorInUse_[0] = freshCV;

  // In other statuses only the master thread is alive, so there is nobody to
  // wait for
  if (status_ == ReplicaStatus::NORMAL) {
    bool everyoneCaughtUp;
    do {
      everyoneCaughtUp = true;
      for (uint32_t slot = 1; slot <= fastReplyQu_.size(); slot++) {
        if (crashVectorInUse_[slot].load()->version_ < freshCV->version_) {
          everyoneCaughtUp = false;
        }
      }
      if (!everyoneCaughtUp) {
        usleep(1000);
      }
    } while (!everyoneCaughtUp);
  }
  return true;
}
// Fall back to the view-change phase: set the status to VIEWCHANGE, discard
// the collected view-change messages and make sure the view-change timer is
// registered.
void Replica::RollbackToViewChange() {
  LOG(INFO) << "Rollback to restart view change";
  status_ = ReplicaStatus::VIEWCHANGE;
  viewChangeSet_.clear();

  const bool timerRunning =
      masterContext_->endPoint_->isTimerRegistered(viewChangeTimer_);
  if (!timerRunning) {
    masterContext_->endPoint_->RegisterTimer(viewChangeTimer_);
  }
}
void Replica::RollbackToRecovery() {
LOG(INFO) << "Rollback to restart recovery";
status_ = ReplicaStatus::RECOVERING;
recoveryReplySet_.clear();
// Since we start a new round of recovery, the logs obtained from the
// previous round (if any) will not count. Delete them (=clean state) and
// restart
LogEntry* entryStart = syncedLogEntryHead_->next;
while (entryStart) {
LogEntry* entryToDel = entryStart;
entryStart = entryStart->next;
delete entryToDel;
}
maxSyncedLogEntry_ = syncedLogEntryHead_;
maxSyncedLogEntryByKey_.assign(keyNum_, NULL);
if (false ==
masterContext_->endPoint_->isTimerRegistered(recoveryRequestTimer_)) {
masterContext_->endPoint_->RegisterTimer(recoveryRequestTimer_);
}
}
// Placeholder for the application logic: the request is not inspected and an
// empty result is returned.
std::string Replica::ApplicationExecute(const RequestBody& request) {
  return std::string();
}
// True when this replica is the leader of the current view, i.e. when
// viewId_ modulo replicaNum_ equals replicaId_.
bool Replica::AmLeader() {
  const bool leadsCurrentView = (viewId_ % replicaNum_ == replicaId_);
  return leadsCurrentView;
}
} // namespace nezha
================================================
FILE: replica/replica.h
================================================
#ifndef NEZHA_REPLICA_H
#define NEZHA_REPLICA_H
#include
#include
#include
#include
#include
#include
#include "lib/utils.h"
#include "proto/nezha_proto.pb.h"
#include "replica_config.h"
namespace nezha {
using namespace nezha::proto;
/** Receiver is more complex than sender. A sender only needs an endpoint.
 * But a receiver needs an endpoint (endPoint_) to receive messages, and each
 * message should be handled by an already-registered handler (msgHandlerFunc_).
 * Besides, in order to unblock the endpoint during view change, a timer
 * (monitorTimer_) is also needed to keep monitoring the status of the replica.
 *
 * We package all the necessary components into ReceiverContext for brevity.
 */
struct ReceiverContext {
Endpoint* endPoint_;
void* context_;
MessageHandlerFunc msgHandlerFunc_;
Timer* monitorTimer_;
ReceiverContext(Endpoint* ep = NULL, void* ctx = NULL,
MessageHandlerFunc msgFunc = nullptr, Timer* t = NULL)
: endPoint_(ep),
context_(ctx),
msgHandlerFunc_(msgFunc),
monitorTimer_(t) {}
void Register(int endpointType = EndpointType::UDP_ENDPOINT) {
if (endpointType == EndpointType::UDP_ENDPOINT) {
// UDP Endpoint
UDPMsgHandler* udpMsgHandler =
new UDPMsgHandler(msgHandlerFunc_, context_);
((UDPSocketEndpoint*)endPoint_)->RegisterMsgHandler(udpMsgHandler);
((UDPSocketEndpoint*)endPoint_)->RegisterTimer(monitorTimer_);
} else {
// To support other types of endpoints later
LOG(ERROR) << "unknown endpoint type " << (int)endpointType;
}
}
};
/**
* Refer to replica_run.cc, the runnable program only needs to instantiate a
* Replica object with a configuration file. Then it calls Run() method to run
* and calls Terminate() method to stop
*/
class Replica {
private:
/** All the configuration parameters for the replica are included in
* replicaConfig_*/
ReplicaConfig replicaConfig_;
/** 1 for UDP, 2 for GRPC (not supported yet) */
int endPointType_;
/** viewId_ starts from 0 */
std::atomic viewId_;
std::atomic lastNormalView_;
/** replicaId_ starts from 0 */
std::atomic replicaId_;
std::atomic