Repository: Steamgjk/Nezha Branch: main Commit: 8db31f04af48 Files: 78 Total size: 416.5 KB Directory structure: gitextract_rknzum1x/ ├── .github/ │ └── workflows/ │ └── build.yaml ├── .gitignore ├── .vscode/ │ └── settings.json ├── README.md ├── WORKSPACE ├── client/ │ ├── BUILD │ ├── client.cc │ ├── client.h │ ├── client_config.h │ └── client_run.cc ├── configs/ │ ├── dist/ │ │ ├── nezha-client-config.yaml │ │ ├── nezha-proxy-config.yaml │ │ ├── nezha-replica-config-0.yaml │ │ ├── nezha-replica-config-1.yaml │ │ ├── nezha-replica-config-2.yaml │ │ └── nezha-replica-config.yaml │ ├── local/ │ │ ├── nezha-client-config.yaml │ │ ├── nezha-proxy-config.yaml │ │ ├── nezha-replica-config-0.yaml │ │ ├── nezha-replica-config-1.yaml │ │ └── nezha-replica-config-2.yaml │ ├── nezha-client-config-template.yaml │ ├── nezha-proxy-config-template.yaml │ └── nezha-replica-config-template.yaml ├── docs/ │ ├── Nezha.tla │ ├── demo.md │ └── tla-intro.md ├── external/ │ ├── gogoprotobuf.BUILD │ └── googleapi.BUILD ├── lib/ │ ├── BUILD │ ├── Rules.mk │ ├── address.cc │ ├── address.h │ ├── common_struct.h │ ├── common_type.h │ ├── endpoint.cc │ ├── endpoint.h │ ├── message_handler.h │ ├── message_type.cc │ ├── message_type.h │ ├── timer.h │ ├── udp_socket_endpoint.cc │ ├── udp_socket_endpoint.h │ ├── utils.cc │ ├── utils.h │ └── zipfian.h ├── license.md ├── micro-bench/ │ ├── BUILD │ ├── analysis.cc │ ├── bench_receiver.cc │ ├── bench_sender.cc │ └── launch_micro.py ├── proto/ │ ├── BUILD │ └── nezha_proto.proto ├── proxy/ │ ├── BUILD │ ├── proxy.cc │ ├── proxy.h │ ├── proxy_config.h │ └── proxy_run.cc ├── replica/ │ ├── BUILD │ ├── replica.cc │ ├── replica.h │ ├── replica_config.h │ └── replica_run.cc ├── scripts/ │ ├── analysis.py │ ├── launch.py │ ├── local_test.sh │ └── ttcs-agent.cfg ├── third_party/ │ ├── concurrentqueue/ │ │ └── BUILD.bazel │ ├── glog/ │ │ ├── BUILD.bazel │ │ ├── BUILD.glog │ │ └── glog.bzl │ ├── junction/ │ │ ├── BUILD.bazel │ │ └── junction.patch │ ├── libev/ 
│ │ └── BUILD.bazel │ ├── openssl/ │ │ └── BUILD.bazel │ └── turf/ │ └── BUILD.bazel └── ttcs-agent.cfg ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/workflows/build.yaml ================================================ # Bazel action to build & test specific targets. name: Bazel build on: push: branches: [main] pull_request: branches: [main] jobs: build: name: Bazel build and run local test runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v3 - name: Setup Bazel run: | sudo apt install -y apt-transport-https curl gnupg curl -fsSL https://bazel.build/bazel-release.pub.gpg | gpg --dearmor >bazel-archive-keyring.gpg sudo mv bazel-archive-keyring.gpg /usr/share/keyrings echo "deb [arch=amd64 signed-by=/usr/share/keyrings/bazel-archive-keyring.gpg] https://storage.googleapis.com/bazel-apt stable jdk1.8" | sudo tee /etc/apt/sources.list.d/bazel.list sudo apt update sudo apt install -y bazel-5.2.0 sudo mv /usr/bin/bazel-5.2.0 /usr/bin/bazel - name: Build run: | bazel build //replica/... //proxy/... //client/... 
- name: Run local test run: ./scripts/local_test.sh --github ================================================ FILE: .gitignore ================================================ /.obj /.bin /bazel-* ================================================ FILE: .vscode/settings.json ================================================ { "C_Cpp.formatting": "clangFormat", "C_Cpp.clang_format_fallbackStyle": "{BasedOnStyle: Google, IncludeBlocks: Preserve, DerivePointerAlignment: false, PointerAlignment: Left}", "editor.formatOnSave": true, "files.associations": { "*.inc": "cpp", "cctype": "cpp", "clocale": "cpp", "cmath": "cpp", "cstdarg": "cpp", "cstddef": "cpp", "cstdio": "cpp", "cstdlib": "cpp", "cstring": "cpp", "ctime": "cpp", "cwchar": "cpp", "cwctype": "cpp", "array": "cpp", "atomic": "cpp", "bit": "cpp", "*.tcc": "cpp", "bitset": "cpp", "chrono": "cpp", "cinttypes": "cpp", "condition_variable": "cpp", "cstdint": "cpp", "deque": "cpp", "list": "cpp", "map": "cpp", "set": "cpp", "unordered_map": "cpp", "unordered_set": "cpp", "vector": "cpp", "exception": "cpp", "algorithm": "cpp", "functional": "cpp", "iterator": "cpp", "memory": "cpp", "memory_resource": "cpp", "numeric": "cpp", "optional": "cpp", "random": "cpp", "ratio": "cpp", "regex": "cpp", "string": "cpp", "string_view": "cpp", "system_error": "cpp", "tuple": "cpp", "type_traits": "cpp", "utility": "cpp", "fstream": "cpp", "initializer_list": "cpp", "iomanip": "cpp", "iosfwd": "cpp", "iostream": "cpp", "istream": "cpp", "limits": "cpp", "mutex": "cpp", "new": "cpp", "ostream": "cpp", "shared_mutex": "cpp", "sstream": "cpp", "stdexcept": "cpp", "streambuf": "cpp", "thread": "cpp", "typeinfo": "cpp", "csignal": "cpp", "any": "cpp", "cfenv": "cpp", "forward_list": "cpp", "future": "cpp", "scoped_allocator": "cpp", "typeindex": "cpp", "valarray": "cpp", "variant": "cpp", "hash_map": "cpp", "hash_set": "cpp", "*.ipp": "cpp", "csetjmp": "cpp", "strstream": "cpp", "charconv": "cpp", "codecvt": "cpp", "complex": "cpp", 
"source_location": "cpp", "rope": "cpp", "slist": "cpp" } } ================================================ FILE: README.md ================================================ # Nezha ---- Nezha (哪吒) is a legendary figure in Chinese mythology. Nezha has 3 heads and 6 arms, so he/she achieves much better fault tolerance than ordinary people :) PS: We have created [[an FAQ page](https://github.com/Steamgjk/Nezha/wiki)]. Please take a look for a better understanding of Nezha. ## Paper and Presentation Nezha: Deployable and High-Performance Consensus Using Synchronized Clocks (VLDB version) [[pdf](https://www.vldb.org/pvldb/vol16/p629-geng.pdf)] Nezha: Deployable and High-Performance Consensus Using Synchronized Clocks(Technical Report) [[pdf](docs/Nezha-technical-report.pdf)] An early presentation of Nezha was made at [Stanford Platform Lab Winter Review 2022](https://platformlab.stanford.edu/winter-review/platform-lab-winter-review-2022/) [[slides](https://platformlab.stanford.edu/wp-content/uploads/2022/03/Jinkun-Geng.pdf)] If you find our work helpful to your research or project, we would very appreciate it if you could **add a star** to our repo and/or **cite our papers**. The bibs for the papers are as below. 
``` @article{vldb23-nezha, author = {Geng, Jinkun and Sivaraman, Anirudh and Prabhakar, Balaji and Rosenblum, Mendel}, title = {Nezha: Deployable and High-Performance Consensus Using Synchronized Clocks}, year = {2023}, journal = {Proceedings of the VLDB Endowment}, url = {https://www.vldb.org/pvldb/vol16/p629-geng.pdf}, publisher = {VLDB Endowment}, issn = {2150-8097}, volume = {16}, pages = {629-642}, numpages = {14} } @misc{nezha-tech, author = {Geng, Jinkun and Sivaraman, Anirudh and Prabhakar, Balaji and Rosenblum, Mendel}, title = {Nezha: Deployable and High-Performance Consensus Using Synchronized Clocks}, doi = {10.48550/ARXIV.2206.03285}, url = {https://arxiv.org/abs/2206.03285}, publisher = {arXiv}, year = {2022}, } ``` ## Clone Project ``` git clone --depth=1 https://github.com/Steamgjk/Nezha.git ``` ## File Structure The core part includes three modules (folders), i.e., - replica - proxy - client Each module is composed of three files: - a header file (e.g., replica.h), - a source implementation file (replica.cc), - a launching file (e.g., replica_run.cc). Each process reads an independent yaml file (e.g., nezha-replica-config-0.yaml) to get its full configuration, the sample configuration files are placed in the configs folder ## Install Bazel We use Bazel 5.2.0 for building Nezha. 
``` # Install bazel 5.2.0 # Please follow the instructions at https://bazel.build/install/ubuntu#install-on-ubuntu, # or simply run the following commands sudo apt install -y apt-transport-https curl gnupg curl -fsSL https://bazel.build/bazel-release.pub.gpg | gpg --dearmor >bazel-archive-keyring.gpg sudo mv bazel-archive-keyring.gpg /usr/share/keyrings echo "deb [arch=amd64 signed-by=/usr/share/keyrings/bazel-archive-keyring.gpg] https://storage.googleapis.com/bazel-apt stable jdk1.8" | sudo tee /etc/apt/sources.list.d/bazel.list sudo apt update sudo apt install -y bazel-5.2.0 sudo mv /usr/bin/bazel-5.2.0 /usr/bin/bazel bazel --version ``` ## Build Nezha with Bazel Since Bazel is becoming popular, we have migrated nezha from Makefile-based building system to the bazel building system. The bazel version in use is 5.2.0 ``` cd Nezha && bazel build //replica/... //client/... //proxy/... ``` After building the project successfully, the executable files will be generated in the folder named `bazel-bin` ## Single-Machine Tests Please refer to [the single-machine instructions](docs/demo.md) to run Nezha under various scenarios (view change, request commit, recovery from failure of replica). ## Multi-Machine Tests We use [scripts/launch.py](scripts/launch.py) to conduct distributed tests across multiple machines. After the tests have completed, [scripts/analysis.py](scripts/analysis.py) is used to analyze the results to generate performance numbers. The current scripts only support Google Cloud Platform (GCP). They require GCP credentials to create and delete VMs on GCP. ## Important Configuration Parameters ### Replica - ```replica-ips``` must include 2f+1 ips - ```replica-id``` starts from 0 to 2f - ```index-transfer-batch```, ```request-key-transfer-batch```, ```request-transfer-batch```. 
The values of the three batch parameters should be carefully chosen in order not to overflow the [maximum size of UDP packets](https://stackoverflow.com/questions/1098897/what-is-the-largest-safe-udp-packet-size-on-the-internet). ### Clients - We support two types of clients, i.e., open-loop clients and closed-loop clients. - Open-loop clients generate requests according to a Poisson process configured with a specific rate. - Closed-loop clients use a sliding window protocol to keep a fixed number of requests in flight at any given time, releasing a new request when an old one is completed. - ```is-openloop```: When this flag is true, --poisson-rate becomes meaningful. - ```skew-factor``` and key-number decide the workload, which further affects the commutativity optimization ### Proxy - ```shard-num``` decides how many threads will be launched. 1 shard includes 1 forwarding thread to forward client requests to replicas and 1 replying thread to receive replies from replicas and perform the quorum check - ```max-owd``` is used in the clamping function to estimate one-way delay; more details are described in Sec 4 [Adaptive latency bound] of the paper. ## Performance Benchmark Refer to [our paper](https://arxiv.org/pdf/2206.03285.pdf) for the relevant performance stats. Compared with the experimental version, we have refactored the codebase with some higher-performance libraries (e.g. libev instead of libevent) and data structures (e.g., ConcurrentMap and ConcurrentQueue). Besides, we have also conducted further optimization with the pipeline. The performance will be somewhat better than the original version used in the paper. New benchmark data will be updated soon. ## Authors and Acknowledgment The Nezha project is developed and maintained by [Jinkun Geng](https://steamgjk.github.io/) and his three supervisors, i.e., [Prof. Anirudh Sivaraman](https://cs.nyu.edu/~anirudh/), [Prof. Balaji Prabhakar](https://web.stanford.edu/~balaji/) and [Prof. 
Mendel Rosenblum](http://web.stanford.edu/~mendel/). We are fortunate to have received help from many researchers during the development of Nezha. Below we list and acknowledge them according to the timeline. [Dr. Shiyu Liu](https://web.stanford.edu/~shiyuliu/) and [Dr. Feiran Wang](https://www.linkedin.com/in/feiran-wang/) joined the discussion during the early design of Nezha. Feiran explained the details of CRaft and the related correctness properties. Shiyu explained the principles of Huygens and the other clock sync solutions. [Prof. Dan Ports](https://drkp.net/), [Prof. Jialin Li](https://www.comp.nus.edu.sg/~lijl/) and [Dr. Ellis Michael](https://ellismichael.com/) provided helpful discussion related to Speculative Paxos and NOPaxos. Dan also pointed us to crash vectors and diskless recovery. [Prof. Jinyang Li](http://www.news.cs.nyu.edu/~jinyang/) listened to our early presentation of Nezha, and gave some useful feedback. [Prof. Seo Jin Park](https://seojinpark.net/) discussed with us the definition of linearizability and other correctness properties. Seo Jin also provided some explanation about CURP. [Prof. Zhaoguo Wang](https://ipads.se.sjtu.edu.cn/pub/members/zhaoguo_wang) shared with us his experience in testing Raft. The [Derecho team](https://derecho-project.github.io/) (Prof. Ken Birman, Dr. Weijia Song, Dr. Sagar Jha, Dr. Lorenzo Rosa, etc.) offered technical support and discussion during our measurement of Derecho. The [ClockWork](https://www.clockwork.io/) Staff (Dr. Yilong Geng and Dr. Deepak Merugu) offered technical support in deploying Huygens. Dr. Deepak Merugu also gave suggestions on the coding style of the Nezha codebase. Katie Gioioso provided feedback on the Nezha design. Bhagirath Mehta participated in the single-machine test of Nezha. [Prof. Eugene Wu](http://www.cs.columbia.edu/~ewu/) provided suggestions on the revision of the Nezha paper. [Prof. 
Aurojit Panda](https://cs.nyu.edu/~apanda/) discussed with us Nezha's correctness during leader change. Aurojit reviewed our draft and offered some constructive suggestions on the revision. The [Raft community](https://groups.google.com/u/1/g/raft-dev/c/SmnAvZMufB0) offered us much insightful discussion. Many community members discussed with us and helped to justify our design decisions about Nezha. ## License Please refer to [license.md](license.md) ## Future Plan (1) Conduct more functionality and performance tests to make Nezha more robust and optimized (2) Replace [the etcd backend for Kubernetes](https://learnk8s.io/etcd-kubernetes) to boost the performance of Kubernetes. ================================================ FILE: WORKSPACE ================================================ load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository", "new_git_repository") http_archive( name = "rules_proto", sha256 = "e017528fd1c91c5a33f15493e3a398181a9e821a804eb7ff5acdd1d2d6c2b18d", strip_prefix = "rules_proto-4.0.0-3.20.0", urls = [ "https://github.com/bazelbuild/rules_proto/archive/refs/tags/4.0.0-3.20.0.tar.gz", ], ) load("@rules_proto//proto:repositories.bzl", "rules_proto_dependencies", "rules_proto_toolchains") rules_proto_dependencies() rules_proto_toolchains() http_archive( name = "com_github_grpc_grpc", sha256 = "9f387689b7fdf6c003fd90ef55853107f89a2121792146770df5486f0199f400", urls = [ "https://github.com/grpc/grpc/archive/refs/tags/v1.42.0.zip", ], strip_prefix = "grpc-1.42.0", ) load("@com_github_grpc_grpc//bazel:grpc_deps.bzl", "grpc_deps") grpc_deps() load("@com_github_grpc_grpc//bazel:grpc_extra_deps.bzl", "grpc_extra_deps") grpc_extra_deps() http_archive( name = "googleapi", sha256 = "3ff2365822fb573cb1779ada5c2ac7899269cacd0836aef95ffe9d95779031f2", url = "https://github.com/googleapis/googleapis/archive/refs/tags/common-protos-1_3_1.zip", strip_prefix = 
"googleapis-common-protos-1_3_1/", build_file="@//external:googleapi.BUILD", ) http_archive( name = "etcd", sha256 = "580ce584dc7628efebb57f8c8240674918d334ad21e33186bbc5f6348f465bc1", url = "https://github.com/etcd-io/etcd/archive/refs/tags/v3.5.0.zip", strip_prefix = "etcd-3.5.0/", build_file="@//external:etcd.BUILD", ) http_archive( name = "gogoprotobuf", sha256 = "f89f8241af909ce3226562d135c25b28e656ae173337b3e58ede917aa26e1e3c", url = "https://github.com/gogo/protobuf/archive/refs/tags/v1.3.2.zip", strip_prefix = "protobuf-1.3.2/", build_file="@//external:gogoprotobuf.BUILD", ) git_repository( name = "com_github_jbeder_yaml_cpp", commit = "fcbb8193b94921e058be7b563aea053531e5b2d9", # 19-Aug-2023 remote = "https://github.com/jbeder/yaml-cpp.git", shallow_since = "1692473776 -0400", ) new_git_repository( name = "com_github_cameron314_concurrentqueue", build_file = "//third_party/concurrentqueue:BUILD.bazel", commit = "6dd38b8a1dbaa7863aa907045f32308a56a6ff5d", shallow_since = "1686439287 -0400", remote = "https://github.com/cameron314/concurrentqueue.git", ) new_git_repository( name = "com_github_preshing_junction", commit = "5ad3be7ce1d3f16b9f7ed6065bbfeacd2d629a08", shallow_since = "1518982100 -0500", patches = ["//third_party/junction:junction.patch"], patch_args = ["-p1"], build_file = "//third_party/junction:BUILD.bazel", remote = "https://github.com/preshing/junction", ) new_git_repository( name = "com_github_preshing_turf", commit = "9ae0d4b984fa95ed5f823274b39c87ee742f6650", shallow_since = "1484317994 -0500" , build_file = "//third_party/turf:BUILD.bazel", remote = "https://github.com/preshing/turf", ) new_git_repository( name = "com_github_enki_libev", commit = "93823e6ca699df195a6c7b8bfa6006ec40ee0003", shallow_since = "1463172876 -0700", build_file = "//third_party/libev:BUILD.bazel", remote = "https://github.com/enki/libev.git", ) # Google gflags. 
git_repository( name = "com_github_gflags_gflags", commit = "e171aa2d15ed9eb17054558e0b3a6a413bb01067", # 11-Nov-2018 remote = "https://github.com/gflags/gflags.git", shallow_since = "1541971260 +0000", ) # Google glog. new_git_repository( name = "com_github_google_glog", build_file = "//third_party/glog:BUILD.glog", commit = "ba8a9f6952d04d1403b97df24e6836227751454e", # 7-May-2019 remote = "https://github.com/google/glog.git", # Shallow since doesn't work here for some weird reason. See # https://github.com/bazelbuild/bazel/issues/10292 # shallow_since = "1557212520 +0000", ) # Google protobuf. git_repository( name = "com_google_protobuf", commit = "21027a27c4c2ec1000859ccbcfff46d83b16e1ed", # 21-Apr-2022, v3.20.1 remote = "https://github.com/protocolbuffers/protobuf", shallow_since = "1650589240 +0000", ) http_archive( name = "rules_foreign_cc", sha256 = "2a8000ce03dd9bb324bc9bb7f1f5d01debac406611f4d9fedd385192718804f0", strip_prefix = "rules_foreign_cc-60813d57a0e99be1a009c1a0e9627cdbe81fcd19", url = "https://github.com/bazelbuild/rules_foreign_cc/archive/60813d57a0e99be1a009c1a0e9627cdbe81fcd19.tar.gz", ) load("@rules_foreign_cc//foreign_cc:repositories.bzl", "rules_foreign_cc_dependencies") rules_foreign_cc_dependencies() http_archive( name = "openssl", build_file = "//third_party/openssl:BUILD.bazel", sha256 = "23011a5cc78e53d0dc98dfa608c51e72bcd350aa57df74c5d5574ba4ffb62e74", strip_prefix = "openssl-OpenSSL_1_1_1d", urls = ["https://github.com/openssl/openssl/archive/OpenSSL_1_1_1d.tar.gz"], ) http_archive( name = "com_github_nelhage_rules_boost", url = "https://github.com/nelhage/rules_boost/archive/96e9b631f104b43a53c21c87b01ac538ad6f3b48.tar.gz", strip_prefix = "rules_boost-96e9b631f104b43a53c21c87b01ac538ad6f3b48", sha256 = "5ea00abc70cdf396a23fb53201db19ebce2837d28887a08544429d27783309ed", ) load("@com_github_nelhage_rules_boost//:boost/boost.bzl", "boost_deps") boost_deps() ================================================ FILE: client/BUILD 
================================================ load("@rules_proto//proto:defs.bzl", "proto_library") cc_library( name = "client_config", hdrs = ["client_config.h"], deps = [ "@com_github_jbeder_yaml_cpp//:yaml-cpp", ], ) cc_library( name = "client_class", srcs = ["client.cc"], hdrs = ["client.h"], deps = [ "//proto:nezha_cc_proto", "//lib:zipfian", "//lib:utils", ":client_config", ], ) cc_binary( name = "nezha_client", srcs = ["client_run.cc"], deps = [ ":client_class", ], ) ================================================ FILE: client/client.cc ================================================ #include "client/client.h" namespace nezha { Client::Client(const std::string& configFile) { hop3s.reserve(500000); hop4s.reserve(500000); totals.reserve(500000); LOG(INFO) << "Loading config information from " << configFile; std::string error = clientConfig_.parseConfig(configFile); if (error != "") { LOG(ERROR) << "Error loading client config: " << error << " Exiting."; exit(1); } clientId_ = clientConfig_.clientId; LOG(INFO) << "clientId=" << clientId_; std::string clientIP = clientConfig_.clientIp; LOG(INFO) << "clientIP=" << clientIP; int requestPort = clientConfig_.requestPort; LOG(INFO) << "requestPort=" << requestPort; LOG(INFO) << "endPointType=" << clientConfig_.endpointType; requestEP_ = CreateEndpoint(clientConfig_.endpointType, clientIP, requestPort, true); replyHandler_ = CreateMsgHandler( clientConfig_.endpointType, [](MessageHeader* msgHdr, char* msgBuffer, Address* sender, void* ctx) { ((Client*)ctx)->ReceiveReply(msgHdr, msgBuffer, sender); }, this); monitorTimer_ = new Timer( [](void* ctx, void* receiverEP) { // LOG(INFO) << "Monitor running " << ((Client*)ctx)->running_; if (((Client*)ctx)->running_ == false) { ((Endpoint*)receiverEP)->LoopBreak(); } }, 10 /*Checks the status every 10ms*/, this); /** Fetch the addreses of all proxies and organize them as a two-dimensional * vector */ proxyAddrs_.resize(clientConfig_.proxyIps.size()); for (uint32_t i = 0; 
i < proxyAddrs_.size(); i++) { proxyAddrs_[i].resize(clientConfig_.proxyShardNum); for (uint32_t j = 0; j < proxyAddrs_[i].size(); j++) { proxyAddrs_[i][j] = new Address(clientConfig_.proxyIps[i], clientConfig_.proxyRequestPortBase + j); } } /** If the client is a open-loop client, generate the poission trace for the * client */ if (clientConfig_.isOpenLoop) { poissonRate_ = clientConfig_.poissonRate; LOG(INFO) << "OpenLoop Client rate=" << poissonRate_; poissonTrace_.resize(1000, 0); std::default_random_engine generator(clientId_); // clientId as the seed std::poisson_distribution distribution(poissonRate_); for (int i = 0; i < 1000; i++) { int reqNum = distribution(generator); if (reqNum < 0) { poissonTrace_[i] = 0; } else { poissonTrace_[i] = reqNum; } } } /** Generate zipfian workload */ LOG(INFO) << "keyNum=" << clientConfig_.keyNum << "\tskewFactor=" << clientConfig_.skewFactor << "\twriteRatio=" << clientConfig_.writeRatio; zipfianKeys_.resize(1000000, 0); retryTimeoutUs_ = clientConfig_.requestRetryTimeUs; if (clientConfig_.keyNum > 1) { std::default_random_engine generator(clientId_); // clientId as the seed zipfian_int_distribution zipfianDistribution( 0, clientConfig_.keyNum - 1, clientConfig_.skewFactor); for (uint32_t i = 0; i < zipfianKeys_.size(); i++) { zipfianKeys_[i] = zipfianDistribution(generator); } } /** Initialize */ committedReqId_ = 0; reclaimedReqId_ = 0; nextReqId_ = 1; retryNumber_ = 0; committedNum_ = 0; fastCommitNum_ = 0; fastWriteNum_ = 0; } void Client::Run() { running_ = true; LaunchThreads(); for (auto& kv : threadPool_) { LOG(INFO) << "Join " << kv.first; kv.second->join(); LOG(INFO) << "Join Complete " << kv.first; } LOG(INFO) << "Run Terminated "; } void Client::LaunchThreads() { threadPool_["LogTd"] = new std::thread(&Client::LogTd, this); threadPool_["ProcessReplyTd"] = new std::thread(&Client::ProcessReplyTd, this); if (clientConfig_.isOpenLoop) { LOG(INFO) << "OpenLoop Client"; threadPool_["OpenLoopSubmissionTd"] = new 
std::thread(&Client::OpenLoopSubmissionTd, this); } else { LOG(INFO) << "ClosedLoop Client"; threadPool_["CloseLoopSubmissionTd"] = new std::thread(&Client::CloseLoopSubmissionTd, this); } } void Client::ProcessReplyTd() { /** Register the message handler and timer. Then this thread will run in an * event-driven mode, i.e, when message comes, it calls the registered message * handler */ requestEP_->RegisterMsgHandler(replyHandler_); requestEP_->RegisterTimer(monitorTimer_); LOG(INFO) << "Loop Run "; requestEP_->LoopRun(); LOG(INFO) << "Loop Run Exit "; } void Client::ReceiveReply(MessageHeader* msgHdr, char* msgBuffer, Address* sender) { if (msgHdr->msgLen < 0) { return; } Reply reply; if (msgHdr->msgType == MessageType::COMMIT_REPLY && reply.ParseFromArray(msgBuffer, msgHdr->msgLen)) { committedNum_++; if (reply.replytype() == MessageType::FAST_REPLY) { fastCommitNum_++; if (reply.iswrite()) { fastWriteNum_++; } } // if (committedNum_ % 100000 == 0) { // LOG(INFO) << "commitNum=" << committedNum_ // << "\tfastWriteNum_=" << fastWriteNum_ // << "\tFastCommitNum=" << fastCommitNum_ << // "\tWriteRatioCommit=" // << (fastWriteNum_ * 100.0 / fastCommitNum_) // << "\t fastRatio=" << (fastCommitNum_ * 100.0 / // committedNum_); // } if (committedReqId_ < reply.reqid()) { committedReqId_ = reply.reqid(); // // LOG(INFO) << "committedReqId_=" << committedReqId_; // uint64_t st = outstandingRequestSendTime_.get(reply.reqid()); // uint64_t et = GetMicrosecondTimestamp(); // ls.push_back((et - st)); // if (ls.size() >= 1000) { // for (uint32_t i = 0; i < 1000; i++) { // printf("%u\t", ls[i]); // if (i % 20 == 0) { // printf("\n"); // } // } // exit(0); // } } uint64_t sendTime = outstandingRequestSendTime_.get(reply.reqid()); if (sendTime > 0) { /** The corresponding request has not been committed, because it is still * in outstandingRequestSendTime_, so we wan to mark it as committed, * i.e., erase from outstandingRequestSendTime_ */ /** * Generate log information and pass 
to logQu_, which will be handled by * LogTd * */ uint64_t recvTime = GetMicrosecondTimestamp(); LogInfo* log = new LogInfo(); lastCommittedReqId_ = reply.reqid(); *log = {reply.reqid(), sendTime, recvTime, reply.replytype()}; outstandingRequestSendTime_.erase(reply.reqid()); logQu_.enqueue(log); } } } void Client::OpenLoopSubmissionTd() { int roundRobinIdx = 0; uint64_t startTime = GetMicrosecondTimestamp(); uint64_t endTime = startTime + clientConfig_.durationSec * 1000000; srandom(clientId_); endTime += 10 * 1000ul * 1000ul; LOG(INFO) << "Expected to end at " << endTime; // Poisson rate is ``10ms as one unit'' for (uint32_t i = 0; i < clientConfig_.durationSec * 100; i++) { if (!running_) { return; } if (GetMicrosecondTimestamp() >= endTime) { // Client has executed long enough, should terminate LOG(INFO) << "Terminating soon..."; running_ = false; return; } uint32_t reqNum = poissonTrace_[i % poissonTrace_.size()]; if (reqNum <= 0) { usleep(10000); continue; } uint32_t intval = 10000 / reqNum; uint64_t startTime = GetMicrosecondTimestamp(); for (uint32_t j = 0; j < reqNum; j++) { while (GetMicrosecondTimestamp() < startTime + j * intval) { } // Send the request uint32_t mapIdx = roundRobinIdx % (proxyAddrs_.size() * proxyAddrs_[0].size()); Request* request = NULL; if (retryQu_.try_dequeue(request)) { // Retry this request Address* roundRobinAddr = proxyAddrs_[mapIdx % proxyAddrs_.size()] [mapIdx / proxyAddrs_.size()]; // LOG(INFO) << "Resend " << request->reqid() << "to " // << mapIdx % proxyAddrs_.size() << "\t" // << mapIdx / proxyAddrs_.size(); requestEP_->SendMsgTo(*roundRobinAddr, *request, MessageType::CLIENT_REQUEST); outstandingRequestSendTime_.assign(request->reqid(), GetMicrosecondTimestamp()); roundRobinIdx++; } else { // submit new requests request = new Request(); request->set_clientid(clientId_); request->set_reqid(nextReqId_); if (random() % 100 < 100 * writeRatio_) { request->set_iswrite(true); } else { request->set_iswrite(false); } 
request->set_key(zipfianKeys_[nextReqId_ % zipfianKeys_.size()]); // // if (nextReqId_ % 10 == 1 && clientId_ <= 10) { // if (clientId_ <= 12) { // if (nextReqId_ % 2 == 1) // request->set_iswrite(true); // else // request->set_iswrite(false); // // request->set_iswrite(true); // // LOG(INFO) << "One Write " << request->key() // // << " reqId=" << request->reqid(); // } else { // exit(0); // } // request->set_key(nextReqId_ % 100000 + 100000 * (clientId_ - 1)); Address* roundRobinAddr = proxyAddrs_[mapIdx % proxyAddrs_.size()] [mapIdx / proxyAddrs_.size()]; // LOG(INFO) << "Sed " << request->reqid() << "to " // << mapIdx % proxyAddrs_.size() << "\t" // << mapIdx / proxyAddrs_.size(); requestEP_->SendMsgTo(*roundRobinAddr, *request, MessageType::CLIENT_REQUEST); outstandingRequests_.assign(request->reqid(), request); outstandingRequestSendTime_.assign(request->reqid(), GetMicrosecondTimestamp()); nextReqId_++; roundRobinIdx++; } } } LOG(INFO) << "Terminating soon... after " << (endTime - GetMicrosecondTimestamp()) * 1e-6 << " seconds"; while (GetMicrosecondTimestamp() < endTime) { // Client has executed long enough, should terminate usleep(1000); } running_ = false; } void Client::CloseLoopSubmissionTd() { int roundRobinIdx = 0; uint64_t startTime = GetMicrosecondTimestamp(); uint64_t endTime = startTime + clientConfig_.durationSec * 1000000; endTime += 10 * 1000ul * 1000ul; LOG(INFO) << "Expected to end at " << endTime; srand(clientId_); while (running_) { if (GetMicrosecondTimestamp() >= endTime) { // Client has executed long enough, should terminate LOG(INFO) << "Terminating soon..."; running_ = false; return; } Request* request = NULL; uint32_t mapIdx = roundRobinIdx % (proxyAddrs_.size() * proxyAddrs_[0].size()); if (nextReqId_ == committedReqId_ + 1) { // submit new request request = new Request(); request->set_clientid(clientId_); request->set_reqid(nextReqId_); if (random() % 100 < 100 * writeRatio_) { request->set_iswrite(true); } else { 
request->set_iswrite(false); } request->set_key(zipfianKeys_[nextReqId_ % zipfianKeys_.size()]); Address* roundRobinAddr = proxyAddrs_[mapIdx % proxyAddrs_.size()][mapIdx / proxyAddrs_.size()]; requestEP_->SendMsgTo(*roundRobinAddr, *request, MessageType::CLIENT_REQUEST); outstandingRequests_.assign(request->reqid(), request); outstandingRequestSendTime_.assign(request->reqid(), GetMicrosecondTimestamp()); nextReqId_++; roundRobinIdx++; } else { if (retryQu_.try_dequeue(request)) { // have some requests to retry Address* roundRobinAddr = proxyAddrs_[mapIdx % proxyAddrs_.size()] [mapIdx / proxyAddrs_.size()]; requestEP_->SendMsgTo(*roundRobinAddr, *request, MessageType::CLIENT_REQUEST); outstandingRequestSendTime_.assign(request->reqid(), GetMicrosecondTimestamp()); roundRobinIdx++; } } } LOG(INFO) << "Terminating soon... after " << (endTime - GetMicrosecondTimestamp()) * 1e-6 << " seconds"; while (GetMicrosecondTimestamp() < endTime) { // Client has executed long enough, should terminate usleep(1000); } running_ = false; } void Client::LogTd() { LogInfo* log = NULL; uint64_t startTime, endTime; uint32_t lastSubmitteddReqId = 0; uint32_t lastCountCommitedReq = 0; uint32_t latencySample = 0; std::ofstream ofs("Client-Stats-" + std::to_string(clientId_)); ofs << "ReqId,SendTime,CommitTime,CommitType" << std::endl; startTime = GetMicrosecondTimestamp(); while (running_) { endTime = GetMicrosecondTimestamp(); if (endTime - startTime >= 5000000) { float duration = (endTime - startTime) * 1e-6; uint32_t submittedReqNum = nextReqId_ - 1 - lastSubmitteddReqId; uint32_t committedReqNum = committedNum_ - lastCountCommitedReq; float submissionRate = submittedReqNum / duration; float commitRate = committedReqNum / duration; lastSubmitteddReqId = nextReqId_ - 1; lastCountCommitedReq = committedNum_; startTime = endTime; LOG(INFO) << "endTime=" << endTime << "\t" << "committedNum_ = " << committedNum_ << "\t" << "logQuLen =" << logQu_.size_approx() << "\t" << "committedReqId_=" 
<< committedReqId_ << "\t" << "nextReqId_=" << nextReqId_ << "\t" << "lastCommittedReqId_=" << lastCommittedReqId_ << "\t" << "submissionRate=" << submissionRate << " req/sec\t" << "commitRate=" << commitRate << " req/sec" << "\t" << "FastCommitRatio=" << fastCommitNum_ * 100.0 / committedNum_ << "\t" << "latency(Sample)=" << latencySample << " us" << "\t" << "retryNum=" << retryNumber_; ofs.flush(); } if (logQu_.try_dequeue(log)) { // LOG(INFO) << "committedReqId_=" << committedReqId_ << "\t" << "reqId=" // << log->reqId; while (committedReqId_ + 1 <= log->reqId) { if (outstandingRequestSendTime_.get(committedReqId_ + 1) == 0) { // this reqId has also been committed (i.e. cannot find its footprint) // advance committedReqId; committedReqId_++; } else { break; } } latencySample = log->commitTime - log->sendTime; // log stats ofs << log->toString() << std::endl; delete log; } // // Check whether any requests need retry // for (uint32_t reqId = committedReqId_ + 1; reqId < nextReqId_; reqId++) { // uint64_t sendTime = outstandingRequestSendTime_.get(reqId); // if (sendTime > 0) { // // Find it // if (GetMicrosecondTimestamp() - sendTime > retryTimeoutus_) { // // timeout, should retry // Request* request = outstandingRequests_.get(reqId); // LOG(INFO) << "Timeout Retry " << request->reqid(); // outstandingRequestSendTime_.erase(reqId); // retryQu_.enqueue(request); // retryNumber_++; // } // } // } while (reclaimedReqId_ + 1000 < committedReqId_) { // do not reclaim request too aggressive // If we reclaim too aggressive, there can be some edge case of dangling // request pointer Request* request = outstandingRequests_.get(reclaimedReqId_); if (request) { outstandingRequests_.erase(request->reqid()); delete request; } reclaimedReqId_++; } } LOG(INFO) << "The runtime have been terminated, we still need to dump " << logQu_.size_approx() << " Logs before exit"; uint32_t cnt = 0; while (logQu_.try_dequeue(log)) { // log stats ofs << log->toString() << std::endl; delete 
log; cnt++; if (cnt % 10000 == 0) { LOG(INFO) << "Remaining Log Number " << logQu_.size_approx(); ofs.flush(); } } ofs.flush(); LOG(INFO) << "Dump Finished"; } void Client::Terminate() { LOG(INFO) << "Terminating..."; running_ = false; } Client::~Client() { for (auto& kv : threadPool_) { delete kv.second; } while (reclaimedReqId_ <= nextReqId_) { Request* request = outstandingRequests_.get(reclaimedReqId_); if (request) { outstandingRequests_.erase(request->reqid()); delete request; } reclaimedReqId_++; } } } // namespace nezha ================================================ FILE: client/client.h ================================================ #include #include #include #include "client_config.h" #include "lib/utils.h" #include "lib/zipfian.h" #include "proto/nezha_proto.pb.h" namespace nezha { using namespace nezha::proto; /** LogInfo is used to dump some performance stats, which can be extended to * include more metrics */ struct LogInfo { uint32_t reqId; uint64_t sendTime; uint64_t commitTime; uint32_t commitType; std::string toString() { std::string ret = (std::to_string(reqId) + "," + std::to_string(sendTime) + "," + std::to_string(commitTime) + "," + std::to_string(commitType)); return ret; } }; /** * Refer to client_run.cc, the runnable program only needs to instantiate a * client object with a configuration file. 
Then it calls Run() method to run * and calls Terminate() method to stop */ class Client { private: /** All the configuration parameters for client are included in * clientConfig_*/ ClientConfig clientConfig_; /** Each thread is given a unique name (key) and stored in the pool */ std::map threadPool_; /** The endpoint used to submit requests to proxies */ Endpoint* requestEP_; /** The message handler used to handle replies (from proxies) */ struct MessageHandler* replyHandler_; /** The timer periodically monitors the status of the client, and breaks the * blocking endpoint when the client is about to terminate */ struct Timer* monitorTimer_; /** Flag to Run/Terminate threads */ std::atomic running_; /** Each client is assigned a unique id */ int clientId_; /** Open-Loop submission related: the client's submission rate follows a * poisson distribution. We use 10ms as the basic interval and generate random * numbers with reference to poissonRate_, stored in poissonTrace_. Then the * open-loop clients submit poissonTrace_[i] requests in the ith interval. * * Regarding the definition of open-loop and closed-loop submission, refer to * ``evaluation method`` para of Sec 7.1 in our paper * */ int poissonRate_; /** The next requestId to be submitted */ std::atomic nextReqId_; /** Requests whose requestId is less than or equal to committedReqId_ have * been committed */ std::atomic committedReqId_; /** Requests whose requestId is less than or equal to reclaimedReqId_ have * been reclaimed (memory freed) */ std::atomic reclaimedReqId_; std::vector poissonTrace_; /** To communicate between OpenLoopSubmissionTd/CloseLoopSubmissionTd and * LogTd. The LogTd monitors the outstanding requests (i.e. which have been * submitted but have not been committed). If some request has not been * committed after a certain time, the LogTd will enqueue the request to * retryQu_, so that the OpenLoopSubmissionTd/CloseLoopSubmissionTd will * retry them. NOTE(review): the timeout scan in Client::LogTd is currently * commented out, so in the visible part of client.cc nothing enqueues to * retryQu_ -- confirm whether retries are meant to be enabled */ ConcurrentQueue retryQu_; /** The addresses of proxies.
Since we can have multiple proxies, and each * proxy can have multiple shards, we use a two-dimensional vector to store * the addresses, i.e., proxyAddrs_[i][j] indicates the address of the jth * shard of the ith proxy */ std::vector> proxyAddrs_; /** To test commutativity, we generate different zipfian workloads and write * ratios, i.e., we generate random numbers following the zipfian * distribution. These random numbers are stored in zipfianKeys_ and serve as * the keys that will be written/read by requests */ std::vector zipfianKeys_; float writeRatio_; /** Those requests which have been submitted but not yet committed (key is the * requestId)*/ ConcurrentMap outstandingRequests_; /** Record the send time of the requests, together with retryTimeoutUs_, to * decide whether the request needs to be retried*/ ConcurrentMap outstandingRequestSendTime_; /** Used by LogTd to monitor outstanding requests. If they cannot be committed * within retryTimeoutUs_ (measured in microseconds), they should be retried * **/ uint32_t retryTimeoutUs_; /** To communicate between ProcessReplyTd and LogTd */ ConcurrentQueue logQu_; /** Performance counters, to show how many requests are retried/committed */ uint32_t retryNumber_; uint32_t committedNum_; uint32_t fastCommitNum_; uint32_t fastWriteNum_; /** Stats */ std::vector hop3s; std::vector hop4s; std::vector totals; /** Launch all the threads, only called once during the lifetime of the * client*/ void LaunchThreads(); /** Functions whose names are ended with ``Td`` will be used to instantiate * threads. * * For the client, there are mainly three worker threads running: * * (1) OpenLoopSubmissionTd/CloseLoopSubmissionTd submits requests. A client * can be either open-loop client or closed-loop client, but cannot be both. * * (2) ProcessReplyTd receives and processes the reply messages, and hands * the log information to LogTd * * (3) LogTd dumps logs and also monitors the outstanding requests.
If the * requests have not been committed after a certain time (retryTimeoutUs_), * then LogTd will ask OpenLoopSubmissionTd/CloseLoopSubmissionTd to resubmit * this request to proxies * */ void ProcessReplyTd(); void OpenLoopSubmissionTd(); void CloseLoopSubmissionTd(); void LogTd(); /** The message handler to handle messages from proxies. The function is used * to instantiate a replyHandler_ and registered to requestEP_ */ void ReceiveReply(MessageHeader* msgHdr, char* msgBuffer, Address* sender); public: /** Client accepts a config file, which contains all the necessary information * to instantiate the object, then it can call Run method * */ Client(const std::string& configFile = "../configs/nezha-client-config.yaml"); void Run(); void Terminate(); ~Client(); /** For debug */ uint64_t lastCommittedReqId_; std::vector ls; }; } // namespace nezha ================================================ FILE: client/client_config.h ================================================ #include #include #include #include #include struct ClientConfig { int clientId; std::string clientIp; int endpointType; int requestPort; uint32_t proxyMaxOwd; int proxyReplyPortBase; bool isOpenLoop; int poissonRate; uint32_t durationSec; int keyNum; double skewFactor; double writeRatio; int requestRetryTimeUs; int proxyRequestPortBase; std::vector proxyIps; int proxyShardNum; // Parses yaml file configFilename and fills in fields of ClientConfig // accordingly. Returns an error message or "" if there are no errors.
std::string parseConfig(std::string configFilename) { YAML::Node config; try { config = YAML::LoadFile(configFilename); } catch (const YAML::BadFile& e) { return "Error loading config file:" + e.msg + "."; } LOG(INFO) << "Using config:\n " << config; std::string key; // Keep track of current key for better error messages try { key = "client-id"; clientId = config[key].as(); key = "client-ip"; clientIp = config[key].as(); key = "endpoint-type"; endpointType = config[key].as(); key = "request-port"; requestPort = config[key].as(); key = "is-openloop"; isOpenLoop = config[key].as(); key = "poisson-rate"; poissonRate = config[key].as(); key = "duration-sec"; durationSec = config[key].as(); key = "key-num"; keyNum = config[key].as(); key = "skew-factor"; skewFactor = config[key].as(); key = "write-ratio"; writeRatio = config[key].as(); key = "request-retry-time-us"; requestRetryTimeUs = config[key].as(); key = "proxy-ips"; for (uint32_t i = 0; i < config[key].size(); i++) { proxyIps.push_back(config[key][i].as()); } key = "proxy-shards"; proxyShardNum = config[key].as(); key = "proxy-request-port-base"; proxyRequestPortBase = config[key].as(); return ""; } catch (const YAML::BadConversion& e) { if (config[key]) { return "Error parsing config field " + key + ": " + e.msg + "."; } else { return "Error parsing config field " + key + ": key not found."; } } catch (const std::exception& e) { return "Error parsing config field " + key + ": " + e.what() + "."; } } }; ================================================ FILE: client/client_run.cc ================================================ #include "client/client.h" DEFINE_string(config, "nezhav2/config/nezha-client-config-0.yaml", "The config file for the client"); nezha::Client* client = NULL; void Terminate(int para) { client->Terminate(); } int main(int argc, char* argv[]) { gflags::ParseCommandLineFlags(&argc, &argv, true); google::InitGoogleLogging(argv[0]); FLAGS_logtostderr = 1; signal(SIGINT, Terminate); client = new 
nezha::Client(FLAGS_config); client->Run(); delete client; } ================================================ FILE: configs/dist/nezha-client-config.yaml ================================================ --- print-config: true proxy-info: proxy-ips: - "10.128.2.13" proxy-shards: 1 request-port-base: 32000 client-info: client-id: 1 client-ip: "10.128.2.14" request-port: 32912 is-openloop: true poisson-rate: 10 # it means the client sends x reqs/10ms on average duration-sec: 60 # it means the duration of the client runs (second) key-num: 1000000 # when key-num is 1, it means there is no commutativity optimization skew-factor: 0.5 # 0-0.99 The higher the zipfian factor is, the more skewed the workload is request-retry-time-us: 10000 # After the request is submitted, if we cannot get the response after such long time, then we will retry ================================================ FILE: configs/dist/nezha-proxy-config.yaml ================================================ --- print-config: true # Replica Info replica-info: replica-ips: - "10.128.2.10" - "10.128.2.11" - "10.128.2.12" receiver-shards: 1 # The number of threads to receive threads receiver-port: 33333 # The port is exposed to client/proxy and is used to receive client/proxy requests. When receiver-shards>1, the corresponding ports are receiver-ports + shard-index initial-owd: 80 # The initial one-way delay (us) between replicas and proxies # Proxy Info proxy-info: proxy-id: 1 proxy-ip: "10.128.2.13" shard-num: 1 request-port-base: 32000 reply-port-base: 33000 ================================================ FILE: configs/dist/nezha-replica-config-0.yaml ================================================ --- print-config: true replica-ips: - "10.128.2.10" - "10.128.2.11" - "10.128.2.12" replica-id: 0 receiver-shards: 1 # The number of threads to receive threads process-shards: 1 # The number of threads to process requests. 
For now process-shards is fixed to 1, because the early-buffer enque/deque is hard to parallelize. Maybe later we can find a high-performant **concurrent priority queue** for early-buffer, then process-shards may be parallelized for higher performance reply-shards: 1 # The number of threads to send replies (both fast/slow replies) index-sync-shards: 1 # The number of threads used by the leader to broadcast index synchronization messages to followers. For followers, they only need one such thread to receive and handle the index sync msgs receiver-port: 33333 # The port is exposed to client/proxy and is used to receive client/proxy requests. When receiver-shards>1, the corresponding ports are receiver-ports + shard-index index-sync-port: 35333 # The port is used for replicas to communicate index synchronization messages. Typically, the leader sends index sync msgs and the followers receive them request-ask-port: 36333 # When one follower is missing requests, they ask from the leader or other followers.
The leader/followers send the missed requests to this port of this follower index-ask-port: 37333 # When one follower is missing some index sync msgs, it asks the leader (or other followers) for the missing ones, and the leader (or other followers) send to the index-sync-port of the problematic follower master-port: 34333 # This port is mainly used to send/receive other messages monitor-period-ms: 50 # Each blocking thread has a monitor timer, which periodically checks the replica status and unblock the thread (and exit) when replica status is not Normal main-loop-period-ms: 20 heartbeat-threshold-ms: 500 index-ask-period-ms: 10 # When missing indices, we launch a timer to periodically ask indices request-ask-period-ms: 10 # When missing logs, we launch a timer to periodically ask logs(requests) view-change-period-ms: 10 state-transfer-period-ms: 10 state-transfer-timeout-ms: 100000 # If the state-transfer has not been completed after such long time, then rollback to viewchange index-transfer-max-batch: 30 request-transfer-max-batch: 5 crash-vector-request-period-ms: 50 # make it longer, because it will clear all previous replies recovery-request-period-ms: 10 sync-report-period-ms: 10 key-num: 1000000 # when commutativity is enabled for test, we need the specified key number to detect commutativity owd-estimation-window: 1000 # we use a sliding window to get moving median of one-way delays, here is the length of the window ================================================ FILE: configs/dist/nezha-replica-config-1.yaml ================================================ --- print-config: true replica-ips: - "10.128.2.10" - "10.128.2.11" - "10.128.2.12" replica-id: 1 receiver-shards: 1 # The number of threads to receive requests process-shards: 1 # The number of threads to process requests. For now process-shards is fixed to 1, because the early-buffer enque/deque is hard to parallelize.
Maybe later we can find a high-performant **concurrent priority queue** for early-buffer, then process-shards may be parallelized for higher performance reply-shards: 1 # The number of threads to send replies (both fast/slow replies) index-sync-shards: 1 # The number of threads used by the leader to broadcast index synchronization messages to followers. For followers, they only need one such thread to receive and handle the index sync msgs receiver-port: 33333 # The port is exposed to client/proxy and is used to receive client/proxy requests. When receiver-shards>1, the corresponding ports are receiver-ports + shard-index index-sync-port: 35333 # The port is used for replicas to communicate index synchronization messages. Typically, the leader sends index sync msgs and the followers receive them request-ask-port: 36333 # When one follower is missing requests, they ask from the leader or other followers. The leader/followers send the missed requests to this port of this follower index-ask-port: 37333 # When one follower is missing some index sync msgs, it asks the leader (or other followers) for the missing ones, and the leader (or other followers) send to the index-sync-port of the problematic follower master-port: 34333 # This port is mainly used to send/receive other messages monitor-period-ms: 50 # Each blocking thread has a monitor timer, which periodically checks the replica status and unblock the thread (and exit) when replica status is not Normal main-loop-period-ms: 20 heartbeat-threshold-ms: 500 index-ask-period-ms: 10 # When missing indices, we launch a timer to periodically ask indices request-ask-period-ms: 10 # When missing logs, we launch a timer to periodically ask logs(requests) view-change-period-ms: 10 state-transfer-period-ms: 10 state-transfer-timeout-ms: 100000 # If the state-transfer has not been completed after such long time, then rollback to viewchange index-transfer-max-batch: 30 request-transfer-max-batch: 5 crash-vector-request-period-ms:
50 # make it longer, because it will clear all previous replies recovery-request-period-ms: 10 sync-report-period-ms: 10 key-num: 1000000 # when commutativity is enabled for test, we need the specified key number to detect commutativity owd-estimation-window: 1000 # we use a sliding window to get moving median of one-way delays, here is the length of the window ================================================ FILE: configs/dist/nezha-replica-config-2.yaml ================================================ --- print-config: true replica-ips: - "10.128.2.10" - "10.128.2.11" - "10.128.2.12" replica-id: 2 receiver-shards: 1 # The number of threads to receive requests process-shards: 1 # The number of threads to process requests. For now process-shards is fixed to 1, because the early-buffer enque/deque is hard to parallelize. Maybe later we can find a high-performant **concurrent priority queue** for early-buffer, then process-shards may be parallelized for higher performance reply-shards: 1 # The number of threads to send replies (both fast/slow replies) index-sync-shards: 1 # The number of threads used by the leader to broadcast index synchronization messages to followers. For followers, they only need one such thread to receive and handle the index sync msgs receiver-port: 33333 # The port is exposed to client/proxy and is used to receive client/proxy requests. When receiver-shards>1, the corresponding ports are receiver-ports + shard-index index-sync-port: 35333 # The port is used for replicas to communicate index synchronization messages. Typically, the leader sends index sync msgs and the followers receive them request-ask-port: 36333 # When one follower is missing requests, they ask from the leader or other followers.
The leader/followers send the missed requests to this port of this follower index-ask-port: 37333 # When one follower is missing some index sync msgs, it asks the leader (or other followers) for the missing ones, and the leader (or other followers) send to the index-sync-port of the problematic follower master-port: 34333 # This port is mainly used to send/receive other messages monitor-period-ms: 50 # Each blocking thread has a monitor timer, which periodically checks the replica status and unblock the thread (and exit) when replica status is not Normal main-loop-period-ms: 20 heartbeat-threshold-ms: 500 index-ask-period-ms: 10 # When missing indices, we launch a timer to periodically ask indices request-ask-period-ms: 10 # When missing logs, we launch a timer to periodically ask logs(requests) view-change-period-ms: 10 state-transfer-period-ms: 10 state-transfer-timeout-ms: 100000 # If the state-transfer has not been completed after such long time, then rollback to viewchange index-transfer-max-batch: 30 request-transfer-max-batch: 5 crash-vector-request-period-ms: 50 # make it longer, because it will clear all previous replies recovery-request-period-ms: 10 sync-report-period-ms: 10 key-num: 1000000 # when commutativity is enabled for test, we need the specified key number to detect commutativity owd-estimation-window: 1000 # we use a sliding window to get moving median of one-way delays, here is the length of the window ================================================ FILE: configs/dist/nezha-replica-config.yaml ================================================ --- print-config: true replica-ips: - "10.128.2.10" - "10.128.2.11" - "10.128.2.12" replica-id: 0 receiver-shards: 1 # The number of threads to receive requests process-shards: 1 # The number of threads to process requests. For now process-shards is fixed to 1, because the early-buffer enque/deque is hard to parallelize.
Maybe later we can find a high-performant **concurrent priority queue** for early-buffer, then process-shards may be parallelized for higher performance reply-shards: 1 # The number of threads to send replies (both fast/slow replies) index-sync-shards: 1 # The number of threads used by the leader to broadcast index synchronization messages to followers. For followers, they only need one such thread to receive and handle the index sync msgs receiver-port: 33333 # The port is exposed to client/proxy and is used to receive client/proxy requests. When receiver-shards>1, the corresponding ports are receiver-ports + shard-index index-sync-port: 35333 # The port is used for replicas to communicate index synchronization messages. Typically, the leader sends index sync msgs and the followers receive them request-ask-port: 36333 # When one follower is missing requests, they ask from the leader or other followers. The leader/followers send the missed requests to this port of this follower index-ask-port: 37333 # When one follower is missing some index sync msgs, it asks the leader (or other followers) for the missing ones, and the leader (or other followers) send to the index-sync-port of the problematic follower master-port: 34333 # This port is mainly used to send/receive other messages monitor-period-ms: 50 # Each blocking thread has a monitor timer, which periodically checks the replica status and unblock the thread (and exit) when replica status is not Normal main-loop-period-ms: 20 heartbeat-threshold-ms: 500 index-ask-period-ms: 10 # When missing indices, we launch a timer to periodically ask indices request-ask-period-ms: 10 # When missing logs, we launch a timer to periodically ask logs(requests) view-change-period-ms: 10 state-transfer-period-ms: 10 state-transfer-timeout-ms: 100000 # If the state-transfer has not been completed after such long time, then rollback to viewchange index-transfer-max-batch: 30 request-transfer-max-batch: 5 crash-vector-request-period-ms:
50 # make it longer, because it will clear all previous replies recovery-request-period-ms: 10 sync-report-period-ms: 10 key-num: 1000000 # when commutativity is enabled for test, we need the specified key number to detect commutativity owd-estimation-window: 1000 # we use a sliding window to get moving median of one-way delays, here is the length of the window ================================================ FILE: configs/local/nezha-client-config.yaml ================================================ --- client-id: 1 client-ip: "127.0.0.5" endpoint-type: 1 # 1 for UDP Endpoint, 2 for GRPC Endpoint [to be supported soon] request-port: 32912 is-openloop: true poisson-rate: 1 # it means the client sends x reqs/10ms on average (should be larger than 10, otherwise, the submission rate is not accurate) duration-sec: 60 # it means the duration of the client runs (second) key-num: 1000000 # when key-num is 1, it means there is no commutativity optimization skew-factor: 0.5 # 0-0.99 The higher the zipfian factor is, the more skewed the workload is write-ratio: 0.5 # 0-1, the ratio of write requests request-retry-time-us: 100000 # After the request is submitted, if we cannot get the response after such long time, then we will retry # proxy info proxy-ips: - "127.0.0.4" proxy-shards: 1 proxy-request-port-base: 32000 ================================================ FILE: configs/local/nezha-proxy-config.yaml ================================================ --- # Proxy Info proxy-endpoint-type: 1 # 1 for UDP Endpoint, 2 for GRPC Endpoint [to be supported soon] proxy-id: 1 proxy-ip: "127.0.0.4" proxy-shard-num: 1 proxy-max-owd: 200 proxy-request-port-base: 32000 proxy-reply-port-base: 33000 # Replica Info replica-ips: - "127.0.0.1" - "127.0.0.2" - "127.0.0.3" replica-receiver-shards: 1 # The number of threads to receive requests replica-receiver-port: 33333 # The port is exposed to client/proxy and is used to receive client/proxy requests.
When receiver-shards>1, the corresponding ports are receiver-ports + shard-index replica-initial-owd: 80 # The initial one-way delay (us) between replicas and proxies ================================================ FILE: configs/local/nezha-replica-config-0.yaml ================================================ --- print-config: true replica-ips: - "127.0.0.1" - "127.0.0.2" - "127.0.0.3" endpoint-type: 1 # 1 for UDP Endpoint, 2 for GRPC Endpoint [to be supported soon] replica-id: 0 receiver-shards: 1 # The number of threads to receive requests record-shards: 1 # The number of threads to record requests in the global concurrent map track-shards: 1 # The number of threads to record synced log entries reply-shards: 2 # The number of threads to send replies (both fast/slow replies) index-sync-shards: 1 # The number of threads used by the leader to broadcast index synchronization messages to followers. For followers, they only need one such thread to receive and handle the index sync msgs receiver-port: 33333 # The port is exposed to client/proxy and is used to receive client/proxy requests. When receiver-shards>1, the corresponding ports are receiver-ports + shard-index index-sync-port: 35333 # The port is used for replicas to communicate index synchronization messages. Typically, the leader sends index sync msgs and the followers receive them index-sync-period-us: 50 # For every period, the leader replica will send an index sync message to followers, serves for slow path and heartbeat request-ask-port: 36333 # When one follower is missing requests, they ask from the leader or other followers.
The leader/followers send the missed requests to this port of this follower index-ask-port: 37333 # When one follower is missing some index sync msgs, it asks the leader (or other followers) for the missing ones, and the leader (or other followers) send to the index-sync-port of the problematic follower master-port: 34333 # This port is mainly used to send/receive other messages monitor-period-ms: 50 # Each blocking thread has a monitor timer, which periodically checks the replica status and unblock the thread (and exit) when replica status is not Normal main-loop-period-ms: 20 heartbeat-threshold-ms: 500 index-ask-period-ms: 10 # When missing indices, we launch a timer to periodically ask indices request-ask-period-ms: 10 # When missing logs, we launch a timer to periodically ask logs(requests) view-change-period-ms: 10 state-transfer-period-ms: 10 state-transfer-timeout-ms: 100000 # If the state-transfer has not been completed after such long time, then rollback to viewchange index-transfer-batch: 30 request-key-transfer-batch: 60 request-transfer-batch: 5 crash-vector-request-period-ms: 50 # make it longer, because it will clear all previous replies recovery-request-period-ms: 10 sync-report-period-ms: 10 key-num: 1000000 # When commutativity is enabled for test, we need the specified key number to detect commutativity moving-percentile: 0.50 # the percentile used to estimate owd owd-estimation-window: 1000 # We use a sliding window to get moving median of one-way delays, here is the length of the window reclaim-timeout-ms: 5000 # To save memory, those requests who enter the late buffer will not stay there forever: if they have stayed for so long, they will be reclaimed.
Similarly, for unsynced log entries, if (1) they have been kept for more than reclaim-timeout-ms (2) they are not used by the worker threads; then they will be reclaimed ================================================ FILE: configs/local/nezha-replica-config-1.yaml ================================================ --- print-config: true replica-ips: - "127.0.0.1" - "127.0.0.2" - "127.0.0.3" endpoint-type: 1 # 1 for UDP Endpoint, 2 for GRPC Endpoint [to be supported soon] replica-id: 1 receiver-shards: 1 # The number of threads to receive requests record-shards: 1 # The number of threads to record requests in the global concurrent map track-shards: 1 # The number of threads to record synced log entries reply-shards: 2 # The number of threads to send replies (both fast/slow replies) index-sync-shards: 1 # The number of threads used by the leader to broadcast index synchronization messages to followers. For followers, they only need one such thread to receive and handle the index sync msgs receiver-port: 33333 # The port is exposed to client/proxy and is used to receive client/proxy requests. When receiver-shards>1, the corresponding ports are receiver-ports + shard-index index-sync-port: 35333 # The port is used for replicas to communicate index synchronization messages. Typically, the leader sends index sync msgs and the followers receive them index-sync-period-us: 50 # For every period, the leader replica will send an index sync message to followers, serves for slow path and heartbeat request-ask-port: 36333 # When one follower is missing requests, they ask from the leader or other followers.
The leader/followers send the missed requests to this port of this follower index-ask-port: 37333 # When one follower is missing some index sync msgs, it asks the leader (or other followers) for the missing ones, and the leader (or other followers) send to the index-sync-port of the problematic follower master-port: 34333 # This port is mainly used to send/receive other messages monitor-period-ms: 50 # Each blocking thread has a monitor timer, which periodically checks the replica status and unblock the thread (and exit) when replica status is not Normal main-loop-period-ms: 20 heartbeat-threshold-ms: 500 index-ask-period-ms: 10 # When missing indices, we launch a timer to periodically ask indices request-ask-period-ms: 10 # When missing logs, we launch a timer to periodically ask logs(requests) view-change-period-ms: 10 state-transfer-period-ms: 10 state-transfer-timeout-ms: 100000 # If the state-transfer has not been completed after such long time, then rollback to viewchange index-transfer-batch: 30 request-key-transfer-batch: 60 request-transfer-batch: 5 crash-vector-request-period-ms: 50 # make it longer, because it will clear all previous replies recovery-request-period-ms: 10 sync-report-period-ms: 10 key-num: 1000000 # When commutativity is enabled for test, we need the specified key number to detect commutativity moving-percentile: 0.50 # the percentile used to estimate owd owd-estimation-window: 1000 # We use a sliding window to get moving median of one-way delays, here is the length of the window reclaim-timeout-ms: 5000 # To save memory, those requests who enter the late buffer will not stay there forever: if they have stayed for so long, they will be reclaimed.
Similarly, for unsynced log entries, if (1) they have been kept for more than reclaim-timeout-ms (2) they are not used by the worker threads; then they will be reclaimed ================================================ FILE: configs/local/nezha-replica-config-2.yaml ================================================ --- print-config: true replica-ips: - "127.0.0.1" - "127.0.0.2" - "127.0.0.3" endpoint-type: 1 # 1 for UDP Endpoint, 2 for GRPC Endpoint [to be supported soon] replica-id: 2 receiver-shards: 1 # The number of threads to receive requests record-shards: 1 # The number of threads to record requests in the global concurrent map track-shards: 1 # The number of threads to record synced log entries reply-shards: 2 # The number of threads to send replies (both fast/slow replies) index-sync-shards: 1 # The number of threads used by the leader to broadcast index synchronization messages to followers. For followers, they only need one such thread to receive and handle the index sync msgs receiver-port: 33333 # The port is exposed to client/proxy and is used to receive client/proxy requests. When receiver-shards>1, the corresponding ports are receiver-ports + shard-index index-sync-port: 35333 # The port is used for replicas to communicate index synchronization messages. Typically, the leader sends index sync msgs and the followers receive them index-sync-period-us: 50 # For every period, the leader replica will send an index sync message to followers, serves for slow path and heartbeat request-ask-port: 36333 # When one follower is missing requests, they ask from the leader or other followers.
The leader/followers send the missed requests to this port of this follower index-ask-port: 37333 # When one follower is missing some index sync msgs, it asks the leader (or other followers) for the missing ones, and the leader (or other followers) send to the index-sync-port of the problematic follower master-port: 34333 # This port is mainly used to send/receive other messages monitor-period-ms: 50 # Each blocking thread has a monitor timer, which periodically checks the replica status and unblock the thread (and exit) when replica status is not Normal main-loop-period-ms: 20 heartbeat-threshold-ms: 500 index-ask-period-ms: 10 # When missing indices, we launch a timer to periodically ask indices request-ask-period-ms: 10 # When missing logs, we launch a timer to periodically ask logs(requests) view-change-period-ms: 10 state-transfer-period-ms: 10 state-transfer-timeout-ms: 100000 # If the state-transfer has not been completed after such long time, then rollback to viewchange index-transfer-batch: 30 request-key-transfer-batch: 60 request-transfer-batch: 5 crash-vector-request-period-ms: 50 # make it longer, because it will clear all previous replies recovery-request-period-ms: 10 sync-report-period-ms: 10 key-num: 1000000 # When commutativity is enabled for test, we need the specified key number to detect commutativity moving-percentile: 0.50 # the percentile used to estimate owd owd-estimation-window: 1000 # We use a sliding window to get moving median of one-way delays, here is the length of the window reclaim-timeout-ms: 5000 # To save memory, those requests who enter the late buffer will not stay there forever: if they have stayed for so long, they will be reclaimed.
Similarly, for unsynced log entries, if (1) they have been kept for more than reclaim-timeout-ms (2) they are not used by the worker threads; then they will be reclaimed ================================================ FILE: configs/nezha-client-config-template.yaml ================================================ --- print-config: true proxy-info: proxy-ips: - "127.0.0.4" proxy-shards: 12 request-port-base: 32000 client-info: endpoint-type: 1 # 1 for UDP Endpoint, 2 for GRPC Endpoint [to be supported soon] client-id: 1 client-ip: "127.0.0.5" request-port: 32912 is-openloop: true poisson-rate: 60 # it means the client sends x reqs/10ms on average (should be larger than 10, otherwise, the submission rate is not accurate) duration-sec: 60 # it means the duration of the client runs (second) key-num: 1000000 # when key-num is 1, it means there is no commutativity optimization skew-factor: 0.5 # 0-0.99 The higher the zipfian factor is, the more skewed the workload is write-ratio: 0.5 # 0-1, the ratio of write requests request-retry-time-us: 100000 # After the request is submitted, if we cannot get the response after such a long time, then we will retry ================================================ FILE: configs/nezha-proxy-config-template.yaml ================================================ --- print-config: true # Replica Info replica-info: replica-ips: - "127.0.0.1" - "127.0.0.2" - "127.0.0.3" receiver-shards: 2 # The number of threads to receive threads receiver-port: 33333 # The port is exposed to client/proxy and is used to receive client/proxy requests.
When receiver-shards>1, the corresponding ports are receiver-ports + shard-index initial-owd: 80 # The initial one-way delay (us) between replicas and proxies # Proxy Info proxy-info: endpoint-type: 1 # 1 for UDP Endpoint, 2 for GRPC Endpoint [to be supported soon] proxy-id: 1 proxy-ip: "127.0.0.4" shard-num: 12 max-owd: 200 request-port-base: 32000 reply-port-base: 33000 ================================================ FILE: configs/nezha-replica-config-template.yaml ================================================ --- print-config: true replica-ips: - "127.0.0.1" - "127.0.0.2" - "127.0.0.3" endpoint-type: 1 # 1 for UDP Endpoint, 2 for GRPC Endpoint [to be supported soon] replica-id: 0 receiver-shards: 2 # The number of threads to receive threads record-shards: 1 # The number of threads to record requests in the global concurrent map track-shards: 1 # The number of threads to record synced log entries process-shards: 1 # The number of threads to process requests. For now process-shards is fixed to 1, because the early-buffer enque/deque is hard to parallelize. Maybe later we can find a high-performant **concurrent priority queue** for early-buffer, then process-shards may be parallelized for higher performance reply-shards: 3 # The number of threads to send replies (both fast/slow replies) index-sync-shards: 1 # The number of threads used by the leader to broadcast index synchronization messages to followers. For followers, they only need one such thread to receive and handle the index sync msgs receiver-port: 33333 # The port is exposed to client/proxy and is used to receive client/proxy requests. When receiver-shards>1, the corresponding ports are receiver-ports + shard-index index-sync-port: 35333 # The port is used for replicas to communicate index synchronization messages. 
Typically, the leader sends index sync msgs and the followers receive them index-sync-period-us: 50 # For every period, the leader replica will send an index sync message to followers, serves for slow path and heartbeat request-ask-port: 36333 # When one follower is missing requests, they ask from the leader or other followers. The leader/followers send the missed requests to this port of this follower index-ask-port: 37333 # When one follower is missing some index sync msgs, it asks the leader (or other followers) for the missing ones, and the leader (or other followers) send to the index-sync-port of the problematic follower master-port: 34333 # This port is mainly used to send/receive other messages monitor-period-ms: 50 # Each blocking thread has a monitor timer, which periodically checks the replica status and unblock the thread (and exit) when replica status is not Normal main-loop-period-ms: 20 heartbeat-threshold-ms: 500 index-ask-period-ms: 10 # When missing indices, we launch a timer to periodically ask indices request-ask-period-ms: 10 # When missing logs, we launch a timer to periodically ask logs(requests) view-change-period-ms: 10 state-transfer-period-ms: 10 state-transfer-timeout-ms: 100000 # If the state-transfer has not been completed after such long time, then rollback to viewchange index-transfer-batch: 30 request-key-transfer-batch: 60 request-transfer-batch: 5 crash-vector-request-period-ms: 50 # make it longer, because it will clear all previous replies recovery-request-period-ms: 10 sync-report-period-ms: 10 key-num: 1000000 # When commutativity is enabled for test, we need the specified key number to detect commutativity moving-percentile: 0.90 # the percentile used to estimate owd (key spelled with a hyphen, matching the other replica configs) owd-estimation-window: 1000 # We use a sliding window to get moving median of one-way delays, here is the length of the window reclaim-timeout-ms: 5000 # To save memory, those requests who enter the late buffer will not stay there forever: if they have stayed for
so long, they will be reclaimed. Similarly, for unsynced log entries, if (1) they have been kept for more than reclaim-timeout-ms (2) they are not used by the worker threads; then they will be reclaimed ================================================ FILE: docs/Nezha.tla ================================================ `^\textbf{\large N TLA+ Specification}\\^' ------------------------------ MODULE Nezha ---------------------------------- EXTENDS Naturals, TLC, FiniteSets, Sequences -------------------------------------------------------------------------------- (* `^\textbf{\large Bounds for Model Check [Configurable]}^' *) \* Time Range [Configurable] MaxTime == 3 \* Each client is only allowed to submit MaxReqNum requests [Configurable] \* In the specification, we will only consider two roles, client and replicas \* (i.e. it can be considered as co-locating one proxy with one client) \* For the proxy-based design, we just need to replace client with proxy, \* and then the specification describes the interaction between proxy and replicas MaxReqNum == 1 \* The leader is only allowed to crash when the view < MaxViews [Configurable] MaxViews == 3 \* These variables are used to implement at-most-once primitives \* i.e.
The variables record the messages processed by Replicas/Clients, so \* that the Replicas/Clients will not process twice VARIABLE vReplicaProcessed, \* Messages that have been processed by replicas vClientProcessed \* Messages that have been processed by clients VARIABLE DebugAction (* `^\textbf{\large Constants}^' *) \* The set of replicas and an ordering of them CONSTANTS Replicas, ReplicaOrder, Clients, LatencyBounds ASSUME IsFiniteSet(Replicas) ASSUME ReplicaOrder \in Seq(Replicas) F == (Cardinality(Replicas) - 1) \div 2 ceilHalfF == IF (F \div 2) * 2 = F THEN F \div 2 ELSE (F+1) \div 2 floorHalfF == F \div 2 QuorumSize == F + 1 FastQuorumSize == F + ceilHalfF + 1 RecoveryQuorumSize == ceilHalfF + 1 FastQuorums == {R \in SUBSET(Replicas) : Cardinality(R) >= FastQuorumSize } Quorums == {R \in SUBSET(Replicas) : Cardinality(R) * 2 > Cardinality(Replicas)} \* Replica Statuses StNormal == 1 StViewChange == 2 StRecovering == 3 \* Message Types MClientRequest == 1 \* Sent by client to replicas MFastReply == 2 \* Fast Reply Message MSlowReply == 3 \* Slow Reply Message MLogIndex == 4 \* LogIndex MLogEntry == 5 \* Log entry, different from index, it includes command field, which can be large in practice MIndexSync == 6 \* Sync message during the index sync process MMissEntryRequest == 7 \* Sent by followers once they fail to find the entry on itself MMissEntryReply == 8 \* Response to MMissEntryRequest, providing the missing entries MViewChangeReq == 9 \* Sent when leader/sequencer failure detected MViewChange == 10 \* Sent to ACK view change MStartView == 11 \* Sent by new leader to start view \* The following messages are mainly used for periodic sync \* Just as described in NOPaxos, it is an optional optimization to enable fast recovery after failure MSyncPrepare == 12 \* Sent by the leader to ensure log durability MSyncRep == 13 \* Sent by followers as ACK MSyncCommit == 14 \* Sent by leaders to indicate stable log \* The following messages are mainly used for 
replica recovery MCrashVectorReq == 15 MCrashVectorRep == 16 MRecoveryReq == 17 MRecoveryRep == 18 MStateTransferReq == 19 MStateTransferRep == 20 (* `^\textbf{Message Schemas}^' ViewIDs == [ leaderNum |-> n \in (1..) ] \* <clientID, requestID, deadline> uniquely identifies one request on one replica \* But across replicas, the same <clientID, requestID> may have different deadlines \* (the leader may modify the deadline to make the request eligible to enter the early-buffer) \* so <clientID, requestID> uniquely identifies one request across replicas ClientRequest [ mtype |-> MClientRequest, sender |-> c \in Clients, dest |-> r \in Replicas, requestID |-> i \in (1..), command |-> "", s |-> t \in (1..MaxTime), l |-> l \in (1..MaxBound) ] \* logSlotNum is not necessary and it is not described in the paper \* Here we include logSlotNum in FastReply and SlowReply messages \* to facilitate the check of Linearizability invariant FastReply [ mtype |-> MFastReply, sender |-> r \in Replicas, dest |-> c \in Clients, viewID |-> v \in ViewIDs, requestID |-> i \in (1..vClientReqNum) hash |-> [ log |-> vLogs[1..n], cv |-> crashVector ] deadline |-> i \in (1..MaxTime+MaxBound), logSlotNum |-> n \in (1..) ] SlowReply [ mtype |-> MSlowReply, sender |-> r \in Replicas, dest |-> c \in Clients, viewID |-> v \in ViewIDs, requestID |-> i \in (1..vClientReqNum) logSlotNum |-> n \in (1..)
] LogIndex [ mtype |-> MLogIndex, clientID |-> c \in Clients, requestID |-> i \in (1..vClientReqNum), deadline |-> i \in (1..MaxTime+MaxBound), ] LogEntry [ mtype |-> MLogEntry, clientID |-> c \in Clients, requestID |-> i \in (1..vClientReqNum), deadline |-> i \in (1..MaxTime+MaxBound), command |-> "" ] IndexSync [ mtype |-> MIndexSync, sender |-> r \in Replicas, dest |-> c \in Clients, viewID |-> v \in ViewIDs, logindcies |-> index \in vLogs[leaderIdx] ] MMissEntryRequest [ mtype |-> MMissEntryRequest, sender |-> r \in Replicas, dest |-> d \in Replicas, viewID |-> v \in ViewIDs, miss |-> {log indices} ] MMissEntryRequest [ mtype |-> MMissEntryReply, sender |-> r \in Replicas, dest |-> d \in Replicas, viewID |-> v \in ViewIDs, entries |-> {log entries} ] ViewChangeReq [ mtype |-> MViewChangeReq, sender |-> r \in Replicas, dest |-> r \in Replicas, viewID |-> v \in ViewIDs, cv |-> crash vector ] ViewChange [ mtype |-> MViewChange, sender |-> r \in Replicas, dest |-> r \in Replicas, viewID |-> v \in ViewIDs, lastNormal |-> v \in ViewIDs, log |-> l \in vLogs[1..n], cv |-> crash vector ] StartView [ mtype |-> MStartView, dest |-> r \in Replicas, viewID |-> v \in ViewIDs, log |-> l \in vLogs[1..n], cv |-> crash vector ] SyncPrepare [ mtype |-> MSyncPrepare, dest |-> r \in Replicas, sender |-> r \in Replicas, viewID |-> v \in ViewIDs, log |-> l \in vLogs[1..n] ] SyncRep [ mtype |-> MSyncRep, dest |-> r \in Replicas, sender |-> r \in Replicas, viewID |-> v \in ViewIDs, logSlotNumber |-> n \in (1..) 
] SyncCommit [ mtype |-> MSyncCommit, dest |-> r \in Replicas, sender |-> r \in Replicas, viewID |-> v \in ViewIDs, log |-> l \in vLogs[1..n] ] CrashVectorReq [ mtype |-> MCrashVectorReq, sender |-> r \in Replicas, dest |-> r \in Replicas, nonce |-> nonce ] CrashVectorRep [ mtype |-> MCrashVectorRep, sender |-> r \in Replicas, dest |-> r \in Replicas, nonce |-> nonce, cv |-> vector of counters ] RecoveryReq [ mtype |-> MRecoveryReq, sender |-> r \in Replicas, dest |-> r \in Replicas, cv |-> vector of counters ] RecoveryRep [ mtype |-> MRecoveryRep, sender |-> r \in Replicas, dest |-> r \in Replicas, viewID |-> v \in ViewIDs, cv |-> vector of counters ] StateTransferReq [ mtype |-> MStateTransferReq, sender |-> r \in Replicas, dest |-> r \in Replicas, cv |-> vector of counters ] StateTransferRep [ mtype |-> MStateTransferRep, sender |-> r \in Replicas, dest |-> r \in Replicas, viewID |-> v \in ViewIDs, log |-> l \in vLogs[1..n] ], cv |-> vector of counters ] *) -------------------------------------------------------------------------------- (* `^\textbf{\large Variables}^' *) \* `^\textbf{Network State}^' VARIABLE messages \* Set of all messages sent networkVars == << messages >> InitNetworkState == messages = {} \* Used as a dummy value NULLLog == [ deadline |-> 0, clientID |-> 0, requestID |-> 0 ] \* `^\textbf{Replica State}^' VARIABLES vLog, \* Log of values vEarlyBuffer, \* The early buffer to hold request, \* and release it after clock passes its deadline (s+l) vReplicaStatus, \* One of StNormal, StViewChange, StRecovering vViewID, \* Current viewID replicas recognize vReplicaClock, \* Current Time of the replica vLastNormView, \* Last views in which replicas had status StNormal vViewChanges, \* Used for logging view change votes vSyncPoint, \* Latest synchronization point, \* to which the replica state (vLog) is consistent with the leader. 
vLateBuffer, \* The late buffer Used to store the requests \* which are not eligible to enter vEarlyBuffer vTentativeSync, \* Used by leader to mark current syncPrepare point (during periodic sync process) \* (Actually, vSyncPoint and vTentativeSync can be merged into one Var \* However, we decouple them to make the spec easy to understand) vSyncReps, \* Used for logging sync reps at leader vCommitPoint, \* Different from vSyncPoint, \* vCommitPoint indicates that the logs before this point has been replicated to majority \* So followers can safely execute requests (log entries) up to vCommitPoint \* Refer to ``Acceleration of Recovery" para in Sec 6 vUUIDCounter, \* Locally unique string (for CrashVectorReq) vCrashVector, \* CrashVector, initialized as all-zero vector vCrashVectorReps,\* CrashVectorRep Set vRecoveryReps \* RecoveryRep Set replicaVars == << vLog, vEarlyBuffer, vViewID, vReplicaClock, vLastNormView, vViewChanges,vReplicaStatus, vSyncPoint, vLateBuffer, vTentativeSync, vSyncReps, vCommitPoint, vUUIDCounter, vCrashVector, vCrashVectorReps, vRecoveryReps>> InitReplicaState == /\ vLog = [ r \in Replicas |-> << >> ] /\ vEarlyBuffer = [ r \in Replicas |-> {} ] /\ vViewID = [ r \in Replicas |-> 1 ] \* 0 should also be okay /\ vReplicaClock = [ r \in Replicas |-> 1 ] /\ vLastNormView = [ r \in Replicas |-> 1 ] /\ vViewChanges = [ r \in Replicas |-> {} ] /\ vReplicaStatus = [ r \in Replicas |-> StNormal ] /\ vSyncPoint = [ r \in Replicas |-> 0 ] /\ vLateBuffer = [ r \in Replicas |-> {} ] /\ vTentativeSync = [ r \in Replicas |-> 0 ] /\ vSyncReps = [ r \in Replicas |-> {} ] /\ vCommitPoint = [ r \in Replicas |-> 0 ] /\ vCrashVector = [ r \in Replicas |-> [ rr \in Replicas |-> 0] ] /\ vCrashVectorReps= [ r \in Replicas |-> {} ] /\ vRecoveryReps = [ r \in Replicas |-> {} ] /\ vUUIDCounter = [ c \in Replicas |-> 0 ] \* `^\textbf{Client State}^' VARIABLES vClientClock, \* Current Clock Time of the client vClientReqNum \* The number of requests that have been sent 
\* by this client

\* Every client starts with clock 1 and zero submitted requests
InitClientState ==
    /\ vClientClock = [ c \in Clients |-> 1 ]
    /\ vClientReqNum = [ c \in Clients |-> 0 ]

clientVars == << vClientClock, vClientReqNum >>

\* `^\textbf{Set of all vars}^'
vars == << networkVars, replicaVars, clientVars >>

\* `^\textbf{Initial state}^'
Init ==
    /\ InitNetworkState
    /\ InitReplicaState
    /\ InitClientState
    /\ vReplicaProcessed = [ r \in Replicas |-> {} ]
    /\ vClientProcessed = [ c \in Clients |-> {} ]
    /\ DebugAction = <<"Init", "">>

--------------------------------------------------------------------------------
(* `^\textbf{\large Helpers}^' *)

\* Number of replicas whose current status equals `status`
NumofReplicas(status) == Cardinality({ r \in Replicas : vReplicaStatus[r] = status })

\* TRUE iff ReplySet already contains a message with the same sender as m
\* (m must be a message record, not a bare sender)
DuplicateRep(ReplySet, m) == m.sender \in { mm.sender : mm \in ReplySet }

\* Some element of S (S must be non-empty)
Pick(S) == CHOOSE s \in S : TRUE

\* Convert a Set to Sequence
RECURSIVE Set2Seq(_)
Set2Seq(S) ==
    IF Cardinality(S) = 0 THEN <<>>
    ELSE LET x == CHOOSE e \in S : TRUE
         IN  <<x>> \o Set2Seq(S \ {x})   \* prepend the chosen element, recurse on the rest

\* Convert a Sequence to Set
Seq2Set(seq) == { seq[i] : i \in DOMAIN seq }

Max(S) == CHOOSE x \in S : \A y \in S : x >= y
Min(S) == CHOOSE x \in S : \A y \in S : x <= y

\* `^\textbf{View ID Helpers}^'
LeaderID(viewID) == (viewID % Len(ReplicaOrder)) + (IF viewID >= Len(ReplicaOrder) THEN 1 ELSE 0)
Leader(viewID) == ReplicaOrder[LeaderID(viewID)] \* remember <<>> are 1-indexed

\* `^\textbf{Log Manipulation Helpers}^'
\* The order of 2 log entries is decided by the tuple <deadline, clientID, requestID>
\* Usually, deadline makes the two entries comparable
\* When 2 different entries have the same deadline, the tie is broken with clientID
\* Further, the tie is broken with requestID
\* (unnecessary if we only allow client to submit one request at one tick)
\* The comparison is lexicographic, so any two entries are comparable; a plain
\* component-wise conjunction would leave e.g. (deadline 1, client 2) and
\* (deadline 2, client 1) unordered.
EntryLeq(l1, l2) ==
    \/ l1.deadline < l2.deadline
    \/ /\ l1.deadline = l2.deadline
       /\ l1.clientID < l2.clientID
    \/ /\ l1.deadline = l2.deadline
       /\ l1.clientID = l2.clientID
       /\ l1.requestID <= l2.requestID

EntryEq(l1, l2) ==
    /\ l1.deadline = l2.deadline
    /\ l1.clientID = l2.clientID
    /\ l1.requestID = l2.requestID

EntryLessThan(l1, l2) ==
    /\ EntryLeq(l1, l2)
    /\ ~(EntryEq(l1, l2))

\* Find entry in one replica's log (<clientID, requestID> can uniquely identify the log entry)
\* We do not check deadline, because the leader may have modified the request's deadline
\* Return 0 when we fail to find it (remember Sequence is 1-indexed in TLA+, so 0 can serve as a dummy value)
FindEntry(clientID, reqID, log) ==
    LET entryIndexSet == { i \in 1..Len(log) :
                              /\ log[i].clientID = clientID
                              /\ log[i].requestID = reqID }   \* log entries carry `requestID`, not `reqID`
    IN  IF Cardinality(entryIndexSet) = 0 THEN 0
        ELSE Pick(entryIndexSet)

\* Sort a sequence of log entries by EntryLessThan
SortLogSeq(seq) == SortSeq(seq, LAMBDA x, y: EntryLessThan(x, y) )

\* Given a set of logs, return the sorted log list
GetSortLogSeq(S) ==
    LET seq == Set2Seq(S)
    IN  SortLogSeq(seq)

(*
  Merge logs, first put all log items together, deduplicated (i.e. UNION them into a set).
  Then, do filtering and only keep those that have appeared in at least
  `^\left \lceil{f/2}\right \rceil +1^' replicas.
*)
CountVotes(logll, x) == Cardinality({ logSet \in logll : x \in logSet })

MergeUnSyncLogs(unSyncedLogs, lastSyncedLog) ==
    LET unSyncedLogSet == UNION unSyncedLogs
        votedLogSet    == { x \in unSyncedLogSet :
                               /\ EntryLessThan(lastSyncedLog, x)
                               /\ CountVotes(unSyncedLogs, x) >= RecoveryQuorumSize }
    IN  GetSortLogSeq(votedLogSet)

\* `^\textbf{Network Helpers}^'
\* Add a message to the network
Send(ms) == messages' = messages \cup ms

\* Convert the request format to a log format (by summing up s and l to get deadline)
Req2Log(req) == [ mtype     |-> MLogEntry,
                  deadline  |-> req.s + req.l,
                  clientID  |-> req.sender,
                  requestID |-> req.requestID,
                  command   |-> req.command ]

\* Index does not need to include command field, which is the body of the request/log, and can be very large
GetLogIndex(entry) == [ mtype     |-> MLogIndex,
                        deadline  |-> entry.deadline,
                        clientID  |-> entry.clientID,
                        requestID |-> entry.requestID ]

\* Build the log index a reply refers to; the reply's dest is the client
GetLogIndexFromReply(reply) == [ mtype     |-> MLogIndex,
                                 deadline  |-> reply.deadline,
                                 clientID  |-> reply.dest,
                                 requestID |-> reply.requestID ]

IndexEq(index, msg) ==
    /\ index.deadline = msg.deadline
    /\ index.clientID = msg.clientID
    /\ index.requestID = msg.requestID

\* Add
\* local time to the message (for easy debug)
Msg2RLog(msg, r) == msg @@ [tl |-> vReplicaClock[r]]

\* Last entry of a log sequence, or the NULLLog dummy when the log is empty
LastLog(logList) == IF Len(logList) = 0 THEN NULLLog
                    ELSE logList[Len(logList)]

\* Element-wise maximum of two crash vectors
MergeCrashVector(cv1, cv2) == [ r \in Replicas |-> Max({cv1[r], cv2[r]}) ]

\* Reject m when its sender's counter is behind what r already knows;
\* otherwise merge m's crash vector into r's (this constrains vCrashVector')
CheckCrashVector(m, r) ==
    IF m.cv[m.sender] < vCrashVector[r][m.sender]
    THEN FALSE \* Potential stray message
    ELSE vCrashVector' = [ vCrashVector EXCEPT ![r] = MergeCrashVector(m.cv, vCrashVector[r]) ]

\* Keep only the messages whose sender counter is not behind cv
FilterStrayMessage(MSet, cv) == { m \in MSet : m.cv[m.sender] >= cv[m.sender] }

--------------------------------------------------------------------------------
(* `^\textbf{\large Message Handlers and Actions }^' *)

\* `^\textbf{Client action}^'

\* Client c sends a request
\* We assume client can only send one request in one tick of time
\* If time has reached the bound, this client cannot send request any more
ClientSendRequest(c) ==
    /\ vClientClock[c] < MaxTime
    /\ vClientReqNum[c] < MaxReqNum
    /\ Send({[ mtype     |-> MClientRequest,
               sender    |-> c,                    \* clientID
               requestID |-> vClientReqNum[c] + 1, \* requestID
               command   |-> "",
               s         |-> vClientClock[c],      \* submission time
               l         |-> LatencyBounds[c],     \* latency bound
               dest      |-> r ] : r \in Replicas })
    /\ vClientClock' = [ vClientClock EXCEPT ![c] = vClientClock[c] + 1 ]
    /\ vClientReqNum' = [ vClientReqNum EXCEPT ![c] = vClientReqNum[c] + 1 ]
    /\ UNCHANGED << replicaVars >>

\* TRUE iff logSet already holds an entry with the same <clientID, requestID>
Duplicate(entry, logSet) ==
    LET findSet == { x \in logSet :
                        /\ x.clientID = entry.clientID
                        /\ x.requestID = entry.requestID }
    IN  Cardinality(findSet) > 0

\* Replica r receives MClientRequest, m
HandleClientRequest(r, m) ==
    LET mlog == Req2Log(m)
    IN
    \* If the request is duplicate, it will no longer be appended to the log
    \* Replicas simply reply the previous execution result of this request
    \* (we do not model execution in this spec)
    /\ ~Duplicate(mlog, Seq2Set(vLog[r]) \union vEarlyBuffer[r])
    /\ vReplicaStatus[r] = StNormal
    \* The request can enter the early buffer
    /\ \/ /\ EntryLessThan(LastLog(vLog[r]), mlog)
          /\ vEarlyBuffer' = [ vEarlyBuffer EXCEPT ![r] = vEarlyBuffer[r] \cup { mlog } ]
          /\ UNCHANGED << networkVars, clientVars,
                          vLog, vViewID, vReplicaClock,
                          vLastNormView, vViewChanges, vReplicaStatus,
                          vSyncPoint, vLateBuffer,
                          vTentativeSync, vSyncReps, vCommitPoint,
                          vUUIDCounter, vCrashVector,
                          vCrashVectorReps, vRecoveryReps >>
       \* (1) Followers' early buffers do not accept the request
       \*     if its deadline is smaller than previously appended (last released) entry,
       \*     so followers directly put the request into the late buffer
       \* (2) Leader modifies its deadline to be larger than the last released entry
       \*     so as to make it eligible for entering the early buffer
       \/ /\ EntryLessThan(mlog, LastLog(vLog[r]))
          /\ IF r = Leader(vViewID[r])
             THEN \* this replica is the leader in the current view
                  /\ vEarlyBuffer' = [ vEarlyBuffer EXCEPT ![r] = vEarlyBuffer[r] \cup
                                         {[ mtype     |-> MLogEntry,
                                            clientID  |-> mlog.clientID,
                                            requestID |-> mlog.requestID,
                                            deadline  |-> LastLog(vLog[r]).deadline + 1,
                                            command   |-> mlog.command ]} ]
                  /\ UNCHANGED << networkVars, clientVars,
                                  vLog, vViewID, vReplicaClock,
                                  vLastNormView, vViewChanges, vReplicaStatus,
                                  vSyncPoint, vLateBuffer,
                                  vTentativeSync, vSyncReps, vCommitPoint,
                                  vUUIDCounter, vCrashVector,
                                  vCrashVectorReps, vRecoveryReps >>
             ELSE \* this replica is a follower in the current view
                  /\ vLateBuffer' = [ vLateBuffer EXCEPT ![r] = vLateBuffer[r] \cup { mlog } ]
                  /\ UNCHANGED << networkVars, clientVars,
                                  vLog, vEarlyBuffer, vViewID, vReplicaClock,
                                  vLastNormView, vViewChanges, vReplicaStatus,
                                  vSyncPoint,
                                  vTentativeSync, vSyncReps, vCommitPoint,
                                  vUUIDCounter, vCrashVector,
                                  vCrashVectorReps, vRecoveryReps >>

\* Release relevant requests from vEarlyBuffer and append to vLog,
\* and then send a fast reply
FlushEarlyBuffer(r) ==
    LET validLogSet == { x \in vEarlyBuffer[r] :
                            /\ x.deadline < vReplicaClock[r] \* < rather than <=
                            /\ EntryLessThan(LastLog(vLog[r]), x) }
        validLogs   == GetSortLogSeq(validLogSet)
        newLogStart == Len(vLog[r]) + 1
    IN
    /\ vLog' = [ vLog EXCEPT ![r] = vLog[r] \o validLogs ]
    /\ vEarlyBuffer' = [ vEarlyBuffer EXCEPT ![r] =
                           { x \in vEarlyBuffer[r] : x.deadline >= vReplicaClock[r] } ] \* >= rather than >
    /\ Send({[ mtype      |-> MFastReply,
               sender     |-> r,
               dest       |-> vLog'[r][i].clientID,
               viewID     |-> vViewID[r],
               requestID  |-> vLog'[r][i].requestID,
               \* the hash covers this replica's own crash vector,
               \* not the crash-vector state of every replica
               hash       |-> [ log |-> SubSeq(vLog'[r], 1, i),
                                cv  |-> vCrashVector[r] ],
               deadline   |-> vLog'[r][i].deadline,
               logSlotNum |-> i ] : i \in newLogStart..Len(vLog'[r]) })
    /\ IF r = Leader(vViewID[r])
       THEN \* the leader's log is synced by definition, so its sync point follows the log
            /\ vSyncPoint' = [ vSyncPoint EXCEPT ![r] = Len(vLog'[r]) ]
            /\ UNCHANGED << clientVars, vViewID, vLastNormView,
                            vViewChanges, vReplicaStatus, vReplicaClock,
                            vLateBuffer, vTentativeSync, vSyncReps, vCommitPoint,
                            vUUIDCounter, vCrashVector,
                            vCrashVectorReps, vRecoveryReps >>
       ELSE UNCHANGED << clientVars, vViewID, vLastNormView,
                         vViewChanges, vReplicaStatus, vReplicaClock,
                         vSyncPoint, vLateBuffer, vTentativeSync, vSyncReps, vCommitPoint,
                         vUUIDCounter, vCrashVector,
                         vCrashVectorReps, vRecoveryReps >>

\* Clock can be random value (RandomElement(1..MaxTime)),
\* because clock sync algorithm can give negative offset, or even fails
\* But Nezha depend on clock for performance but not for correctness
\* If the replica clock goes beyond MaxTime, it will stop processing
\* Since Clock is moved, then replicas can release relevant requests and append to logs
ReplicaClockMove(r) ==
    /\ IF vReplicaClock[r] < MaxTime
       THEN vReplicaClock' = [ vReplicaClock EXCEPT ![r] = RandomElement(1..MaxTime) ]
       ELSE UNCHANGED vReplicaClock
    /\ UNCHANGED << networkVars, clientVars,
                    vLog, vEarlyBuffer, vViewID,
                    vLastNormView, vViewChanges, vReplicaStatus,
                    vSyncPoint, vLateBuffer,
                    vTentativeSync, vSyncReps, vCommitPoint,
                    vUUIDCounter, vCrashVector,
                    vCrashVectorReps, vRecoveryReps >>

\* Client clock move does not change any other things
ClientClockMove(c) ==
    /\ IF vClientClock[c] < MaxTime
       THEN vClientClock' = [ vClientClock EXCEPT ![c] = RandomElement(1..MaxTime) ]
       ELSE UNCHANGED vClientClock
    \* only vClientClock may change: the network, every replica variable and
    \* the per-client request counter stay as they are
    /\ UNCHANGED << networkVars, replicaVars, vClientReqNum >>
--------------------------------------------------------------------------------
\* `^\textbf{\large Index Synchronization to Fix Set Inequality}^'

\* Leader replica r starts index synchronization
StartIndexSync(r) ==
    LET indices == { GetLogIndex(vLog[r][i]) : i \in 1..Len(vLog[r]) }
    IN
    /\ r = Leader(vViewID[r])
    /\ vReplicaStatus[r] = StNormal
    /\ Cardinality(indices) > 0 \* leader has log entries to sync
    /\ Send({[ mtype      |-> MIndexSync,
               sender     |-> r,
               dest       |-> d,
               viewID     |-> vViewID[r],
               logindcies |-> indices ] : d \in Replicas })
    /\ UNCHANGED << clientVars, replicaVars >>

\* Entries of logSeq that match one of the given indices, as a sorted sequence
GetSyncLogs(logSeq, indices) ==
    LET logSet == { l \in Seq2Set(logSeq) :
                       \E index \in indices : EntryEq(index, l) }
    IN  GetSortLogSeq(logSet)

\* Entries of logSeq ordered strictly after lastSyncedLog, as a sorted sequence
GetUnSyncLogs(logSeq, lastSyncedLog) ==
    LET logSet == { l \in Seq2Set(logSeq) : EntryLessThan(lastSyncedLog, l) }
    IN  GetSortLogSeq(logSet)

\* Replica r receives IndexSync message, m
HandleIndexSync(r, m) ==
    /\ r /= Leader(vViewID[r])
    /\ vReplicaStatus[r] = StNormal
    /\ m.viewID = vViewID[r]
    /\ m.sender = Leader(vViewID[r])
    \* m.logindcies is a set (see StartIndexSync), so its size is a Cardinality, not a Len
    /\ vSyncPoint[r] < Cardinality(m.logindcies)
    /\ LET indices       == { GetLogIndex(vLog[r][i]) : i \in 1..Len(vLog[r]) }
           \* indices announced by the leader that this replica does not hold
           missedEntries == m.logindcies \ indices
       IN
       \* Missing some log entries -> Send MMissEntryRequest
       IF Cardinality(missedEntries) > 0
       THEN /\ Send({[ mtype  |-> MMissEntryRequest,
                       sender |-> r,
                       dest   |-> d,
                       viewID |-> vViewID[r],
                       miss   |-> missedEntries ] : d \in (Replicas \ {r}) })
            /\ UNCHANGED << vLog, vSyncPoint >>
       \* No missing entries, update vLog and vSyncPoint, and send relevant slow replies
       ELSE \* synced entries are the ones named by the leader's indices; filtering by
            \* the replica's own indices would mark its whole log as synced
            LET syncLogs   == GetSyncLogs(vLog[r], m.logindcies)
                unsyncLogs == GetUnSyncLogs(vLog[r], LastLog(syncLogs))
            IN
            /\ vLog' = [ vLog EXCEPT ![r] = syncLogs \o unsyncLogs ]
            /\ vSyncPoint' = [ vSyncPoint EXCEPT ![r] = Len(syncLogs) ]
            /\ Send({[ mtype      |-> MSlowReply,
                       sender     |-> r,
                       dest       |-> vLog'[r][i].clientID,
                       viewID     |-> vViewID[r],
                       requestID  |-> vLog'[r][i].requestID,
                       logSlotNum |-> i ] : i \in (1..Len(syncLogs)) })
    /\ UNCHANGED << clientVars, vEarlyBuffer, vViewID, vReplicaClock,
                    vLastNormView, vViewChanges, vReplicaStatus,
                    vLateBuffer, vTentativeSync, vSyncReps, vCommitPoint,
                    vUUIDCounter, vCrashVector,
                    vCrashVectorReps, vRecoveryReps >>

\* Entries of log that match one of the given indices
FindEntries(log, indices) == { l \in Seq2Set(log) : \E x \in indices : IndexEq(l, x) }

\* Replica r receives a request from other replicas, asking for a missing log entry
HandleMissEntryRequest(r, m) ==
    /\ m.viewID = vViewID[r]
    /\ LET findentries == FindEntries(vLog[r], m.miss)
       IN
       /\ Cardinality(findentries) > 0
       /\ Send({[ mtype   |-> MMissEntryReply,
                  sender  |-> r,
                  dest    |-> m.sender,
                  viewID  |-> vViewID[r],
                  entries |-> findentries ]})
    /\ UNCHANGED << clientVars, replicaVars >>

\* Replica r receives a reply from other replicas, providing the missing entries
HandleMissEntryReply(r, m) ==
    /\ m.viewID = vViewID[r]
    /\ LET mergedSet == Seq2Set(vLog[r]) \union m.entries
       IN  vLog' = [ vLog EXCEPT ![r] = GetSortLogSeq(mergedSet) ]
    /\ UNCHANGED << networkVars, clientVars,
                    vEarlyBuffer, vViewID, vReplicaClock,
                    vLastNormView, vViewChanges, vReplicaStatus,
                    vSyncPoint, vLateBuffer,
                    vTentativeSync, vSyncReps, vCommitPoint,
                    vUUIDCounter, vCrashVector,
                    vCrashVectorReps, vRecoveryReps >>

--------------------------------------------------------------------------------
\* `^\textbf{\large Replica Rejoin}^'

\* Failed replica loses all states
StartReplicaFail(r) ==
    /\ NumofReplicas(StRecovering) < F \* We assume at most F replicas can fail at the same time
    /\ vReplicaStatus' = [ vReplicaStatus EXCEPT ![r] = StRecovering ]
    /\ vLog' = [ vLog EXCEPT ![r] = <<>> ]
    /\ vEarlyBuffer' = [ vEarlyBuffer EXCEPT ![r] = {} ]
    /\ vViewID' = [ vViewID EXCEPT ![r] = 1 ]
    /\ vLastNormView' = [ vLastNormView EXCEPT ![r] = 1 ]
    /\ vViewChanges' = [ vViewChanges EXCEPT ![r] = {} ]
    /\ vSyncPoint' = [ vSyncPoint EXCEPT ![r] = 0 ]
    /\ vLateBuffer' = [ vLateBuffer EXCEPT ![r] = {} ]
    /\ vTentativeSync' = [ vTentativeSync EXCEPT ![r] = 0 ]
    /\ vSyncReps' = [ vSyncReps EXCEPT ![r] = {} ]
    /\ vCommitPoint' = [ vCommitPoint EXCEPT ![r] = 0 ]
    /\ vCrashVector' = [ vCrashVector EXCEPT ![r] = [ rr \in Replicas |-> 0 ] ]
    /\ vCrashVectorReps' = [ vCrashVectorReps EXCEPT ![r] = {} ]
    /\ vRecoveryReps' = [ vRecoveryReps EXCEPT ![r] = {} ]
    /\ UNCHANGED << vReplicaClock, vUUIDCounter, clientVars, networkVars >>

\* Recovering replica starts recovery (by first sending CrashVectorReq)
StartReplicaRecovery(r) ==
    /\ vReplicaStatus[r] = StRecovering
    /\ vUUIDCounter' = [ vUUIDCounter EXCEPT ![r] = vUUIDCounter[r] + 1 ]
    /\ Send({[ mtype  |-> MCrashVectorReq,
               sender |-> r,
               dest   |-> d,
               nonce  |-> vUUIDCounter'[r] ] : d \in Replicas })
    /\ UNCHANGED << vLog, vEarlyBuffer, vViewID, vReplicaClock,
                    vLastNormView, vViewChanges, vReplicaStatus,
                    vSyncPoint, vLateBuffer,
                    vTentativeSync, vSyncReps, vCommitPoint,
                    vCrashVector, vCrashVectorReps, vRecoveryReps,
                    clientVars >>

\* Normal replica r answers a CrashVectorReq with its own crash vector
HandleCrashVectorReq(r, m) ==
    /\ vReplicaStatus[r] = StNormal
    /\ Send({[ mtype  |-> MCrashVectorRep,
               sender |-> r,
               dest   |-> m.sender,
               nonce  |-> m.nonce,
               cv     |-> vCrashVector[r] ]})
    /\ UNCHANGED << replicaVars, clientVars >>

\* Recovering replica r collects CrashVectorReps; once F + 1 distinct senders
\* have replied it broadcasts a RecoveryReq carrying the merged crash vector
HandleCrashVectorRep(r, m) ==
    /\ vReplicaStatus[r] = StRecovering
    /\ vUUIDCounter[r] = m.nonce
    /\ Cardinality(vCrashVectorReps[r]) <= F
    /\ ~DuplicateRep(vCrashVectorReps[r], m)
    /\ vCrashVectorReps' = [ vCrashVectorReps EXCEPT ![r] = vCrashVectorReps[r] \cup {m} ]
    /\ vCrashVector' = [ vCrashVector EXCEPT ![r] = MergeCrashVector(vCrashVector[r], m.cv) ]
    \* count r's own reply set; vCrashVectorReps' as a whole is a function over Replicas
    /\ IF Cardinality(vCrashVectorReps'[r]) = F + 1
       THEN \* got enough replies and can settle down cv
            Send({[ mtype  |-> MRecoveryReq,
                    sender |-> r,
                    dest   |-> d,
                    nonce  |-> m.nonce,
                    cv     |-> vCrashVector'[r] ] : d \in Replicas })
       ELSE UNCHANGED << networkVars >>
    /\ UNCHANGED << vLog, vEarlyBuffer, vViewID, vReplicaClock,
                    vLastNormView, vViewChanges, vReplicaStatus,
                    vSyncPoint, vLateBuffer,
                    vTentativeSync, vSyncReps, vCommitPoint,
                    vUUIDCounter, vRecoveryReps,
                    clientVars >>

\* Normal replica r merges the requester's crash vector and replies with its view
HandleRecoveryReq(r, m) ==
    /\ vReplicaStatus[r] = StNormal
    /\ vCrashVector' = [ vCrashVector EXCEPT ![r] = MergeCrashVector(vCrashVector[r], m.cv) ]
    \* a single reply addressed to the requester
    /\ Send({[ mtype  |-> MRecoveryRep,
               sender |-> r,
               dest   |-> m.sender,
               viewID |-> vViewID[r],
               cv     |-> vCrashVector'[r] ]})
    /\ UNCHANGED << vLog, vEarlyBuffer, vViewID, vReplicaClock,
                    vLastNormView, vViewChanges, vReplicaStatus,
                    vSyncPoint, vLateBuffer,
                    vTentativeSync, vSyncReps, vCommitPoint,
                    vUUIDCounter, vCrashVectorReps, vRecoveryReps,
                    clientVars >>

\* Recovering replica r collects RecoveryReps; once F + 1 non-stray replies are
\* held it asks the leader of the highest reported view for a state transfer
HandleRecoveryRep(r, m) ==
    /\ vReplicaStatus[r] = StRecovering
    /\ Cardinality(vRecoveryReps[r]) <= F
    \* DuplicateRep reads m.sender itself, so it must be given the message
    /\ ~DuplicateRep(vRecoveryReps[r], m)
    /\ CheckCrashVector(m, r)
    (* `~
    /\ vRecoveryReps' = [ vRecoveryReps EXCEPT ![r] = vRecoveryReps[r] \cup {m} ]
    ~' *)
    \* Note: After crash vector is updated, those previously accepted messages may also become stray message.
    \* Those messages should also be filtered out.
    /\ vRecoveryReps' = [ vRecoveryReps EXCEPT ![r] =
                            FilterStrayMessage(vRecoveryReps[r] \cup {m}, vCrashVector'[r]) ]
    \* count r's own reply set; vRecoveryReps' as a whole is a function over Replicas
    /\ IF Cardinality(vRecoveryReps'[r]) = F + 1
       THEN \* got enough replies
            LET newView == Max({ mm.viewID : mm \in vRecoveryReps'[r] })
            IN  \* address the leader replica of newView, consistent with how
                \* every other message picks its destination
                Send({[ mtype  |-> MStateTransferReq,
                        sender |-> r,
                        dest   |-> Leader(newView),
                        cv     |-> vCrashVector'[r] ]})
       ELSE UNCHANGED << networkVars >>
    /\ UNCHANGED << vLog, vEarlyBuffer, vViewID, vReplicaClock,
                    vLastNormView, vViewChanges, vReplicaStatus,
                    vSyncPoint, vLateBuffer,
                    vTentativeSync, vSyncReps, vCommitPoint,
                    vUUIDCounter, vCrashVectorReps,
                    clientVars >>

\* Normal replica r ships its log, sync/commit points and view to the requester
HandleStateTransferReq(r, m) ==
    /\ vReplicaStatus[r] = StNormal
    /\ CheckCrashVector(m, r)
    /\ Send({[ mtype  |-> MStateTransferRep,
               sender |-> r,
               dest   |-> m.sender,
               viewID |-> vViewID[r],   \* HandleStateTransferRep reads m.viewID
               log    |-> vLog[r],
               sp     |-> vSyncPoint[r],
               cp     |-> vCommitPoint[r],
               cv     |-> vCrashVector'[r] ]})
    /\ UNCHANGED << vLog, vEarlyBuffer, vViewID, vReplicaClock,
                    vLastNormView, vViewChanges, vReplicaStatus,
                    vSyncPoint, vLateBuffer,
                    vTentativeSync, vSyncReps, vCommitPoint,
                    vUUIDCounter, vCrashVectorReps, vRecoveryReps,
                    clientVars >>

\* Recovering replica r installs the transferred state and returns to StNormal
HandleStateTransferRep(r, m) ==
    /\ vReplicaStatus[r] = StRecovering
    /\ CheckCrashVector(m, r)
    /\ vLog' = [ vLog EXCEPT ![r] = m.log ]
    /\ vSyncPoint' = [ vSyncPoint EXCEPT ![r] = m.sp ]
    /\ vCommitPoint' = [ vCommitPoint EXCEPT ![r] = m.cp ]
    /\ vViewID' = [ vViewID EXCEPT ![r] = m.viewID ]
    /\ vEarlyBuffer' = [ vEarlyBuffer EXCEPT ![r] = {} ]
    /\ vLastNormView' = [ vLastNormView EXCEPT ![r] = m.viewID ]
    /\ vViewChanges' = [ vViewChanges EXCEPT ![r] = {} ]
    /\ vReplicaStatus' = [ vReplicaStatus EXCEPT ![r] = StNormal ]
    /\ vLateBuffer' = [ vLateBuffer EXCEPT ![r] = {} ]
    /\ vTentativeSync' = [ vTentativeSync EXCEPT ![r] = m.sp ]
    /\ vSyncReps' = [ vSyncReps EXCEPT ![r] = {} ]
    /\ vCrashVectorReps' = [ vCrashVectorReps EXCEPT ![r] = {} ]
    /\ vRecoveryReps' = [ vRecoveryReps EXCEPT ![r] = {} ]
    \* nothing is sent here, so the network must be pinned explicitly
    /\ UNCHANGED << networkVars, vReplicaClock, vUUIDCounter, clientVars >>

--------------------------------------------------------------------------------
\* `^\textbf{\large Leader Change}^'

\* Replica r starts a Leader change
StartLeaderChange(r) ==
    /\ Send({[ mtype  |-> MViewChangeReq,
               sender |-> r,
               dest   |-> d,
               viewID |-> vViewID[r] + 1,
               cv     |-> vCrashVector[r] ] : d \in Replicas })
    /\ UNCHANGED << replicaVars, clientVars >>

\* `^\textbf{View Change Handlers}^'
\* Replica r gets MViewChangeReq, m
HandleViewChangeReq(r, m) ==
    LET currentViewID == vViewID[r]
        newViewID     == Max({currentViewID, m.viewID})
        newLeaderNum  == LeaderID(newViewID)
    IN
    \* Recovering replica does not participate in view change
    /\ vReplicaStatus[r] /= StRecovering
    /\ currentViewID /= newViewID
    /\ CheckCrashVector(m, r)
    /\ vReplicaStatus' = [ vReplicaStatus EXCEPT ![r] = StViewChange ]
    /\ vViewID' = [ vViewID EXCEPT ![r] = newViewID ]
    /\ vViewChanges' = [ vViewChanges EXCEPT ![r] = {} ]
    /\ Send({[ mtype       |-> MViewChange,
               dest        |-> Leader(newViewID),
               sender      |-> r,
               viewID      |-> newViewID,
               lastNormal  |-> vLastNormView[r],
               syncedLog   |-> SubSeq(vLog[r], 1, vSyncPoint[r]),
               unsyncedLog |-> SubSeq(vLog[r], vSyncPoint[r]+1, Len(vLog[r])),
               cv          |-> vCrashVector[r] ]} \cup
            \* Send the MViewChangeReqs in case this is an entirely new view
            {[ mtype  |-> MViewChangeReq,
               sender |-> r,
               dest   |-> d,
               viewID |-> newViewID,
               cv     |-> vCrashVector[r] ] : d \in Replicas })
    /\ UNCHANGED << clientVars, vLog, vEarlyBuffer, vReplicaClock,
                    vLastNormView, vSyncPoint, vLateBuffer,
                    vTentativeSync, vSyncReps, vCommitPoint,
                    vUUIDCounter, vCrashVectorReps, vRecoveryReps >>

\* Replica
r receives MViewChange, m HandleViewChange(r, m) == \* Recovering replica does not participate in view change /\ vReplicaStatus[r] /= StRecovering \* Add the message to the log /\ vViewID[r] = m.viewID /\ vReplicaStatus[r] = StViewChange \* This replica is the leader /\ Leader(vViewID[r]) = r /\ CheckCrashVector(m, r) (* `~ /\ vViewChanges' = [ vViewChanges EXCEPT ![r] = vViewChanges[r] \cup {m}] ~' *) \* Note: Similar to vRecoveryReps, (potential) stray messages should be filtered out. /\ vViewChanges' = [ vViewChanges EXCEPT ![r] = FilterStrayMessage(vViewChanges[r] \cup {m}, vCrashVector'[r]) ] \* If there's enough replies, start the new view /\ LET isViewPromise(M) == /\ { n.sender : n \in M } \in Quorums /\ \E n \in M : n.sender = r vCMs == { n \in vViewChanges'[r] : /\ n.mtype = MViewChange /\ n.viewID = vViewID[r] } \* Create the state for the new view normalViews == { n.lastNormal : n \in vCMs } \* Choose the largest normal view (i.e. the newest) lastNormal == (CHOOSE v \in normalViews : \A v2 \in normalViews : v2 <= v) \* For logs before vSyncPoint (i.e. 
syncedLog), we directly copy from the bestCandiates \* For unsyncedLog, we do quorum check to decide which ones should be added to recovery Log goodCandidates == { o \in vCMs : o.lastNormal = lastNormal } \* bestCandidate can only be picked from goodCandidates, \* because previous views may include invalid logs bestCandidate == CHOOSE n \in goodCandidates: \A y \in goodCandidates: Len(n.syncedLog) >= Len(y.syncedLog) unSyncedLogs == { Seq2Set(n.unsyncedLog) : n \in goodCandidates } IN IF isViewPromise(vCMs) THEN Send({[ mtype |-> MStartView, dest |-> d, viewID |-> vViewID[r], log |-> bestCandidate.syncedLog \o MergeUnSyncLogs(unSyncedLogs, LastLog(bestCandidate.syncedLog)) ] : d \in Replicas }) ELSE UNCHANGED networkVars /\ UNCHANGED << clientVars, vLog, vEarlyBuffer, vViewID, vReplicaClock, vLastNormView, vReplicaStatus, vSyncPoint, vLateBuffer, vTentativeSync, vSyncReps,vCommitPoint, vUUIDCounter, vCrashVectorReps, vRecoveryReps >> \* Replica r receives a MStartView, m HandleStartView(r, m) == /\ vReplicaStatus[r] /= StRecovering /\ \/ vViewID[r] < m.viewID \/ vViewID[r] = m.viewID /\ vReplicaStatus[r] = StViewChange /\ CheckCrashVector(m, r) /\ vLog' = [ vLog EXCEPT ![r] = m.log ] /\ vReplicaStatus' = [ vReplicaStatus EXCEPT ![r] = StNormal ] /\ vViewID' = [ vViewID EXCEPT ![r] = m.viewID ] /\ vLastNormView' = [ vLastNormView EXCEPT ![r] = m.viewID ] /\ vEarlyBuffer' = [ vEarlyBuffer EXCEPT ![r] = {} ] \* clear Early Buffer for the new view /\ vLateBuffer' = [ vLateBuffer EXCEPT ![r] = {}] \* clear Late Buffer for the new view /\ vSyncPoint' = [ vSyncPoint EXCEPT![r] = Len(m.log) ] /\ vTentativeSync' = [ vTentativeSync EXCEPT![r] = Len(m.log) ] \* Send replies (in the new view) for all log items /\ IF r = Leader(m.viewID) THEN \* Leader only sends fast reply Send({[ mtype |-> MFastReply, sender |-> r, dest |-> m.log[i].clientID, viewID |-> m.viewID, requestID |-> m.log[i].requestID, hash |-> [ log |-> SubSeq(m.log, 1, i), cv |-> vCrashVector ], deadline |-> 
m.log[i].deadline, logSlotNum |-> i ] : i \in (1..Len(m.log))}) ELSE \* While staring view, followers knows the log is synced with the leader, so send slow-reply Send({[ mtype |-> MSlowReply, sender |-> r, dest |-> m.log[i].clientID, viewID |-> m.viewID, requestID |-> m.log[i].requestID, logSlotNum |-> i ] : i \in (1..Len(m.log))}) /\ UNCHANGED << clientVars, vReplicaClock, vViewChanges, vSyncReps, vCommitPoint, vCrashVector, vUUIDCounter, vCrashVectorReps, vRecoveryReps >> -------------------------------------------------------------------------------- \* `^\textbf{\large Periodic Synchronization}^' \* Leader replica r conduct synchronization periodically \* This periodic sync process is different from index sync process \* It ensures that all replicas’ logs are stable up to their CommitPoint (for fast recovery) \* Our CommitPoint is essentially the `^\emph{sync-point}^' defined in NOPaxos paper \* Just as mentioned in NOPaxos paper, it is an optional optimization for fast recovery \* Nezha still works even without this part StartSync(r) == /\ Leader(vViewID[r]) = r /\ vReplicaStatus[r] = StNormal /\ vTentativeSync[r] < Len(vLog[r]) \* If >= then no need to sync /\ vSyncReps' = [ vSyncReps EXCEPT ![r] = {} ] /\ vTentativeSync' = [ vTentativeSync EXCEPT ![r] = Len(vLog[r]) ] /\ Send({[ mtype |-> MSyncPrepare, sender |-> r, dest |-> d, viewID |-> vViewID[r], log |-> vLog[r] ] : d \in Replicas }) /\ UNCHANGED << clientVars, vLog, vEarlyBuffer, vViewID, vReplicaClock, vLastNormView, vViewChanges, vReplicaStatus, vSyncPoint, vLateBuffer, vCommitPoint, vUUIDCounter, vCrashVector, vCrashVectorReps, vRecoveryReps >> \* Replica r receives MSyncPrepare, m HandleSyncPrepare(r, m) == LET newLog == m.log \o GetUnSyncLogs(vLog[r], LastLog(m.log) ) IN /\ vReplicaStatus[r] = StNormal /\ m.viewID = vViewID[r] /\ m.sender = Leader(vViewID[r]) /\ IF vSyncPoint[r] < Len(m.log) THEN /\ vSyncPoint' = [vSyncPoint EXCEPT ![r] = Len(m.log)] /\ vLog' = [ vLog EXCEPT ![r] = newLog ] /\ 
Send({[ mtype |-> MSlowReply, sender |-> r, dest |-> m.log[i].clientID, viewID |-> m.viewID, requestID |-> m.log[i].requestID, logSlotNum |-> i ] : i \in (1..Len(m.log))}) ELSE UNCHANGED <> /\ Send({[ mtype |-> MSyncRep, sender |-> r, dest |-> m.sender, viewID |-> vViewID[r], logSlotNumber |-> Len(m.log) ]} ) /\ UNCHANGED <> \* Replica r receives MSyncRep, m HandleSyncRep(r, m) == /\ m.viewID = vViewID[r] /\ vReplicaStatus[r] = StNormal /\ vSyncReps' = [ vSyncReps EXCEPT ![r] = vSyncReps[r] \cup { m } ] /\ LET isViewPromise(M) == /\ { n.sender : n \in M } \in Quorums /\ \E n \in M : n.sender = r sRMs == { n \in vSyncReps'[r] : /\ n.mtype = MSyncRep /\ n.viewID = vViewID[r] /\ n.logSlotNumber = vTentativeSync[r] } committedLog == IF vTentativeSync[r] >= 1 THEN SubSeq(vLog[r], 1, vTentativeSync[r]) ELSE << >> IN IF isViewPromise(sRMs) THEN /\ Send({[ mtype |-> MSyncCommit, sender |-> r, dest |-> d, viewID |-> vViewID[r], log |-> committedLog] : d \in Replicas }) /\ vCommitPoint' = [ vCommitPoint EXCEPT ![r] = vTentativeSync[r] ] ELSE UNCHANGED << networkVars, vCommitPoint >> /\ UNCHANGED << clientVars, vLog, vEarlyBuffer, vViewID, vReplicaClock, vLastNormView, vViewChanges, vReplicaStatus, vSyncPoint, vLateBuffer, vTentativeSync, vUUIDCounter, vCrashVector, vCrashVectorReps, vRecoveryReps >> \* Replica r receives MSyncCommit, m HandleSyncCommit(r, m) == LET newLog == m.log \o GetUnSyncLogs(vLog[r], LastLog(m.log) ) IN /\ vReplicaStatus[r] = StNormal /\ m.viewID = vViewID[r] /\ m.sender = Leader(vViewID[r]) /\ IF Len(m.log) <= vCommitPoint[r] THEN UNCHANGED <> ELSE /\ vLog' = [ vLog EXCEPT ![r] = newLog ] /\ vCommitPoint' = [ vCommitPoint EXCEPT ![r] = Len(m.log) ] /\ Send({[ mtype |-> MSlowReply, sender |-> r, dest |-> m.log[i].clientID, viewID |-> m.viewID, requestID |-> m.log[i].requestID, logSlotNum |-> i ] : i \in (1..Len(m.log))}) /\ UNCHANGED << networkVars, clientVars, vEarlyBuffer, vViewID, vReplicaClock, vLastNormView, vViewChanges, vReplicaStatus, 
vSyncPoint, vLateBuffer, vTentativeSync, vSyncReps, vUUIDCounter, vCrashVector, vCrashVectorReps, vRecoveryReps >> -------------------------------------------------------------------------------- (* `^\textbf{\large Invariants and Helper Functions}^' *) (* A request/log is committed in two possible cases: (1) A fast quorum has sent either slow-reply messages, or fast-reply messages with consistent hashes [Fast Path] (2) A simple quorum has sent slow-reply messages [Slow Path] Both quorums should include the leader *) \* Check whether log is committed at position logSlotNum Committed(clientID, requestID, logSlotNum) == \* Fast path \/ \E M \in SUBSET ({m \in messages : /\ \/ m.mtype = MFastReply \/ m.mtype = MSlowReply /\ m.logSlotNum = logSlotNum /\ m.dest = clientID /\ m.requestID = requestID }) : \* Sent from a fast quorum /\ { m.sender : m \in M } \in FastQuorums \* Matching view-id /\ \E m1 \in M : \A m2 \in M : m1.viewID = m2.viewID \* One from the leader /\ \E m \in M : m.sender = Leader(m.viewID) \* Hash values are consistent /\ LET leaderReply == CHOOSE m \in M : m.sender = Leader(m.viewID) IN \A m1 \in M : IF m1.mtype = MFastReply THEN m1.hash = leaderReply.hash ELSE TRUE \* SlowReply has consistent hash for sure \* Slow path \/ \E M \in SUBSET ({m \in messages : /\ \/ m.mtype = MSlowReply \/ /\ m.mtype = MFastReply \* Leader only sends fast-reply /\ m.sender =Leader(m.viewID) /\ m.logSlotNum = logSlotNum /\ m.dest = clientID /\ m.requestID = requestID }) : /\ { m.sender : m \in M } \in Quorums \* Matching view-id /\ \E m1 \in M : \A m2 \in M : m1.viewID = m2.viewID \* One from the leader /\ \E m \in M : m.sender = Leader(m.viewID) \* Check whether log is committed in view viewID CommittedInView(clientID, requestID, viewID) == \* Fast path \/ \E M \in SUBSET ({m \in messages : /\ \/ m.mtype = MFastReply \/ m.mtype = MSlowReply /\ m.dest = clientID /\ m.requestID = requestID /\ m.viewID = viewID}) : \* Sent from a fast quorum /\ { m.sender : m \in M } \in 
FastQuorums \* One from the leader /\ \E m \in M : m.sender = Leader(m.viewID) \* Hash values are the same /\ LET leaderReply == CHOOSE m \in M : m.sender = Leader(m.viewID) IN \A m1 \in M : IF m1.mtype = MFastReply THEN m1.hash = leaderReply.hash ELSE TRUE \* SlowReply has consistent hash for sure \* Slow path \/ \E M \in SUBSET ({m \in messages : /\ \/ m.mtype = MSlowReply \/ /\ m.mtype = MFastReply \* Leader only sends fast-reply /\ m.sender = Leader(m.viewID) /\ m.dest = clientID /\ m.requestID = requestID /\ m.viewID = viewID}) : /\ { m.sender : m \in M } \in Quorums \* Hash values are the same /\ \E m1 \in M : \A m2 \in M : m1.hash = m2.hash \* One from the leader /\ \E m \in M : m.sender = Leader(m.viewID) SystemRecovered(viewID) == /\ \E RM \in SUBSET(Replicas): /\ Cardinality(RM) >= QuorumSize /\ \A r \in RM: vLastNormView[r] >= viewID /\ \A r \in RM: vReplicaStatus[r] = StNormal \* These replicas must be normal \* The leader of this view has also recovered or even goes beyond this view /\ vLastNormView[Leader(viewID)] >= viewID (* `^\textbf{Invariants}^' *) \* Durability: Committed Requests always survive failure \* i.e. If a request is committed in one view, then it will remain committed in the higher views \* One thing to note, the check of "committed" only happens when the system is still "normal" \* While the system is under recovery (i.e. less than f+1 replicas are normal), \* the check of committed does not make sense Durability == \A v1, v2 \in 1..MaxViews: \* If a request is committed in lower view (v1,), \* it is impossible to make this request uncommited in higher view (v2) ~(/\ v1 < v2 \* To check Durability of request in higher views, \* the system should have entered the higher views /\ SystemRecovered(v2) /\ \E c \in Clients : \E r \in 1..MaxReqNum: /\ CommittedInView(c,r, v1) /\ ~CommittedInView(c,r, v2)) \* Consistency: Committed requests have the same history even after view changes \* i.e. 
If a request is committed in a lower view (v1), then (based on Durability Property) \* it remains committed in higher view (v2) \* Consistency requires the history of the request (i.e. all the request before this request) remain the same Consistency == \A v1, v2 \in 1..MaxViews: ~(/\ v1 < v2 \* To check Consistency of request in higher views, \* the system should have entered the higher views /\ SystemRecovered(v2) /\ \E c \in Clients : \E r \in 1..MaxReqNum: \E t \in 1..MaxTime: \* Durability has been checked in another invariant /\ CommittedInView(c,r, v1) /\ CommittedInView(c,r, v2) /\ LET v1LeaderReply == CHOOSE m \in messages: /\ m.mtype = MFastReply /\ m.deadline = t /\ m.dest = c /\ m.requestID = r /\ m.viewID = v1 /\ m.sender = Leader(v1) v2LeaderReply == CHOOSE m \in messages: /\ m.mtype = MFastReply /\ m.deadline = t /\ m.dest = c /\ m.requestID = r /\ m.viewID = v2 /\ m.sender = Leader(v2) IN v1LeaderReply.hash /= v2LeaderReply.hash) \* Linearizability: Only one request can be committed for a given position \* i.e. If one request has committed at position i, then no contrary observation can be made \* i.e. there cannot be a second request committed at the same position Linearizability == LET maxLogPosition == Max({1} \cup { m.logSlotNum : m \in {m \in messages : \/ m.mtype = MFastReply \/ m.mtype = MSlowReply } }) IN ~(\E c1, c2 \in Clients : \E r1, r2 \in 1..MaxReqNum: /\ << c1, r1 >> /= << c2, r2 >> /\ \E i \in (1 .. maxLogPosition) : /\ Committed(c1, r1, i) /\ Committed(c2, r2, i) ) (* `~ SyncSafety == \A r \in Replicas : \A i \in 1..vSyncPoint[r] : IF SystemRecovered(vViewID[r]) THEN \* Committed can only be checked when the system is recovered \* (i.e. 
when there are f+1 replicas alive) Committed(vLog[r][i].ta,vLog[r][i].clientID, vLog[r][i].reqID, i) ELSE TRUE ~' *) -------------------------------------------------------------------------------- (* `^\textbf{\large Main Transition Function}^' *) Next == \* Handle Messages \/ \E m \in messages : /\ m.mtype = MClientRequest /\ m \notin vReplicaProcessed[m.dest] /\ HandleClientRequest(m.dest, m) /\ vReplicaProcessed' = [vReplicaProcessed EXCEPT ![m.dest] = vReplicaProcessed[m.dest] \cup { Msg2RLog(m, m.dest) } ] /\ UNCHANGED vClientProcessed /\ DebugAction' = << "HandleClientRequest", m >> \/ \E m \in messages : /\ m.mtype = MViewChangeReq /\ m \notin vReplicaProcessed[m.dest] /\ HandleViewChangeReq(m.dest, m) /\ vReplicaProcessed' = [vReplicaProcessed EXCEPT ![m.dest] = vReplicaProcessed[m.dest] \cup { Msg2RLog(m, m.dest) } ] /\ UNCHANGED vClientProcessed /\ DebugAction' = << "HandleViewChangeReq", m >> \/ \E m \in messages : /\ m.mtype = MViewChange /\ m \notin vReplicaProcessed[m.dest] /\ HandleViewChange(m.dest, m) /\ vReplicaProcessed' = [vReplicaProcessed EXCEPT ![m.dest] = vReplicaProcessed[m.dest] \cup { Msg2RLog(m, m.dest) } ] /\ UNCHANGED vClientProcessed /\ DebugAction' = << "HandleViewChange", m >> \/ \E m \in messages : /\ m.mtype = MStartView /\ m \notin vReplicaProcessed[m.dest] /\ HandleStartView(m.dest, m) /\ vReplicaProcessed' = [vReplicaProcessed EXCEPT ![m.dest] = vReplicaProcessed[m.dest] \cup { Msg2RLog(m, m.dest) } ] /\ UNCHANGED vClientProcessed /\ DebugAction' = << "HandleStartView", m >> \/ \E m \in messages : /\ m.mtype = MSyncPrepare /\ m \notin vReplicaProcessed[m.dest] /\ HandleSyncPrepare(m.dest, m) /\ vReplicaProcessed' = [vReplicaProcessed EXCEPT ![m.dest] = vReplicaProcessed[m.dest] \cup { Msg2RLog(m, m.dest) } ] /\ UNCHANGED vClientProcessed /\ DebugAction' = << "HandleSyncPrepare", m >> \/ \E m \in messages : /\ m.mtype = MSyncRep /\ m \notin vReplicaProcessed[m.dest] /\ HandleSyncRep(m.dest, m) /\ vReplicaProcessed' = 
[vReplicaProcessed EXCEPT ![m.dest] = vReplicaProcessed[m.dest] \cup { Msg2RLog(m, m.dest) } ] /\ UNCHANGED vClientProcessed /\ DebugAction' = << "HandleSyncRep", m >> \/ \E m \in messages : /\ m.mtype = MSyncCommit /\ m \notin vReplicaProcessed[m.dest] /\ HandleSyncCommit(m.dest, m) /\ vReplicaProcessed' = [vReplicaProcessed EXCEPT ![m.dest] = vReplicaProcessed[m.dest] \cup { Msg2RLog(m, m.dest) } ] /\ UNCHANGED vClientProcessed /\ DebugAction' = << "HandleSyncCommit", m >> \/ \E m \in messages: /\ m.mtype = MMissEntryRequest /\ m \notin vReplicaProcessed[m.dest] /\ HandleMissEntryRequest(m.dest, m) /\ vReplicaProcessed' = [vReplicaProcessed EXCEPT ![m.dest] = vReplicaProcessed[m.dest] \cup { Msg2RLog(m, m.dest) } ] /\ UNCHANGED vClientProcessed /\ DebugAction' = << "HandleMissEntryRequest", m >> \/ \E m \in messages: /\ m.mtype = MMissEntryReply /\ m \notin vReplicaProcessed[m.dest] /\ HandleMissEntryReply(m.dest, m) /\ vReplicaProcessed' = [vReplicaProcessed EXCEPT ![m.dest] = vReplicaProcessed[m.dest] \cup { Msg2RLog(m, m.dest) } ] /\ UNCHANGED vClientProcessed /\ DebugAction' = << "HandleMissEntryReply", m >> \* Client Actions \/ \E c \in Clients : /\ vClientReqNum[c] < MaxReqNum /\ ClientSendRequest(c) /\ UNCHANGED << vReplicaProcessed, vClientProcessed >> /\ DebugAction' = << "ClientSendRequest", "" >> \* Start Synchronization \/ \E r \in Replicas : /\ StartSync(r) /\ UNCHANGED << vReplicaProcessed, vClientProcessed >> /\ DebugAction' = << "StartSync", "" >> \* Replica Fail \/ \E r \in Replicas : /\ vReplicaStatus[r] = StNormal /\ StartReplicaFail(r) /\ UNCHANGED << vReplicaProcessed, vClientProcessed >> /\ DebugAction' = << "StartReplicaFail", "" >> \* Leader Change \/ \E r \in Replicas : /\ vViewID[r] < MaxViews /\ StartLeaderChange(r) /\ UNCHANGED << vReplicaProcessed, vClientProcessed >> /\ DebugAction' = << "StartLeaderChange", "" >> \* Replica Rejoin \/ \E r \in Replicas : /\ vReplicaStatus[r] = StRecovering /\ StartReplicaRecovery(r) /\ UNCHANGED << 
vReplicaProcessed, vClientProcessed >> /\ DebugAction' = << "StartReplicaRecovery", "" >> \* Replica Actions: \/ \E r \in Replicas: /\ StartIndexSync(r) /\ UNCHANGED << vReplicaProcessed, vClientProcessed >> /\ DebugAction' = << "StartIndexSync", "" >> \/ \E r \in Replicas: /\ FlushEarlyBuffer(r) /\ UNCHANGED << vReplicaProcessed, vClientProcessed >> /\ DebugAction' = << "FlushReplicaBuffer", "" >> \* Clock Move \/ \E r \in Replicas : /\ ReplicaClockMove(r) /\ UNCHANGED << vReplicaProcessed, vClientProcessed >> /\ DebugAction' = << "ReplicaClockMove", "" >> \/ \E c \in Clients : /\ ClientClockMove(c) /\ UNCHANGED << vReplicaProcessed, vClientProcessed >> /\ DebugAction' = << "ClientClockMove", "" >> ================================================================================ ================================================ FILE: docs/demo.md ================================================ ## One-Box Demo We have prepared the configuration files in ```configs``` folder, these configuration files will be used to launch 3 replicas, 1 proxy and 1 client. Under ```configs``` folder, we have ```local``` folder (for the single-machine test), containing: - nezha-replica-config-0.yaml - nezha-replica-config-1.yaml - nezha-replica-config-2.yaml - nezha-proxy-config.yaml - nezha-client-config.yaml When running distributed tests, the user can refer to the template files (e.g., ```configs/nezha-replica-config-template.yaml```) to generate their customized config files (such as configuring the IP addresses in the config files). Before running the experiment, we assume the user has generated and copied their configuration files into the ```$HOME/Nezha/configs``` folder. ### View Change Test **Step 1**: Launch 3 replicas (i.e. replica-0, replica-1, replica-2). Open 3 terminals and launch one replica in each terminal. 
```
# In the first terminal (replica-0)
$HOME/Nezha/bazel-bin/replica/nezha_replica --config $HOME/Nezha/configs/nezha-replica-config-0.yaml

# In the second terminal (replica-1)
$HOME/Nezha/bazel-bin/replica/nezha_replica --config $HOME/Nezha/configs/nezha-replica-config-1.yaml

# In the third terminal (replica-2)
$HOME/Nezha/bazel-bin/replica/nezha_replica --config $HOME/Nezha/configs/nezha-replica-config-2.yaml
```

![Step-1](figs/nezha-vr-test-figs/step-1.png)

**Step 2**: After the three replicas are launched, we can see the important information displayed in the console logs, e.g., the current view, the replica id of this replica, the number of replicas, and the number of keys maintained by each replica's state machine (for commutativity optimization).

![Step-2](figs/nezha-vr-test-figs/step-2.png)

**Step 3**: In view 0, the leader replica is ```viewId%replicaNum=0```, i.e., replica-0. Therefore, if we kill replica-0, we will trigger a view change, so we use Ctrl+C to kill replica-0.

![Step-3](figs/nezha-vr-test-figs/step-3.png)

**Step 4**: After the leader is killed, the remaining 2 replicas start a view change to enter a new view, i.e., view 1. In this new view, the leader becomes ```viewId%replicaNum=1```, i.e., replica-1. Since there is still a majority of replicas (i.e., 2 replicas) alive, the system can resume service.

![Step-4](figs/nezha-vr-test-figs/step-4.png)

**Step 5**: We want the failed replica to rejoin the system. Therefore, we launch replica-0. This time, we set the flag ```isRecovering``` to true, so that it goes through the recovery procedure and retrieves the state from the other healthy replicas.

```
# In the first terminal
$HOME/Nezha/bazel-bin/replica/nezha_replica --config $HOME/Nezha/configs/nezha-replica-config-0.yaml --isRecovering true
```

![Step-5](figs/nezha-vr-test-figs/step-5.png)

**Step 6**: We can see that replica-0 rejoins the system as a follower, and the current view is 1.
![Step-6](figs/nezha-vr-test-figs/step-6.png)

The test process can be repeated. As long as a majority of replicas (f+1) remains alive, the system is able to serve clients and failed replicas can also rejoin.

### Test with Client

**Step 0**: Kill all the processes launched in the previous section.

**Step 1**: Similar to the previous section, we launch 3 replicas. In addition, this time we also launch 1 proxy and 1 client. In the client configuration file (i.e., [nezha-client-config.yaml](configs/nezha-client-config.yaml)), we have specified the client as an open-loop client, and it will submit at about 1000 requests/second. This time we need to open 5 terminals in total.

```
is-openloop: true
poisson-rate: 10
```

```
# In the first terminal (replica-0)
$HOME/Nezha/bazel-bin/replica/nezha_replica --config $HOME/Nezha/configs/nezha-replica-config-0.yaml

# In the second terminal (replica-1)
$HOME/Nezha/bazel-bin/replica/nezha_replica --config $HOME/Nezha/configs/nezha-replica-config-1.yaml

# In the third terminal (replica-2)
$HOME/Nezha/bazel-bin/replica/nezha_replica --config $HOME/Nezha/configs/nezha-replica-config-2.yaml

# In the fourth terminal (proxy)
$HOME/Nezha/bazel-bin/proxy/nezha_proxy --config $HOME/Nezha/configs/local/nezha-proxy-config.yaml

# In the fifth terminal (client-1)
$HOME/Nezha/bazel-bin/client/nezha_client --config $HOME/Nezha/configs/nezha-client-config.yaml
```

![Step-1](figs/nezha-test-with-client/step-1.png)

**Step 2**: After the client is launched, we can see that it continues to submit requests and the proxy continues to forward requests for the client. Every 5 seconds, the client terminal prints a log line to show the stats.

![Step-2](figs/nezha-test-with-client/step-2.png)

**Step 3**: While the client is submitting requests, we kill the leader (i.e.
replica-0), we can see that the remaining 2 replicas rapidly complete the view change and get the new leader, which takes about ```1657418951138477-1657418950947251=191226us=191ms```. The view change completes this quickly because of the periodic synchronization optimization (which has been explained in our paper). Because of the periodic synchronization, the new leader replica does not need to do state transfer from scratch; it just needs to do state transfer and log merge from the last commit point.

![Step-3](figs/nezha-test-with-client/step-3.png)

**Step 4**: We want the crashed replica (i.e., replica-0) to rejoin the system. So we set the ```isRecovering``` flag to true.

```
# In the first terminal
$HOME/Nezha/bazel-bin/replica/nezha_replica --config $HOME/Nezha/configs/nezha-replica-config-0.yaml --isRecovering true
```

![Step-4](figs/nezha-test-with-client/step-4.png)

**Step 5**: The crashed replica starts from an empty state, so it needs to retrieve all the log entries in order to recover. Since we are using UDP and by default only fetch 5 entries during each round, the state transfer can take some time if clients have submitted many entries. As shown in the terminal of replica-0, we also print the progress of the recovery. But note that the follower's recovery does not block the other healthy replicas from serving the client. An optional optimization under consideration is to generate snapshots periodically and dump them to stable storage. In this way, when a crashed replica wants to recover, it first fetches the state from local storage, and then does state transfer. This shortens the recovery time.

![Step-5](figs/nezha-test-with-client/step-5.png)

**Step 6**: After replica-0 retrieves all the state, we can see that it successfully recovers and works as a follower.
![Step-6](figs/nezha-test-with-client/step-6.png) ================================================ FILE: docs/tla-intro.md ================================================ # Nezha TLA+ This repository includes a model-checked TLA+ specification (both the source file and the pdf version) for Nezha protocol. Besides, we also include a document to explain Nezha's recovery in pseudo-code. ================================================ FILE: external/gogoprotobuf.BUILD ================================================ package(default_visibility=['//visibility:public']) proto_library( name = "gogo_proto", srcs = ["gogoproto/gogo.proto"], deps = ["@com_google_protobuf//:descriptor_proto"] ) ================================================ FILE: external/googleapi.BUILD ================================================ package(default_visibility=['//visibility:public']) proto_library( name = 'annotations_proto', srcs = ['google/api/annotations.proto'], deps = [ ":http_proto", "@com_google_protobuf//:descriptor_proto" ], ) proto_library( name = 'http_proto', srcs = ['google/api/http.proto'] ) ================================================ FILE: lib/BUILD ================================================ load("@rules_proto//proto:defs.bzl", "proto_library") cc_library( name = "zipfian", srcs = ["zipfian.h"], hdrs = ["zipfian.h"], visibility = ["//visibility:public"], ) cc_library( name = "common_type", srcs = ["common_type.h"], hdrs = ["common_type.h"], visibility = ["//visibility:public"], ) cc_library( name = "common_struct", srcs = ["common_struct.h"], hdrs = ["common_struct.h"], visibility = ["//visibility:public"], deps = [ ":common_type", ], ) cc_library( name = "address", srcs = ["address.cc"], hdrs = ["address.h"], visibility = ["//visibility:public"], ) cc_library( name = "message_handler", srcs = ["message_handler.h"], hdrs = ["message_handler.h"], visibility = ["//visibility:public"], deps = [ ":address", ":common_type", ], ) cc_library( name = "timer", srcs = 
["timer.h"], hdrs = ["timer.h"], visibility = ["//visibility:public"], deps = [ ":address", ":common_type", ], ) cc_library( name = "endpoint", srcs = ["endpoint.cc"], hdrs = ["endpoint.h"], visibility = ["//visibility:public"], deps = [ ":address", ":common_struct", ":message_handler", ":timer", "@com_github_enki_libev//:libev", "@com_github_google_glog//:glog", "@com_google_protobuf//:protobuf", "@openssl//:openssl", ], ) cc_library( name = "udp_socket_endpoint", srcs = ["udp_socket_endpoint.cc"], hdrs = ["udp_socket_endpoint.h"], visibility = ["//visibility:public"], deps = [ ":address", ":endpoint", "@com_github_enki_libev//:libev", "@com_google_protobuf//:protobuf", "@openssl//:openssl", ], ) cc_library( name = "utils", srcs = ["utils.cc"], hdrs = ["utils.h"], deps = [ ":udp_socket_endpoint", "@com_github_cameron314_concurrentqueue//:concurrentqueue", "@com_github_preshing_junction//:libjunction", "@com_github_gflags_gflags//:gflags", "@com_github_google_glog//:glog", "@openssl//:openssl", ], visibility = ["//visibility:public"], ) ================================================ FILE: lib/Rules.mk ================================================ d := $(dir $(lastword $(MAKEFILE_LIST))) SRCS += $(addprefix $(d), \ address.cc utils.cc udp_socket_endpoint.cc) LIB-address := $(o)address.o LIB-utils := $(o)utils.o LIB-udp-socket := $(o)udp_socket_endpoint.o $(LIB-address) $(LIB-utils) $(info LIB-udp-socket is $(LIB-udp-socket)) # include $(d)tests/Rules.mk ================================================ FILE: lib/address.cc ================================================ #include "lib/address.h" Address::Address() : ip_(""), port_(-1), mac_("") { bzero(&addr_, sizeof(addr_)); } Address::Address(const std::string& ip, const int port, const std::string& mac) : ip_(ip), port_(port), mac_(mac) { bzero(&addr_, sizeof(addr_)); addr_.sin_family = AF_INET; addr_.sin_port = htons(port); addr_.sin_addr.s_addr = inet_addr(ip.c_str()); } Address::~Address() {} std::string 
Address::GetIPAsString() { ip_ = inet_ntoa(addr_.sin_addr); return ip_; } int Address::GetPortAsInt() { port_ = htons(addr_.sin_port); return port_; } ================================================ FILE: lib/address.h ================================================ #ifndef NEZHA_ADDRESS #define NEZHA_ADDRESS #include #include #include #include #include #include #include #define UDP_BUFFER_SIZE (512) /** * The address of an endpoint is encapsulate as the Address Class. * Now it mainly includes the socket-based information, such as ip and port, but * we reserves the future possibility to extend to support other communication * primitives, such as DPDK */ class Address { public: std::string ip_; int port_; std::string mac_; // For future extension (DPDK) struct sockaddr_in addr_; Address(); Address(const Address& addr) : ip_(addr.ip_), port_(addr.port_), mac_(addr.mac_) { memcpy(&addr_, &(addr.addr_), sizeof(struct sockaddr_in)); } Address(const std::string& ip, const int port, const std::string& mac = ""); ~Address(); std::string GetIPAsString(); int GetPortAsInt(); }; #endif ================================================ FILE: lib/common_struct.h ================================================ #ifndef NEZHA_COMMON_STRUCT_H #define NEZHA_COMMON_STRUCT_H #include #include #include #include #include #include #include "lib/common_type.h" /** * Nezha relies on proto messages to communicate. * When the proto message has been serialized and is about to be sent by the * endpoint, MessageHeader is prepended to the head of the proto message (refer * to SendMsgTo in udp_socket_endpoint.h), which describes the type of proto * message and its length. In this way, when the receiver endpoint receives the * message, it can know the type and length of the proto message, then it can * choose the proper way to deserialize it. 
*/ struct MessageHeader { char msgType; uint32_t msgLen; MessageHeader(const char t, const uint32_t l) : msgType(t), msgLen(l) {} }; /** * SHA_HASH is included in the FastReply message to represent the replica state * of replica. More details at Sec 5.2 of our paper * https://arxiv.org/pdf/2206.03285.pdf */ union SHA_HASH { uint32_t item[5]; unsigned char hash[SHA_DIGEST_LENGTH]; SHA_HASH() { memset(item, 0, sizeof(uint32_t) * 5); } SHA_HASH(const char* str, const uint32_t len) { if (len >= SHA_DIGEST_LENGTH) { memcpy(hash, str, SHA_DIGEST_LENGTH); } else { memcpy(hash, str, len); } } SHA_HASH(const SHA_HASH& h) { memcpy(item, h.item, sizeof(uint32_t) * 5); } SHA_HASH& operator=(const SHA_HASH& sh) { memcpy(item, sh.item, sizeof(uint32_t) * 5); return *this; } void XOR(const SHA_HASH& h) { item[0] ^= h.item[0]; item[1] ^= h.item[1]; item[2] ^= h.item[2]; item[3] ^= h.item[3]; item[4] ^= h.item[4]; } std::string toString() { return (std::to_string(item[0]) + "-" + std::to_string(item[1]) + "-" + std::to_string(item[2]) + "-" + std::to_string(item[3]) + "-" + std::to_string(item[4])); } }; /** When request is received by the replica, it will be first converted to * RequestBody, which includes all the useful information of the request */ struct RequestBody { uint64_t deadline; uint64_t reqKey; // reqKey uniquely identifies the request on this replica, // it is concated by the clientId and reqId. With reqKey, // the replica can easily check whether this request has // been previously received or not. uint32_t opKey; // opKey indicates which key the request is operating on ( // imagine we are working on a database system and different // requests wil read/write different keys). opKey is // important for commutativity optimization. dd uint64_t proxyId; // proxyId indicates which proxy delivers the request to // the replica, and later replicas will send the // corresponding reply to the proxy. 
std::string command; // command is the content to execute bool isWrite; RequestBody() {} RequestBody(const uint64_t d, const uint64_t r, const uint32_t ok, const uint64_t p, const std::string& cmd, const bool isw) : deadline(d), reqKey(r), opKey(ok), proxyId(p), command(cmd), isWrite(isw) {} /** The following methods are used to compare different requests so as to * decide their order*/ bool LessThan(const RequestBody& bigger) { return (deadline < bigger.deadline || (deadline == bigger.deadline && reqKey < bigger.reqKey)); } bool LessThan(const std::pair& bigger) { return (deadline < bigger.first || (deadline == bigger.first && reqKey < bigger.second)); } bool LessOrEqual(const RequestBody& bigger) { return (deadline < bigger.deadline || (deadline == bigger.deadline && reqKey <= bigger.reqKey)); } bool LessOrEqual(const std::pair& bigger) { return (deadline < bigger.first || (deadline == bigger.first && reqKey <= bigger.second)); } }; /** * After RequestBody is processed and eventually replied, it will be converted * into a LogEntry, and stored in the replica. 
* LogEntry, compares with RequestBody, includes more information */ struct LogEntry { // Request Body RequestBody body; SHA_HASH entryHash; // The hash value of this **single** entry SHA_HASH logHash; // The accumulative hash, which is calculated based on all // the log entries from the beginning to this entry /** prevNonCommutative and nextNonCommutative organize the LogEntries as a * skiplist, and easier and more efficient to traverse/modify/delete */ LogEntry* prevNonCommutative; // The previous non-commutative entry LogEntry* nextNonCommutative; // The next non-commutative entry LogEntry* prevNonCommutativeWrite; // The entry's prevNonCommutative may be a // write, or may be a read // But only the prevNonCommutativeWrite is used to calculate the incremental // hash, see Sec 8.2 of Nezha's Technical Report LogEntry* nextNonCommutativeWrite; /** prev and next organizes the LogEntries as a link list, and easier to * traverse/modify/delete */ LogEntry* prev; // The previous LogEntry pointer LogEntry* next; // The next LogEntry pointer std::string result; // The execution result of the LogEntry char status; // uint32_t logId; // The logId (the position of the LogEntry in the list) of // the entry LogEntry() : prevNonCommutative(NULL), nextNonCommutative(NULL), prevNonCommutativeWrite(NULL), nextNonCommutativeWrite(NULL), prev(NULL), next(NULL), result(""), status(EntryStatus::INITIAL), logId(0) {} LogEntry(const RequestBody& rb, const SHA_HASH& eh, const SHA_HASH& h, LogEntry* prevNonComm = NULL, LogEntry* nextNonComm = NULL, LogEntry* preNonCOmmW = NULL, LogEntry* nextNonCommW = NULL, LogEntry* pre = NULL, LogEntry* nxt = NULL, const std::string& re = "", const char sts = EntryStatus::INITIAL, const uint32_t lid = 0) : body(rb), entryHash(eh), logHash(h), prevNonCommutative(prevNonComm), nextNonCommutative(nextNonComm), prevNonCommutativeWrite(preNonCOmmW), nextNonCommutativeWrite(nextNonCommW), prev(pre), next(nxt), result(re), status(sts), logId(lid) {} 
LogEntry(const uint64_t d, const uint64_t r, const uint32_t ok, const uint64_t p, const std::string& cmd, const bool& isw, const SHA_HASH& eh, const SHA_HASH& h, LogEntry* prevNonComm = NULL, LogEntry* nextNonComm = NULL, LogEntry* preNonCOmmW = NULL, LogEntry* nextNonCommW = NULL, LogEntry* pre = NULL, LogEntry* nxt = NULL, const std::string& re = "", const char sts = EntryStatus::INITIAL, const uint32_t lid = 0) : body(d, r, ok, p, cmd, isw), entryHash(eh), logHash(h), prevNonCommutative(prevNonComm), nextNonCommutative(nextNonComm), prevNonCommutativeWrite(preNonCOmmW), nextNonCommutativeWrite(nextNonCommW), prev(pre), next(nxt), result(re), status(sts), logId(lid) {} bool LessThan(const LogEntry& bigger) { return body.LessThan(bigger.body); } bool LessThan(const std::pair& bigger) { return body.LessThan(bigger); } bool LessOrEqual(const LogEntry& bigger) { return body.LessOrEqual(bigger.body); } bool LessOrEqual(const std::pair& bigger) { return body.LessOrEqual(bigger); } }; /** * CrashVectorStruct is necessary for Nezha to avoid stray messages, details in * Appendix A.1 and Appendix J of our paper */ struct CrashVectorStruct { std::vector cv_; uint32_t version_; // Newer crash vector will have a larger version_ SHA_HASH cvHash_; CrashVectorStruct(const std::vector& c, const uint32_t v) : cv_(c), version_(v) { const uint32_t contentLen = c.size() * sizeof(uint32_t); const unsigned char* content = (const unsigned char*)(void*)(c.data()); SHA1(content, contentLen, cvHash_.hash); } CrashVectorStruct(const CrashVectorStruct& c) : cv_(c.cv_), version_(c.version_), cvHash_(c.cvHash_) {} }; #endif ================================================ FILE: lib/common_type.h ================================================ #ifndef NEZHA_COMMON_TYPE_H #define NEZHA_COMMON_TYPE_H /** We currently only support UDP endpoint, and GRPC endpoint will be supported * in the near future*/ enum EndpointType { UDP_ENDPOINT = 1, GRPC_ENDPOINT // To be supported }; /** Refer to Sec 5 of 
our paper for detailed explanation of different replica * statuses */ enum ReplicaStatus { NORMAL = 1, VIEWCHANGE, RECOVERING, TERMINATED }; /** A LogEntry is INITIAL at the beginning, then it may switch to either * IN_PROCESS->PROCESSED->REPLIED or directly IN_LATEBUFFER */ enum EntryStatus { INITIAL = 1, IN_PROCESS, IN_LATEBUFFER, PROCESSED, TO_SLOW_REPLY, REPLIED }; /** * The message types are defined according to the proto files and the * information will be included in each message to facilitate * serialize/deserialize proto messages */ enum MessageType { CLIENT_REQUEST = 1, LEADER_REQUEST, SYNC_INDEX, MISSED_INDEX_ASK, MISSED_REQ_ASK, FAST_REPLY, SLOW_REPLY, COMMIT_REPLY, MISSED_REQ, VIEWCHANGE_REQ, VIEWCHANGE_MSG, START_VIEW, STATE_TRANSFER_REQUEST, STATE_TRANSFER_REPLY, CRASH_VECTOR_REQUEST, CRASH_VECTOR_REPLY, RECOVERY_REQUEST, RECOVERY_REPLY, SYNC_STATUS_REPORT, COMMIT_INSTRUCTION, SUSPEND_REPLY, ERROR_MSG }; #endif ================================================ FILE: lib/endpoint.cc ================================================ #include "lib/endpoint.h" Endpoint::Endpoint(const std::string& sip, const int sport, const bool isMasterReceiver) : addr_(sip, sport) { evLoop_ = isMasterReceiver ? 
ev_default_loop() : ev_loop_new(); if (!evLoop_) { LOG(ERROR) << "Event Loop error"; return; } } Endpoint::~Endpoint() { LoopBreak(); ev_loop_destroy(evLoop_); } bool Endpoint::RegisterTimer(Timer* timer) { if (evLoop_ == NULL) { LOG(ERROR) << "No evLoop!"; return false; } if (isTimerRegistered(timer)) { LOG(ERROR) << "This timer has already been registered"; return false; } timer->attachedEndpoint_ = this; eventTimers_.insert(timer); ev_timer_again(evLoop_, timer->evTimer_); return true; } bool Endpoint::UnRegisterTimer(Timer* timer) { if (evLoop_ == NULL) { LOG(ERROR) << "No evLoop!"; return false; } if (!isTimerRegistered(timer)) { LOG(ERROR) << "The timer has not been registered "; return false; } ev_timer_stop(evLoop_, timer->evTimer_); eventTimers_.erase(timer); return true; } void Endpoint::UnRegisterAllTimers() { for (auto& t : eventTimers_) { ev_timer_stop(evLoop_, t->evTimer_); } eventTimers_.clear(); } bool Endpoint::isTimerRegistered(Timer* timer) { return (eventTimers_.find(timer) != eventTimers_.end()); } void Endpoint::LoopRun() { ev_run(evLoop_, 0); } void Endpoint::LoopBreak() { UnRegisterAllTimers(); ev_break(evLoop_, EVBREAK_ALL); } ================================================ FILE: lib/endpoint.h ================================================ #ifndef NEZHA_ENDPOINT_H #define NEZHA_ENDPOINT_H #include #include #include #include #include #include #include #include #include #include "lib/address.h" #include "lib/common_struct.h" #include "lib/message_handler.h" #include "lib/timer.h" /** * Endpoint is the basic abstraction, and it can be derived to more specific * endpoints, based on the communication primtive (e.g., UDPSocketEndpoint) * * An Endpoint supports three major functionalities: * (1) Send/Receive messages; * (2) Process the received messages according to (pre-registered) customized * message handlers; * (3) Conduct periodical actions according to (pre-registered) * customized timer functions. 
*/ class Endpoint { protected: /* The address of this endpoint */ Address addr_; /** The socket fd it uses to send/recv messages */ int fd_; /** The ev_loop struct from libev, which uses to handle io/timer events */ struct ev_loop* evLoop_; /** One endpoint can have multiple timers registered. We maintain a set to * avoid duplicate registration and check whether a specific timer has been * registered or not.*/ std::set eventTimers_; public: int epId_; // The id of the endpoint, mainly for debug /** The endpoint accepts an ip and port. If both are valid, it binds the * socket fd to the ip:port. If isMasterReceiver is true, it creates the * default loop with libev, otherwise, it creates new loop (refer to libev * documentation for detailed explanation at * https://metacpan.org/dist/EV/view/libev/ev.pod) */ Endpoint(const std::string& ip = "", const int port = -1, const bool isMasterReceiver = false); virtual ~Endpoint(); /** Send the message to the specific destination. The method needs to know the * message type (3rd parameter) and include such information in the buffer */ virtual int SendMsgTo(const Address& dstAddr, const google::protobuf::Message& msg, const char msgType) = 0; /** An endpoint potentially can have multiple message handlers registered, but * our UDPSocketEndpoint implementation only supports at most one * message handler for one endpoint. So we make them as virtual functions and * different derived classes have their own implementation of the methods */ virtual bool RegisterMsgHandler(MessageHandler* msgHdl) = 0; virtual bool UnRegisterMsgHandler(MessageHandler* msgHdl) = 0; virtual bool isMsgHandlerRegistered(MessageHandler* msgHdl) = 0; virtual void UnRegisterAllMsgHandlers() = 0; /** Return true if the timer is successfully registered, otherwise (e.g. it * has been registered before and has not been unreigstered), return false */ bool RegisterTimer(Timer* timer); /** Return true if the timer is successfully registered, otherwise (e.g. 
the * timer has not been registered before), return false */ bool UnRegisterTimer(Timer* timer); /** Check whether the timer has been registered */ bool isTimerRegistered(Timer* timer); void UnRegisterAllTimers(); void LoopRun(); void LoopBreak(); }; #endif ================================================ FILE: lib/message_handler.h ================================================ #ifndef NEZHA_MESSAGE_HANDLER_H #define NEZHA_MESSAGE_HANDLER_H #include #include #include #include #include #include #include #include #include #include "lib/address.h" #include "lib/common_type.h" /** * MessageHandler is an encapsulation of libev-based message handler (i.e. * ev_io). * * After the message handler is created, it will be registered to a * specific endpoint. Then, the callback func (i.e., MessageHandlerFunc) will be * called every time this endpoint receives some messages. * * Currently, we only support UDP communication. Therefore, we only have one * derived struct (UDPMsgHandler) from MessageHandler * * We will continue to support other types of endpoints. 
Correspondingly, there * will be more derived struct added later * **/ /** * Para-1: MessageHeader* describes the type and length of the received message * Para-2: char* is the payload of the message * Para-3: Address* is the address of the sender * Para-4: void* points to the (optional) context that is needed by the callback * function(i.e., MessageHandlerFunc) */ typedef std::function MessageHandlerFunc; struct MessageHandler { MessageHandlerFunc msgHandler_; void* context_; Address sender_; struct ev_io* evWatcher_; MessageHandler(MessageHandlerFunc msghdl, void* ctx = NULL) : msgHandler_(msghdl), context_(ctx) { evWatcher_ = new ev_io(); evWatcher_->data = (void*)this; } ~MessageHandler() { delete evWatcher_; } }; struct UDPMsgHandler : MessageHandler { char buffer_[UDP_BUFFER_SIZE]; UDPMsgHandler(MessageHandlerFunc msghdl, void* ctx = NULL) : MessageHandler(msghdl, ctx) { ev_init(evWatcher_, [](struct ev_loop* loop, struct ev_io* w, int revents) { UDPMsgHandler* m = (UDPMsgHandler*)(w->data); socklen_t sockLen = sizeof(struct sockaddr_in); int msgLen = recvfrom(w->fd, m->buffer_, UDP_BUFFER_SIZE, 0, (struct sockaddr*)(&(m->sender_.addr_)), &sockLen); if (msgLen > 0 && (uint32_t)msgLen > sizeof(MessageHeader)) { MessageHeader* msgHeader = (MessageHeader*)(void*)(m->buffer_); if (msgHeader->msgLen + sizeof(MessageHeader) >= (uint32_t)msgLen) { m->msgHandler_(msgHeader, m->buffer_ + sizeof(MessageHeader), &(m->sender_), m->context_); } } }); } ~UDPMsgHandler() {} }; #endif ================================================ FILE: lib/message_type.cc ================================================ #include "lib/message_type.h" namespace MessageType { char CLIENT_REQUEST = 1; char LEADER_REQUEST = 2; char SYNC_INDEX = 3; char MISSED_INDEX_ASK = 4; char MISSED_REQ_ASK = 5; char FAST_REPLY = 6; char SLOW_REPLY = 7; char COMMIT_REPLY = 8; char MISSED_REQ = 9; char VIEWCHANGE_REQ = 10; char VIEWCHANGE = 11; char START_VIEW = 12; char STATE_TRANSFER_REQUEST = 13; char 
STATE_TRANSFER_REPLY = 14; char CRASH_VECTOR_REQUEST = 15; char CRASH_VECTOR_REPLY = 16; char RECOVERY_REQUEST = 17; char RECOVERY_REPLY = 18; char SYNC_STATUS_REPORT = 19; char COMMIT_INSTRUCTION = 20; char SUSPEND_REPLY = 21; char ERROR_MSG = 22; }; ================================================ FILE: lib/message_type.h ================================================ #include #ifndef NEZHA_MESSAGE_TYPE_H #define NEZHA_MESSAGE_TYPE_H #define CONCURRENT_MAP_START_INDEX (2u) #define CONCAT_UINT32(a, b) ((((uint64_t)a)<<32u)|(uint32_t)b) #define HIGH_32BIT(a) ((uint32_t)(a>>32)) #define LOW_32BIT(a) ((uint32_t)a) struct MessageHeader { char msgType; uint32_t msgLen; MessageHeader(const char t, const uint32_t l) :msgType(t), msgLen(l) {} }; namespace MessageType { extern char CLIENT_REQUEST; extern char LEADER_REQUEST; extern char SYNC_INDEX; extern char MISSED_INDEX_ASK; extern char MISSED_REQ_ASK; extern char FAST_REPLY; extern char SLOW_REPLY; extern char COMMIT_REPLY; extern char MISSED_REQ; extern char VIEWCHANGE_REQ; extern char VIEWCHANGE; extern char START_VIEW; extern char STATE_TRANSFER_REQUEST; extern char STATE_TRANSFER_REPLY; extern char CRASH_VECTOR_REQUEST; extern char CRASH_VECTOR_REPLY; extern char RECOVERY_REQUEST; extern char RECOVERY_REPLY; extern char SYNC_STATUS_REPORT; extern char COMMIT_INSTRUCTION; extern char SUSPEND_REPLY; extern char ERROR_MSG; }; #endif ================================================ FILE: lib/timer.h ================================================ #ifndef NEZHA_TIMER_ #define NEZHA_TIMER_ #include #include #include #include #include #include #include #include #include #include "lib/address.h" #include "lib/common_type.h" /** * Timer is an encapsulation of libev-based message handler (i.e. * ev_timer). * * After the timer is created, it will be registered to a * specific endpoint, together with a period (measures in milliseconds). 
Then, * the callback func (i.e., TimerFunc) will be called periodically until the * timer is unregistered * **/ /** * Para-1: The first void* points to the context, that may be needed by the * callback function(i.e., TimerFunc) * Para-2: The first void* points to the endpoint that this timer is attached * to. It can be passed into the function as NULL if the TimerFunc does not need * it. But some TimerFunc (e.g., monitorTimer in replica) callback needs to know * the endpoint it has attached to. */ typedef std::function TimerFunc; struct Timer { std::function timerFunc_; void* context_; void* attachedEndpoint_; struct ev_timer* evTimer_; Timer(TimerFunc timerf, uint32_t periodMs = 1, void* ctx = NULL, void* aep = NULL) : timerFunc_(timerf), context_(ctx), attachedEndpoint_(aep) { evTimer_ = new ev_timer(); evTimer_->data = (void*)this; evTimer_->repeat = periodMs * 1e-3; ev_init(evTimer_, [](struct ev_loop* loop, struct ev_timer* w, int revents) { Timer* t = (Timer*)(w->data); t->timerFunc_(t->context_, t->attachedEndpoint_); }); } ~Timer() { delete evTimer_; } }; #endif ================================================ FILE: lib/udp_socket_endpoint.cc ================================================ #include "lib/udp_socket_endpoint.h" UDPSocketEndpoint::UDPSocketEndpoint(const std::string& ip, const int port, const bool isMasterReceiver) : Endpoint(ip, port, isMasterReceiver), msgHandler_(NULL) { fd_ = socket(PF_INET, SOCK_DGRAM, 0); if (fd_ < 0) { LOG(ERROR) << "Receiver Fd fail "; return; } // Set Non-Blocking int status = fcntl(fd_, F_SETFL, fcntl(fd_, F_GETFL, 0) | O_NONBLOCK); if (status < 0) { LOG(ERROR) << " Set NonBlocking Fail"; } if (ip == "" || port < 0) { return; } struct sockaddr_in addr; bzero(&addr, sizeof(addr)); addr.sin_family = AF_INET; addr.sin_port = htons(port); addr.sin_addr.s_addr = inet_addr(ip.c_str()); // Bind socket to Address int bindRet = bind(fd_, (struct sockaddr*)&addr, sizeof(addr)); if (bindRet != 0) { LOG(ERROR) << "bind error\t" 
<< bindRet << "\t port=" << port; return; } } UDPSocketEndpoint::~UDPSocketEndpoint() {} int UDPSocketEndpoint::SendMsgTo(const Address& dstAddr, const google::protobuf::Message& msg, char msgType) { char buffer[UDP_BUFFER_SIZE]; MessageHeader* msgHdr = (MessageHeader*)(void*)buffer; msgHdr->msgType = msgType; std::string serializedString = msg.SerializeAsString(); msgHdr->msgLen = serializedString.length(); if (serializedString.length() + sizeof(MessageHeader) > UDP_BUFFER_SIZE) { LOG(ERROR) << "Msg too large " << (uint32_t)msgType << "\t length=" << serializedString.length(); return -1; } if (msgHdr->msgLen > 0) { // Serialization succeed // Prepend MesageHeader to the serialized string memcpy(buffer + sizeof(MessageHeader), serializedString.c_str(), msgHdr->msgLen); int ret = sendto(fd_, buffer, msgHdr->msgLen + sizeof(MessageHeader), 0, (struct sockaddr*)(&(dstAddr.addr_)), sizeof(sockaddr_in)); if (ret < 0) { VLOG(1) << pthread_self() << "\tSend Fail ret =" << ret; } return ret; } return -1; } bool UDPSocketEndpoint::RegisterMsgHandler(MessageHandler* msgHdl) { UDPMsgHandler* udpMsgHdl = (UDPMsgHandler*)msgHdl; if (evLoop_ == NULL) { LOG(ERROR) << "No evLoop!"; return false; } if (isMsgHandlerRegistered(msgHdl)) { LOG(ERROR) << "This msgHdl has already been registered"; return false; } msgHandler_ = udpMsgHdl; ev_io_set(udpMsgHdl->evWatcher_, fd_, EV_READ); ev_io_start(evLoop_, udpMsgHdl->evWatcher_); return true; } bool UDPSocketEndpoint::UnRegisterMsgHandler(MessageHandler* msgHdl) { UDPMsgHandler* udpMsgHdl = (UDPMsgHandler*)msgHdl; if (evLoop_ == NULL) { LOG(ERROR) << "No evLoop!"; return false; } if (!isMsgHandlerRegistered(udpMsgHdl)) { LOG(ERROR) << "The handler has not been registered "; return false; } ev_io_stop(evLoop_, udpMsgHdl->evWatcher_); msgHandler_ = NULL; return true; } bool UDPSocketEndpoint::isMsgHandlerRegistered(MessageHandler* msgHdl) { return (UDPMsgHandler*)msgHdl == msgHandler_; } void UDPSocketEndpoint::UnRegisterAllMsgHandlers() { 
ev_io_stop(evLoop_, msgHandler_->evWatcher_); msgHandler_ = NULL; } ================================================ FILE: lib/udp_socket_endpoint.h ================================================ #ifndef NEZHA_UDP_SOCKET_SENDER_H #define NEZHA_UDP_SOCKET_SENDER_H #include "lib/endpoint.h" class UDPSocketEndpoint : public Endpoint { private: /* data */ struct UDPMsgHandler* msgHandler_; public: UDPSocketEndpoint(const std::string& ip = "", const int port = -1, const bool isMasterReceiver = false); ~UDPSocketEndpoint(); int SendMsgTo(const Address& dstAddr, const google::protobuf::Message& msg, const char msgType) override; bool RegisterMsgHandler(MessageHandler* msgHdl) override; bool UnRegisterMsgHandler(MessageHandler* msgHdl) override; bool isMsgHandlerRegistered(MessageHandler* msgHdl) override; void UnRegisterAllMsgHandlers() override; }; #endif ================================================ FILE: lib/utils.cc ================================================ #include "lib/utils.h" SHA_HASH CalculateHash(uint64_t deadline, uint64_t reqKey) { SHA_HASH hash; const uint32_t contentLen = sizeof(uint64_t) + sizeof(uint32_t) + sizeof(uint32_t); unsigned char content[contentLen]; memcpy(content, &deadline, sizeof(uint64_t)); memcpy(content + sizeof(uint64_t), &reqKey, sizeof(uint64_t)); SHA1(content, contentLen, hash.hash); return hash; } // Get Current Microsecond Timestamp uint64_t GetMicrosecondTimestamp() { auto tse = std::chrono::system_clock::now().time_since_epoch(); return std::chrono::duration_cast(tse).count(); } Endpoint* CreateEndpoint(const char endpointType, const std::string& sip, const int sport, const bool isMasterReceiver) { if (endpointType == EndpointType::UDP_ENDPOINT) { return new UDPSocketEndpoint(sip, sport, isMasterReceiver); } else if (endpointType == EndpointType::GRPC_ENDPOINT) { // To support GRPC later return NULL; } else { LOG(ERROR) << "Unknown endpoint type: " << endpointType; return NULL; } } MessageHandler* CreateMsgHandler(const 
char endpointType, MessageHandlerFunc msghdl, void* ctx) { if (endpointType == EndpointType::UDP_ENDPOINT) { return new UDPMsgHandler(msghdl, ctx); } else if (endpointType == EndpointType::GRPC_ENDPOINT) { // To support GRPC later return NULL; } else { LOG(ERROR) << "Unknown endpoint type: " << endpointType; return NULL; } } ================================================ FILE: lib/utils.h ================================================ #ifndef NEZHA_UTILS_H #define NEZHA_UTILS_H #include #include #include #include #include #include #include #include #include #include #include #include #include "concurrentqueue.h" #include "gflags/gflags.h" #include "lib/udp_socket_endpoint.h" template using ConcurrentQueue = moodycamel::ConcurrentQueue; template using ConcurrentMap = junction::ConcurrentMap_Leapfrog; /** The concurrent map we used (i.e.junction::ConcurrentMap) reserves 0 and 1 , * so the start value should be 2 */ #define CONCURRENT_MAP_START_INDEX (2u) #define CONCAT_UINT32(a, b) ((((uint64_t)a) << 32u) | (uint32_t)b) /** Get the high/low 32bits of a uint64 */ #define HIGH_32BIT(a) ((uint32_t)(a >> 32)) #define LOW_32BIT(a) ((uint32_t)a) // Since is sufficient to uniquely identify one request, we // calculate hash based on them to represent the corresponding request/log SHA_HASH CalculateHash(uint64_t deadline, uint64_t reqKey); // Get Current Microsecond Timestamp uint64_t GetMicrosecondTimestamp(); // Factory function, to create different types of endpoints and msghandlers Endpoint* CreateEndpoint(const char endpointType, const std::string& sip = "", const int sport = -1, const bool isMasterReceiver = false); MessageHandler* CreateMsgHandler( const char endpointType, std::function msghdl, void* ctx = NULL); #endif ================================================ FILE: lib/zipfian.h ================================================ /* * MIT License * * Copyright (c) 2017 Lucas Lersch * * Permission is hereby granted, free of charge, to any person obtaining a 
copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ /* Implementation derived from: * "Quickly Generating Billion-Record Synthetic Databases", Jim Gray et al, * SIGMOD 1994 */ /* * The zipfian_int_distribution class is intended to be compatible with other * distributions introduced in #include by the C++11 standard. * * Usage example: * #include * #include "zipfian_int_distribution.h" * int main() * { * std::default_random_engine generator; * zipfian_int_distribution distribution(1, 10, 0.99); * int i = distribution(generator); * } */ /* * IMPORTANT: constructing the distribution object requires calculating the zeta * value which becomes prohibetively expensive for very large ranges. As an * alternative for such cases, the user can pass the pre-calculated values and * avoid the calculation every time. 
* * Usage example: * #include * #include "zipfian_int_distribution.h" * int main() * { * std::default_random_engine generator; * zipfian_int_distribution::param_type p(1, 1e6, 0.99, 27.000); * zipfian_int_distribution distribution(p); * int i = distribution(generator); * } */ #include #include #include #include template class zipfian_int_distribution { static_assert(std::is_integral<_IntType>::value, "Template argument not an integral type."); public: /** The type of the range of the distribution. */ typedef _IntType result_type; /** Parameter type. */ struct param_type { typedef zipfian_int_distribution<_IntType> distribution_type; explicit param_type(_IntType __a = 0, _IntType __b = std::numeric_limits<_IntType>::max(), double __theta = 0.99) : _M_a(__a), _M_b(__b), _M_theta(__theta), _M_zeta(zeta(_M_b - _M_a + 1, __theta)), _M_zeta2theta(zeta(2, __theta)) { assert(_M_a <= _M_b && _M_theta > 0.0 && _M_theta < 1.0); } explicit param_type(_IntType __a, _IntType __b, double __theta, double __zeta) : _M_a(__a), _M_b(__b), _M_theta(__theta), _M_zeta(__zeta), _M_zeta2theta(zeta(2, __theta)) { __glibcxx_assert(_M_a <= _M_b && _M_theta > 0.0 && _M_theta < 1.0); } result_type a() const { return _M_a; } result_type b() const { return _M_b; } double theta() const { return _M_theta; } double zeta() const { return _M_zeta; } double zeta2theta() const { return _M_zeta2theta; } friend bool operator==(const param_type& __p1, const param_type& __p2) { return __p1._M_a == __p2._M_a && __p1._M_b == __p2._M_b && __p1._M_theta == __p2._M_theta && __p1._M_zeta == __p2._M_zeta && __p1._M_zeta2theta == __p2._M_zeta2theta; } private: _IntType _M_a; _IntType _M_b; double _M_theta; double _M_zeta; double _M_zeta2theta; /** * @brief Calculates zeta. * * @param __n [IN] The size of the domain. * @param __theta [IN] The skew factor of the distribution. 
*/ double zeta(unsigned long __n, double __theta) { double ans = 0.0; for (unsigned long i = 1; i <= __n; ++i) ans += std::pow(1.0 / i, __theta); return ans; } }; public: /** * @brief Constructs a zipfian_int_distribution object. * * @param __a [IN] The lower bound of the distribution. * @param __b [IN] The upper bound of the distribution. * @param __theta [IN] The skew factor of the distribution. */ explicit zipfian_int_distribution(_IntType __a = _IntType(0), _IntType __b = _IntType(1), double __theta = 0.99) : _M_param(__a, __b, __theta) { } explicit zipfian_int_distribution(const param_type& __p) : _M_param(__p) { } /** * @brief Resets the distribution state. * * Does nothing for the zipfian int distribution. */ void reset() { } result_type a() const { return _M_param.a(); } result_type b() const { return _M_param.b(); } double theta() const { return _M_param.theta(); } /** * @brief Returns the parameter set of the distribution. */ param_type param() const { return _M_param; } /** * @brief Sets the parameter set of the distribution. * @param __param The new parameter set of the distribution. */ void param(const param_type& __param) { _M_param = __param; } /** * @brief Returns the inclusive lower bound of the distribution range. */ result_type min() const { return this->a(); } /** * @brief Returns the inclusive upper bound of the distribution range. */ result_type max() const { return this->b(); } /** * @brief Generating functions. 
*/ template result_type operator()(_UniformRandomNumberGenerator& __urng) { return this->operator()(__urng, _M_param); } template result_type operator()(_UniformRandomNumberGenerator& __urng, const param_type& __p) { double alpha = 1 / (1 - __p.theta()); double eta = (1 - std::pow(2.0 / (__p.b() - __p.a() + 1), 1 - __p.theta())) / (1 - __p.zeta2theta() / __p.zeta()); double u = std::generate_canonical::digits, _UniformRandomNumberGenerator>(__urng); double uz = u * __p.zeta(); if (uz < 1.0) return __p.a(); if (uz < 1.0 + std::pow(0.5, __p.theta())) return __p.a() + 1; return __p.a() + ((__p.b() - __p.a() + 1) * std::pow(eta * u - eta + 1, alpha)); } /** * @brief Return true if two zipfian int distributions have * the same parameters. */ friend bool operator==(const zipfian_int_distribution& __d1, const zipfian_int_distribution& __d2) { return __d1._M_param == __d2._M_param; } private: param_type _M_param; }; ================================================ FILE: license.md ================================================ MIT License Copyright (c) 2022-2024 Jinkun Geng Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: micro-bench/BUILD ================================================ cc_binary( name = "bench_sender", srcs = ["bench_sender.cc"], deps = [ "//proto:nezha_cc_proto", "//lib:utils", "//lib:address", "//lib:zipfian", ], copts = [ "-I/usr/local/include" ], linkopts = [ "-L/usr/local/lib", "-lev", "-ldl", "-lturf", "-ljunction", "-lcrypto", "-lgflags", "-lglog", "-lyaml-cpp", "-pthread" ], ) cc_binary( name = "bench_receiver", srcs = ["bench_receiver.cc"], deps = [ "//proto:nezha_cc_proto", "//lib:utils", "//lib:address", "//lib:zipfian", ], copts = [ "-I/usr/local/include" ], linkopts = [ "-L/usr/local/lib", "-lev", "-ldl", "-lturf", "-ljunction", "-lcrypto", "-lgflags", "-lglog", "-lyaml-cpp", "-pthread" ], ) cc_binary( name = "analysis", srcs = ["analysis.cc"], deps = [ "//proto:nezha_cc_proto", "//lib:utils", "//lib:address", "//lib:zipfian", ], copts = [ "-I/usr/local/include" ], linkopts = [ "-L/usr/local/lib", "-lev", "-ldl", "-lturf", "-ljunction", "-lcrypto", "-lgflags", "-lglog", "-lyaml-cpp", "-pthread" ], ) ================================================ FILE: micro-bench/analysis.cc ================================================ #include #include #include "lib/utils.h" #include "lib/zipfian.h" #include "proto/nezha_proto.pb.h" DEFINE_string(folder, "/home/steam1994/micro-stats/2-10000-0-50", "The folder of the csv"); DEFINE_int32(replica_num, 2, "The number of replicas"); int main(int argc, char* argv[]) { gflags::ParseCommandLineFlags(&argc, &argv, true); google::InitGoogleLogging(argv[0]); FLAGS_logtostderr = 1; // std::vector zipfianKeys; // uint32_t keyNum = 1000000; // zipfianKeys.resize(1000000, 0); // uint32_t skewFactor = 0.5; // if 
(keyNum > 1) { // std::default_random_engine generator(1); // clientId as the seed // zipfian_int_distribution zipfianDistribution(0, keyNum - 1, // skewFactor); // for (uint32_t i = 0; i < zipfianKeys.size(); i++) { // zipfianKeys[i] = zipfianDistribution(generator); // } // } std::string r0Fname = FLAGS_folder + "/" + "Replica-Stats-0.csv"; std::ifstream ifs1(r0Fname); LOG(INFO) << "fname=" << r0Fname; uint32_t clientId, reqId; uint32_t id = 0; std::map mapIdx; std::map mapKey; while (ifs1 >> clientId >> reqId) { uint64_t reqKey = CONCAT_UINT32(clientId, reqId); mapIdx[reqKey] = id; id++; } for (int i = 1; i < FLAGS_replica_num; i++) { std::string r1Fname = FLAGS_folder + "/" + "Replica-Stats-" + std::to_string(i) + ".csv"; std::ifstream ifs2(r1Fname); std::vector reqKeys; reqKeys.reserve(100000); std::vector mappedIds; mappedIds.reserve(100000); while (ifs2 >> clientId >> reqId) { uint64_t reqKey = CONCAT_UINT32(clientId, reqId); reqKeys.push_back(reqKey); mappedIds.push_back(mapIdx[reqKey]); } uint32_t reorderedCase = 0; for (uint32_t i = 1; i < reqKeys.size(); i++) { if (mappedIds[i] == 0 || mappedIds[i] < mappedIds[i - 1]) { reorderedCase++; } } LOG(INFO) << "reorderedCase=" << reorderedCase << "\t" << "total=" << id << "\t rate=" << reorderedCase * 1.0 / id; } } ================================================ FILE: micro-bench/bench_receiver.cc ================================================ #include #include #include "lib/utils.h" #include "lib/zipfian.h" #include "proto/nezha_proto.pb.h" DEFINE_string(receiver_ip, "127.0.0.1", "The ip address of the receiver"); DEFINE_int32(receiver_port, 33333, "The port of the receiver"); DEFINE_int32(replica_id, 1, "The id of the replica"); DEFINE_int32(enable_dom, 0, "Whether enable DOM"); DEFINE_int32(percentile, 50, "The percentile of the owd estimation"); DEFINE_int32(client_port, 33336, "The port of the client listens for OWD reply"); ConcurrentMap clientAddrs; ConcurrentQueue> owdQu; ConcurrentQueue processQu; 
std::vector> traceVec; void MsgHandlerFunc(MessageHeader* msgHeader, char* msgBuffer, Address* sender, void* context) { if (msgHeader->msgType == MessageType::CLIENT_REQUEST && msgHeader->msgLen > 0) { nezha::proto::Request request; if (request.ParseFromArray(msgBuffer, msgHeader->msgLen)) { if (clientAddrs.get(request.clientid()) == NULL) { Address* senderAddr = new Address(sender->GetIPAsString(), FLAGS_client_port); clientAddrs.assign(request.clientid(), senderAddr); } processQu.enqueue(request); uint64_t nowTime = GetMicrosecondTimestamp(); if (nowTime > request.sendtime()) { uint32_t owd = nowTime - request.sendtime(); owdQu.enqueue({request.clientid(), owd}); } } } } void ProcessTd() { traceVec.reserve(10000000ul); nezha::proto::Request request; std::map, nezha::proto::Request> earlyBuffer; uint64_t startTime = GetMicrosecondTimestamp(); LOG(INFO) << "FLAGS_enable_dom=" << FLAGS_enable_dom; while (true) { if (FLAGS_enable_dom == 1) { if (processQu.try_dequeue(request)) { uint64_t deadline = request.sendtime() + request.bound(); uint64_t reqKey = CONCAT_UINT32(request.clientid(), request.reqid()); earlyBuffer.insert({{deadline, reqKey}, request}); } uint64_t nowTime = GetMicrosecondTimestamp(); while (earlyBuffer.empty() == false && earlyBuffer.begin()->first.first <= nowTime) { traceVec.push_back({earlyBuffer.begin()->second.clientid(), earlyBuffer.begin()->second.reqid()}); earlyBuffer.erase(earlyBuffer.begin()); if (traceVec.size() >= 10000000ul) { break; } } } else { while (processQu.try_dequeue(request)) { traceVec.push_back({request.clientid(), request.reqid()}); if (traceVec.size() >= 10000000ul) { break; } } } uint64_t nowTime = GetMicrosecondTimestamp(); if (nowTime - startTime >= 60 * 1000ul * 1000ul || traceVec.size() >= 10000000ul) { LOG(INFO) << "To terminated ..." 
<< traceVec.size(); std::ofstream ofs("Replica-Stats-" + std::to_string(FLAGS_replica_id) + ".csv"); // ofs << "ClientID,ReqID" << std::endl; for (auto& p : traceVec) { ofs << p.first << "\t" << p.second << std::endl; } ofs.close(); exit(0); } } } void OWDTd() { std::pair owdSample; std::map> owdMap; std::map owdCnt; UDPSocketEndpoint* replyEP = dynamic_cast( CreateEndpoint(EndpointType::UDP_ENDPOINT)); nezha::proto::Reply reply; reply.set_replicaid(FLAGS_replica_id); while (true) { if (owdQu.try_dequeue(owdSample)) { uint32_t senderId = owdSample.first; uint32_t owd = owdSample.second; if (owdMap.find(senderId) == owdMap.end()) { owdMap[senderId].resize(1000); owdCnt[senderId] = 0; } owdMap[senderId][owdCnt[senderId] % 1000] = owd; owdCnt[senderId]++; if (owdCnt[senderId] % 1000 == 0) { std::vector temp = owdMap[senderId]; sort(temp.begin(), temp.end()); uint32_t estimate = temp[1000 * FLAGS_percentile / 100]; reply.set_clientid(senderId); reply.set_owd(estimate + 10); // plus the 3 * error bound (sigma1+sigma2), the // sigma ranges 1-3, here we plus 10 to simulate it Address* clientAddr = clientAddrs.get(senderId); if (clientAddr) { // LOG(INFO) << "Send to " << senderId << "\t" << estimate; replyEP->SendMsgTo(*clientAddr, reply, MessageType::FAST_REPLY); } } } } } int main(int argc, char* argv[]) { gflags::ParseCommandLineFlags(&argc, &argv, true); google::InitGoogleLogging(argv[0]); FLAGS_logtostderr = 1; std::thread* processTd = new std::thread(ProcessTd); std::thread* owdTd = new std::thread(OWDTd); Endpoint* requestEP = CreateEndpoint( EndpointType::UDP_ENDPOINT, FLAGS_receiver_ip, FLAGS_receiver_port, true); UDPMsgHandler* msgHandler = new UDPMsgHandler(MsgHandlerFunc); requestEP->RegisterMsgHandler(msgHandler); requestEP->LoopRun(); processTd->join(); owdTd->join(); delete requestEP; delete processTd; delete owdTd; } ================================================ FILE: micro-bench/bench_sender.cc ================================================ #include 
#include #include "lib/utils.h" #include "lib/zipfian.h" #include "proto/nezha_proto.pb.h" DEFINE_string(receiver_1_ip, "127.0.0.1", "The ip address of the 1st receiver"); DEFINE_string(receiver_2_ip, "127.0.0.1", "The ip address of the 2nd receiver"); DEFINE_string(receiver_3_ip, "127.0.0.1", "The ip address of the 3rd receiver"); DEFINE_string(receiver_4_ip, "127.0.0.1", "The ip address of the 4th receiver"); DEFINE_string(receiver_5_ip, "127.0.0.1", "The ip address of the 5th receiver"); DEFINE_int32(receiver_1_port, 33333, "The port of the 1st receiver"); DEFINE_int32(receiver_2_port, 33333, "The port of the 2nd receiver"); DEFINE_int32(receiver_3_port, 33333, "The port of the 3rd receiver"); DEFINE_int32(receiver_4_port, 33333, "The port of the 4th receiver"); DEFINE_int32(receiver_5_port, 33333, "The port of the 5th receiver"); DEFINE_int32(receiver_num, 2, "The number of receivers to test"); DEFINE_string(client_ip, "127.0.0.1", "The ip address of the client"); DEFINE_int32(client_port, 33336, "The port of the client listens for OWD reply"); DEFINE_uint64(poisson_rate, 10000, "Request Per Second"); DEFINE_uint64(duration, 60, "Duration of the experiment"); DEFINE_uint64(client_id, 1, "Client ID"); std::vector latencyBounds; std::atomic bound; void ReplyHandlerFunc(MessageHeader* msgHeader, char* msgBuffer, Address* sender, void* context) { if (msgHeader->msgType == MessageType::FAST_REPLY && msgHeader->msgLen > 0) { nezha::proto::Reply reply; if (reply.ParseFromArray(msgBuffer, msgHeader->msgLen)) { // LOG(INFO) << "replyOWD " << reply.owd() << "\t" << reply.replicaid(); if (reply.owd() > 0 && reply.owd() < 200) { latencyBounds[reply.replicaid()] = reply.owd(); auto it = max_element(std::begin(latencyBounds), std::end(latencyBounds)); if (*it != bound) { bound.store(*it); } } } } } void OWDUpdate() { latencyBounds.resize(FLAGS_receiver_num, 80); bound = 80; UDPSocketEndpoint* replyEP = dynamic_cast(CreateEndpoint( EndpointType::UDP_ENDPOINT, FLAGS_client_ip, 
FLAGS_client_port)); UDPMsgHandler* msgHandler = new UDPMsgHandler(ReplyHandlerFunc); replyEP->RegisterMsgHandler(msgHandler); replyEP->LoopRun(); } int main(int argc, char* argv[]) { gflags::ParseCommandLineFlags(&argc, &argv, true); google::InitGoogleLogging(argv[0]); FLAGS_logtostderr = 1; Endpoint* requestEP = CreateEndpoint(EndpointType::UDP_ENDPOINT, "", -1, true); LOG(INFO) << "ClientId = " << FLAGS_client_id << "\t" << " rate=" << FLAGS_poisson_rate; std::vector reqPer10msVec; reqPer10msVec.reserve(FLAGS_duration * 100); std::default_random_engine generator( FLAGS_client_id); // clientId as the seed std::poisson_distribution distribution(FLAGS_poisson_rate / 100); for (uint32_t i = 0; i < FLAGS_duration * 100; i++) { reqPer10msVec.push_back(distribution(generator)); } uint32_t maxReqId = FLAGS_poisson_rate * (FLAGS_duration - 10); std::thread* replyTd = new std::thread(OWDUpdate); uint32_t reqCnt = 0; std::vector receiverAddrs; receiverAddrs.resize(5, NULL); receiverAddrs[0] = new Address(FLAGS_receiver_1_ip, FLAGS_receiver_1_port); receiverAddrs[1] = new Address(FLAGS_receiver_2_ip, FLAGS_receiver_2_port); receiverAddrs[2] = new Address(FLAGS_receiver_3_ip, FLAGS_receiver_3_port); receiverAddrs[3] = new Address(FLAGS_receiver_4_ip, FLAGS_receiver_4_port); receiverAddrs[4] = new Address(FLAGS_receiver_5_ip, FLAGS_receiver_5_port); nezha::proto::Request request; request.set_clientid(FLAGS_client_id); srand(FLAGS_client_id); for (uint32_t i = 0; i < reqPer10msVec.size(); i++) { uint32_t reqNum = reqPer10msVec[i]; if (reqNum <= 0) { usleep(10000); } else { uint32_t intval = 10000 / reqNum; uint64_t nowTime = GetMicrosecondTimestamp(); for (uint32_t j = 1; j <= reqNum; j++) { while (GetMicrosecondTimestamp() < nowTime + intval * j) { } uint64_t sendTime = GetMicrosecondTimestamp(); request.set_sendtime(sendTime); request.set_bound(bound); request.set_reqid(reqCnt + 1); for (int k = 0; k < FLAGS_receiver_num; k++) { requestEP->SendMsgTo(*(receiverAddrs[k]), 
request, MessageType::CLIENT_REQUEST); } reqCnt++; if (reqCnt >= maxReqId) { LOG(INFO) << "reqCnt=" << reqCnt << "\tTerminate Here"; exit(0); } } } } delete requestEP; replyTd->join(); delete replyTd; } ================================================ FILE: micro-bench/launch_micro.py ================================================ import os import subprocess from subprocess import PIPE, Popen import time import ruamel.yaml from termcolor import colored import argparse LOGIN_PATH = "/home/steam1994" TAG = "opensource-test" SSH_KEY = "/home/steam1994/.ssh/id_rsa" ssh_identity = '-i {}'.format(SSH_KEY) if SSH_KEY else '' # Prefix for SSH and SCP. SSH = 'ssh {} -q -o ConnectTimeout=2 -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no '.format( ssh_identity) SCP = 'scp -r {} -q -o ConnectTimeout=2 -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no '.format( ssh_identity) USERNAME = "steam1994" CMD_RETRY_TIMES = 3 def generate_ttcs_cfg_file(internal_ip, is_reference=False, use_ntp=False): if is_reference: content_str = '''management_address: "InternalIP" log_dir: "/var/opt/ttcs/log" subscription_mode: true coordinator_address: "c-gjk1994gjk1994-c89e.gcp.clockwork.io" coordinator_subscription_service_port: 6176 probe_address: "InternalIP" clock_quality: 10 correct_clock: false''' cfg_file = content_str.replace("InternalIP", internal_ip) cfg_file_name = "ttcs-agent.cfg" with open(cfg_file_name, "w") as f: f.write(cfg_file) f.close() return cfg_file_name else: if use_ntp: content_str = '''management_address: "InternalIP" log_dir: "/var/opt/ttcs/log" subscription_mode: true coordinator_address: "c-gjk1994gjk1994-c89e.gcp.clockwork.io" coordinator_subscription_service_port: 6176 probe_address: "InternalIP" clock_quality: 1 correct_clock: false''' else: content_str = '''management_address: "InternalIP" log_dir: "/var/opt/ttcs/log" subscription_mode: true coordinator_address: "c-gjk1994gjk1994-c89e.gcp.clockwork.io" coordinator_subscription_service_port: 
6176 probe_address: "InternalIP" clock_quality: 1 correct_clock: true''' cfg_file = content_str.replace("InternalIP", internal_ip) cfg_file_name = "ttcs-agent.cfg" with open(cfg_file_name, "w") as f: f.write(cfg_file) f.close() return cfg_file_name def retry_proc_error(procs_list): procs_error = [] for server, proc, cmd in procs_list: output, err = proc.communicate() if proc.returncode != 0: proc = Popen(cmd.split(), stdout=PIPE, stderr=PIPE) procs_error.append((server, proc, cmd)) return procs_error def start_ttcs_node(internal_ip, is_reference, use_ntp=False): clean_prev_deb_cmd = "sudo dpkg -P ttcs-agent" run_command([internal_ip], clean_prev_deb_cmd, in_background=False) install_deb_cmd = "sudo dpkg -i /home/steam1994/ttcs-agent_1.0.21_amd64.deb" #install_deb_cmd = "sudo dpkg -i /root/ttcs-agent_1.0.12_amd64.deb" run_command([internal_ip], install_deb_cmd, in_background=False) cfg_file = generate_ttcs_cfg_file(internal_ip, is_reference, use_ntp) local_file_path = "./ttcs-agent.cfg" remote_dir = "/etc/opt/ttcs" remote_path = remote_dir + "/ttcs-agent.cfg" chmod_cmd = "sudo chmod -R 777 {remote_dir}".format(remote_dir=remote_dir) run_command([internal_ip], chmod_cmd, in_background=False) rm_cmd = "sudo rm -f {remote_path}".format(remote_path=remote_path) run_command([internal_ip], rm_cmd, in_background=False) scp_files([internal_ip], local_file_path, remote_path, to_remote=True) if is_reference is not True and use_ntp is False: stop_ntp_cmd = "sudo systemctl stop ntp" run_command([internal_ip], stop_ntp_cmd, in_background=False) disable_ntp_cmd = "sudo systemctl disable ntp" run_command([internal_ip], disable_ntp_cmd, in_background=False) stop_ntp_cmd = "sudo systemctl stop chronyd" run_command([internal_ip], stop_ntp_cmd, in_background=False) disable_ntp_cmd = "sudo systemctl disable chronyd" run_command([internal_ip], disable_ntp_cmd, in_background=False) else: enable_ntp_cmd = "sudo systemctl enable chronyd" run_command([internal_ip], enable_ntp_cmd, 
in_background=False) start_ntp_cmd = "sudo systemctl start chronyd" run_command([internal_ip], start_ntp_cmd, in_background=False) sys_start_ttcp_agent_cmd = "sudo systemctl start ttcs-agent" run_command([internal_ip], sys_start_ttcp_agent_cmd, in_background=False) def launch_ttcs(server_ip_list): stop_ntp_cmd = "sudo systemctl stop chronyd" run_command(server_ip_list, stop_ntp_cmd, in_background=False) disable_ntp_cmd = "sudo systemctl disable chronyd" run_command(server_ip_list, disable_ntp_cmd, in_background=False) stop_ntp_cmd = "sudo systemctl stop ntp" run_command(server_ip_list, stop_ntp_cmd, in_background=False) disable_ntp_cmd = "sudo systemctl disable ntp" run_command(server_ip_list, disable_ntp_cmd, in_background=False) sys_start_ttcp_agent_cmd = "sudo systemctl start ttcs-agent" run_command(server_ip_list, sys_start_ttcp_agent_cmd, in_background=False) def scp_files(server_ip_list, local_path_to_file, remote_dir, to_remote): ''' copies the file in 'local_path_to_file' to the 'remote_dir' in all servers whose external ip addresses are in 'server_ip_list' args server_ip_list: list of external IP addresses to communicate with local_path_to_file: e.g. ./script.py remote_dir: e.g. 
~ to_remote: whether to copy to remote (true) or vice versa (false) returns boolean whether operation was succesful on all servers or not ''' src = remote_dir if not to_remote else local_path_to_file src_loc = 'remote' if not to_remote else 'local' dst = remote_dir if to_remote else local_path_to_file dst_loc = 'remote' if to_remote else 'local' message = 'from ({src_loc}) {src} to ({dst_loc}) {dst}'.format( src_loc=src_loc, src=src, dst_loc=dst_loc, dst=dst) print('---- started scp {}'.format(message)) procs = [] for server in server_ip_list: if to_remote: cmd = '{} {} {}@{}:{}'.format(SCP, local_path_to_file, USERNAME, server, remote_dir) proc = Popen(cmd.split(), stdout=PIPE, stderr=PIPE) else: cmd = '{} {}@{}:{} {}'.format(SCP, USERNAME, server, remote_dir, local_path_to_file) proc = Popen(cmd.split(), stdout=PIPE, stderr=PIPE) # print("scp cmd ", cmd) procs.append((server, proc, cmd)) success = True procs_error = retry_proc_error(procs) retries = 1 while retries < CMD_RETRY_TIMES and procs_error: procs_error = retry_proc_error(procs) retries += 1 if retries >= CMD_RETRY_TIMES and procs_error: success = False for server, proc, cmd in procs_error: output, err = proc.communicate() if proc.returncode != 0: print( colored('[{}]: FAIL SCP - [{}]'.format(server, cmd), 'yellow')) print(colored('Error Response:', 'blue', attrs=['bold']), proc.returncode, output, err) if success: print( colored('---- SUCCESS SCP {} on {}'.format(message, str(server_ip_list)), 'green', attrs=['bold'])) else: print( colored('---- FAIL SCP {}'.format(message), 'red', attrs=['bold'])) return success def run_command(server_ip_list, cmd, in_background=True): ''' runs the command 'cmd' in all servers whose external ip addresses are in 'server_ip_list' cfg server_ip_list: list of external IP addresses to communicate with cmd: command to run returns boolean whether operation was succesful on all servers or not ''' if not in_background: print('---- started to run command - [{}] on {}'.format( 
cmd, str(server_ip_list))) else: print( colored('---- started to run [IN BACKGROUND] command - [{}] on {}'. format(cmd, str(server_ip_list)), 'blue', attrs=['bold'])) procs = [] for server in server_ip_list: ssh_cmd = '{} {}@{} {}'.format(SSH, USERNAME, server, cmd) proc = Popen(ssh_cmd.split(), stdout=PIPE, stderr=PIPE) procs.append((server, proc, ssh_cmd)) success = True output = '' if not in_background: procs_error = retry_proc_error(procs) retries = 1 while retries < CMD_RETRY_TIMES and procs_error: procs_error = retry_proc_error(procs) retries += 1 if retries >= CMD_RETRY_TIMES and procs_error: success = False for server, proc, cmd in procs_error: output, err = proc.communicate() if proc.returncode != 0: print( colored( '[{}]: FAIL run command - [{}]'.format( server, cmd), 'yellow')) print(colored('Error Response:', 'blue', attrs=['bold']), proc.returncode, output, err) if success: print( colored('---- SUCCESS run command - [{}] on {}'.format( cmd, str(server_ip_list)), 'green', attrs=['bold'])) else: print( colored('---- FAIL run command - [{}]'.format(cmd), 'red', attrs=['bold'])) return success, output def create_instance(instance_name, image=None, machine_type = "n1-standard-4", customzedZone = "us-central1-a", customzedIp = None, require_external_ip=False, second_ip = False ): # Construct gcloud command to create instance. 
network_address_config = ("--network-interface no-address" if require_external_ip == False else "") if customzedIp is not None: network_address_config += ",private-network-ip="+customzedIp if second_ip: network_address_config += " --network-interface subnet=subnet-1,no-address" # scopes = "--scopes storage-full,https://www.googleapis.com/auth/bigtable.admin,https://www.googleapis.com/auth/bigtable.data,https://www.googleapis.com/auth/bigquery" # if full_access_to_cloud_apis: scopes = "--scopes=https://www.googleapis.com/auth/cloud-platform" create_instance_cmd = """gcloud beta compute instances create {inst} --zone {zone} --image-family {source_image} --machine-type {machine_type} {network} {scopes} --boot-disk-size 50GB""".format( inst=instance_name, zone=customzedZone, source_image=image, machine_type=machine_type, network=network_address_config, scopes=scopes, ) # print(create_instance_cmd) # Run gcloud command to create machine. proc = Popen(create_instance_cmd, stdout=PIPE, stderr=PIPE, shell=True) # Wait for the process end and print error in case of failure output, error = proc.communicate() if proc.returncode != 0: print(colored("Failed to create instance", color="red", attrs=["bold"])) print(colored("Error Response: ", color="blue", attrs=["bold"]), output, error) def del_instance_list(instance_list, zone="us-central1-a"): for machine in instance_list: print(colored("Deleting "+machine, "red", attrs=['bold'])) subprocess.Popen( 'gcloud -q compute instances delete {inst} --zone {zone}'.format( inst=machine, zone=zone).split()) def stop_instance_list(instance_list, zone="us-central1-a"): stop_cmd = 'gcloud compute instances stop {inst} --zone {zone}'.format( inst=' '.join(instance_list), zone = zone ) print(stop_cmd) os.system(stop_cmd) def start_instance_list(instance_list, zone="us-central1-a"): start_cmd = 'gcloud compute instances start {inst} --zone {zone}'.format( inst=' '.join(instance_list), zone = zone ) print(start_cmd) os.system(start_cmd) if 
__name__ == '__main__': parser = argparse.ArgumentParser(description='Process some integers.') parser.add_argument('--num_replicas', type=int, default = 3, help='Specify the number of replicas ') parser.add_argument('--num_proxies', type=int, default = 2, help='Specify the number of proxies ') parser.add_argument('--num_clients', type=int, default = 10, help='Specify the number of clients ') args = parser.parse_args() num_replicas = args.num_replicas num_proxies = args.num_proxies num_clients = args.num_clients print("replicas: ", num_replicas) print("proxies: ", num_proxies) print("clients: ", num_clients) # cfg_file_name = generate_ttcs_cfg_file("10.128.3.79", is_reference=True, use_ntp=False) replica_ips = ["10.128.2."+str(i+10) for i in range(10)] proxy_ips = ["10.128.2."+str(i+20) for i in range(10) ] client_ips = ["10.128.2."+str(i+30) for i in range(100) ] replica_ips = replica_ips[0:num_replicas] proxy_ips = proxy_ips[0:num_proxies] client_ips = client_ips[0:num_clients] replica_name_list = [TAG+"-replica-"+str(i) for i in range(num_replicas) ] proxy_name_list = [ TAG+"-proxy-"+str(i) for i in range(num_proxies) ] client_name_list = [ TAG+"-client-"+str(i) for i in range(num_clients) ] vm_ips = replica_ips + proxy_ips + client_ips vm_name_list = replica_name_list + proxy_name_list + client_name_list replica_vm_type = "n1-standard-16" proxy_vm_type = "n1-standard-32" client_vm_type = "n1-standard-4" binary_path = "{login_path}/nezhav2/bazel-bin/".format(login_path = LOGIN_PATH) config_path = "{login_path}/nezhav2/configs".format(login_path = LOGIN_PATH) yaml = ruamel.yaml.YAML() # for i in range(num_replicas): # create_instance(instance_name = replica_name_list[i], # image= "opensource-nezha", # machine_type = replica_vm_type, # customzedZone="us-central1-a", # customzedIp = replica_ips[i] ) # print(colored("Created "+replica_name_list[i], "green", attrs=['bold'])) # exit(0) # for i in range(num_proxies): # create_instance(instance_name = proxy_name_list[i], 
# image= "opensource-nezha", # machine_type = proxy_vm_type, # customzedZone="us-central1-a", # customzedIp = proxy_ips[i] ) # print(colored("Created "+proxy_name_list[i], "green", attrs=['bold'])) # for i in range(num_clients): # create_instance(instance_name = client_name_list[i], # image= "opensource-nezha", # machine_type = client_vm_type, # customzedZone="us-central1-a", # customzedIp = client_ips[i] ) # print(colored("Created "+client_name_list[i], "green", attrs=['bold'])) # time.sleep(120) # for i in range(len(vm_ips)): # start_ttcs_node(vm_ips[i],False) # exit(0) #### del_instance_list(instance_list=vm_name_list) # stop_instance_list(instance_list = vm_name_list) # exit(0) # start_instance_list(instance_list = vm_name_list) # time.sleep(60) # print(vm_ips) # launch_ttcs(vm_ips) # exit(0) # start_ttcs_node(replica_ips[3],False) # exit(0) test_no = 1 enable_dom =1 # enable_dom = 1 #poisson_rate = 10000 poisson_rate = 5000 percentile = 50 while len(replica_ips) < 5: replica_ips += ["127.0.0.1"] print(replica_ips) for test_no in range(1,6): for percentile in [50]: #[50,75,90,95]: remote_path = "{login_path}/nezhav2/bazel-bin/*".format(login_path = LOGIN_PATH) rm_cmd = "sudo rm -rf {remote_path}".format(remote_path=remote_path) run_command(vm_ips, rm_cmd, in_background=False) mkdir_cmd = "mkdir -p {binary_path}/micro-bench".format(binary_path = binary_path) run_command(vm_ips, mkdir_cmd, in_background=False) binary_file = "{binary_path}/micro-bench/bench_sender".format(binary_path=binary_path) scp_files(vm_ips, binary_file, binary_file, to_remote = True) binary_file = "{binary_path}/micro-bench/bench_receiver".format(binary_path=binary_path) scp_files(vm_ips, binary_file, binary_file, to_remote = True) # Kill existing procs kill_cmd = "sudo pkill -9 bench_receiver" run_command(vm_ips, kill_cmd, in_background=False) kill_cmd = "sudo pkill -9 bench_sender" run_command(vm_ips, kill_cmd, in_background=False) rm_cmd = "sudo rm -rf Replica-Stats*.csv" 
run_command(vm_ips, rm_cmd, in_background=False) ## Launch replicas (id starts from 0) for i in range(num_replicas): replica_cmd = "{binary_path}/micro-bench/bench_receiver --receiver_ip {ip} --replica_id {id} --enable_dom {enable_dom} --percentile {percentile} >{log_file} 2>&1 &".format( binary_path = binary_path, ip = replica_ips[i], id = i, enable_dom = enable_dom, percentile = percentile, log_file = "receiver-log-"+str(i) ) print(colored(replica_cmd, "yellow", attrs=['bold'])) run_command([replica_ips[i]], replica_cmd, in_background=False) # Launch clients (id starts from 2) for i in range(num_clients): client_cmd = "{binary_path}/micro-bench/bench_sender --receiver_1_ip {ip1} --receiver_2_ip {ip2} --receiver_3_ip {ip3} --receiver_4_ip {ip4} --receiver_5_ip {ip5} --receiver_num {receiver_num} --client_ip {myip} --poisson_rate {poisson_rate} --client_id {id} >{log_file} 2>&1 &".format( binary_path = binary_path, ip1 = replica_ips[0], ip2 = replica_ips[1], ip3 = replica_ips[2], ip4 = replica_ips[3], ip5 = replica_ips[4], receiver_num = num_replicas, myip = client_ips[i], poisson_rate = poisson_rate, id = i+1, log_file = "client-log-"+str(i+1) ) print(colored(client_cmd, "yellow", attrs=['bold'])) run_command([client_ips[i]], client_cmd, in_background = True) # exit(0) print("Sleep...") time.sleep(90) # Copy Stats File folder_name = "micro-stats" sub_folder_name = "T-{test_no}-{num_replicas}-{num_clients}-{poisson_rate}-{enable_dom}-{percentile}".format( test_no = test_no, num_replicas = num_replicas, num_clients = num_clients, poisson_rate = poisson_rate, enable_dom = enable_dom, percentile = percentile ) stats_folder = "{login_path}/{folder_name}/{sub_folder_name}".format( login_path = LOGIN_PATH, folder_name = folder_name, sub_folder_name = sub_folder_name ) mkdir_cmd = "sudo mkdir -p -m 777 {stats_folder}".format(stats_folder = stats_folder) os.system(mkdir_cmd) for i in range(num_replicas): file_name = "Replica-Stats-"+str(i)+".csv" local_file_path = 
"{stats_folder}/{file_name}".format( stats_folder = stats_folder, file_name = file_name ) remote_path = "{stats_folder}/{file_name}".format( stats_folder = LOGIN_PATH, file_name = file_name ) scp_files([replica_ips[i]], local_file_path, remote_path, to_remote=False) ================================================ FILE: proto/BUILD ================================================ load("@rules_proto//proto:defs.bzl", "proto_library") proto_library( name = "nezha_proto", srcs = ["nezha_proto.proto"], visibility = ["//visibility:public"], ) cc_proto_library( name = "nezha_cc_proto", deps = [":nezha_proto"], visibility = ["//visibility:public"], ) ================================================ FILE: proto/nezha_proto.proto ================================================ syntax = "proto3"; package nezha.proto; message Request { uint64 sendtime = 1; uint32 bound=2; uint32 clientid = 3; uint32 reqid = 4; bytes command=5; uint64 proxyid = 6; uint32 key = 7; bool iswrite = 8; } message RequestBodyMsg { uint64 deadline = 1; uint64 reqkey = 2; uint64 proxyid = 3; bytes command = 4; uint32 key = 5; bool iswrite = 6; } message TimeStats { uint64 clienttime = 1; uint64 proxytime = 2; uint64 recvtime =3; uint64 fastreplytime = 4; uint64 slowreplytime= 5; uint64 deadline = 6; } message Reply { uint32 clientid = 1; uint32 reqid = 2; uint32 view = 3; uint32 replicaid = 4; bytes hash = 5; bytes result = 6; uint32 replytype = 7; uint32 owd = 8; uint32 maxsyncedlogid = 9; // This is the largest syncedlogid of my synced logs uint32 logid = 10; // only set by the leader, it is the log id of the entry replied bool iswrite = 11; uint32 opkey = 12; } message IndexSync { uint32 logidbegin = 1; uint32 logidend = 2; repeated uint64 deadlines = 3; repeated uint64 reqkeys =4; uint32 view = 5; uint64 sendtime = 6; } message AskIndex { uint32 logidbegin = 1; uint32 logidend = 2; uint32 replicaid = 3; } message AskReq { repeated uint64 missedreqkeys = 1; uint32 replicaid = 2; } message MissedReq 
{ repeated RequestBodyMsg reqs = 1; uint32 replicaid = 2; } message ViewChangeRequest{ uint32 view = 1; uint32 replicaid = 2; repeated uint32 cv = 3; } message ViewChange { uint32 view = 1; uint32 replicaid = 2; repeated uint32 cv = 3; uint32 lastnormalview= 4; // In the algo, we should include the logs in the viewchange msg // But that is too large. As an implementation optimization, Let's use the following information, and later do state transfer to get the necessary entries uint32 syncpoint = 5; // for synced logs: the max synced log id, no need to add syncbegin, because it is always CONCURRENT_MAP_START_IDX uint32 unsynclogbegin = 6; uint32 unsynclogend = 7; } message StateTransferRequest { uint32 view = 1; uint32 replicaid = 2; bool issynced = 3; uint32 logbegin = 4; uint32 logend = 5; } message StateTransferReply { uint32 view = 1; uint32 replicaid = 2; repeated uint32 cv = 3; bool issynced = 4; uint32 logbegin = 5; uint32 logend = 6; repeated RequestBodyMsg reqs = 7; } message StartView { uint32 view = 1; uint32 replicaid = 2; repeated uint32 cv = 3; uint32 syncedlogid = 4; } message CrashVectorRequest { bytes nonce = 1; uint32 replicaid = 2; } message CrashVectorReply { bytes nonce = 1; uint32 replicaid = 2; repeated uint32 cv = 3; } message RecoveryRequest { repeated uint32 cv = 1; uint32 replicaid = 2; } message RecoveryReply { uint32 view = 1; repeated uint32 cv = 2; uint32 replicaid = 3; uint32 syncedlogid = 4; } message SyncStatusReport { uint32 view = 1; repeated uint32 cv = 2; uint32 replicaid = 3; uint32 syncedlogid = 4; } message CommitInstruction { uint32 view = 1; repeated uint32 cv = 2; uint32 replicaid = 3; uint32 committedlogid = 4; } ================================================ FILE: proxy/BUILD ================================================ cc_library( name = "proxy_config", hdrs = ["proxy_config.h"], deps = [ "@com_github_jbeder_yaml_cpp//:yaml-cpp", ], ) cc_library( name = "proxy_class", srcs = ["proxy.cc"], hdrs = ["proxy.h"], deps 
= [ "//proto:nezha_cc_proto", "//lib:utils", "//lib:address", ":proxy_config", ], ) cc_binary( name = "nezha_proxy", srcs = ["proxy_run.cc"], deps = [ ":proxy_class", ], ) ================================================ FILE: proxy/proxy.cc ================================================ #include "proxy/proxy.h" namespace nezha { Proxy::Proxy(const std::string& configFile) { std::string error = proxyConfig_.parseConfig(configFile); if (error != "") { LOG(ERROR) << "Error parsing proxy config: " << error << "Exiting."; exit(1); } CreateContext(); } void Proxy::Terminate() { LOG(INFO) << "Terminating..."; running_ = false; } void Proxy::Run() { running_ = true; LaunchThreads(); for (auto& kv : threadPool_) { LOG(INFO) << "Join " << kv.first; kv.second->join(); LOG(INFO) << "Join Complete " << kv.first; } LOG(INFO) << "Run Terminated "; } Proxy::~Proxy() { for (auto& kv : threadPool_) { delete kv.second; } for (uint32_t i = 0; i < replicaAddrs_.size(); i++) { for (uint32_t j = 0; j < replicaAddrs_[0].size(); j++) { if (replicaAddrs_[i][j]) { delete replicaAddrs_[i][j]; } } } // Clear Context (free memory) ConcurrentMap::Iterator clientIter( clientAddrs_); while (clientIter.isValid()) { if (clientIter.getValue()) { delete clientIter.getValue(); } clientIter.next(); } // for (uint32_t i = 0; i < committedReplyMap_.size(); i++) { // ConcurrentMap& committedReply = committedReplyMap_[i]; // ConcurrentMap::Iterator iter(committedReply); // while (iter.isValid()) { // Reply* reply = iter.getValue(); // if (reply) { // delete reply; // } // iter.next(); // } // } } int Proxy::CreateSocketFd(const std::string& sip, const int sport) { int fd = socket(PF_INET, SOCK_DGRAM, 0); if (fd < 0) { LOG(ERROR) << "Receiver Fd fail "; return -1; } // Set Non-Blocking int status = fcntl(fd, F_SETFL, fcntl(fd, F_GETFL, 0) | O_NONBLOCK); if (status < 0) { LOG(ERROR) << " Set NonBlocking Fail"; return -1; } if (sip != "") { struct sockaddr_in addr; bzero(&addr, sizeof(addr)); 
addr.sin_family = AF_INET; addr.sin_port = htons(sport); addr.sin_addr.s_addr = inet_addr(sip.c_str()); // Bind socket to Address int bindRet = bind(fd, (struct sockaddr*)&addr, sizeof(addr)); if (bindRet != 0) { LOG(ERROR) << "bind error\t" << bindRet; return -1; } } return fd; } void Proxy::LaunchThreads() { int shardNum = proxyConfig_.proxyShardNum; threadPool_["CalcLatencyBound"] = new std::thread(&Proxy::CalculateLatencyBoundTd, this); for (int i = 0; i < shardNum; i++) { std::string key = "CheckQuorumTd-" + std::to_string(i); threadPool_[key] = new std::thread(&Proxy::CheckQuorumTd, this, i); } for (int i = 0; i < shardNum; i++) { std::string key = "ForwardRequestsTd-" + std::to_string(i); threadPool_[key] = new std::thread(&Proxy::ForwardRequestsTd, this, i); } // std::string key = "LogTd"; // threadPool_[key] = new std::thread(&Proxy::LogTd, this); } void Proxy::CalculateLatencyBoundTd() { std::pair owdSample; std::vector replicaOWDs; replicaOWDs.resize(proxyConfig_.replicaIps.size(), proxyConfig_.replicaInitialOwd); for (uint32_t i = 0; i < replicaOWDs.size(); i++) { LOG(INFO) << "replicaOWD " << i << "\t" << replicaOWDs[i]; } while (running_) { while (owdQu_.try_dequeue(owdSample)) { VLOG(1) << "replica=" << owdSample.first << "\towd=" << owdSample.second; replicaOWDs[owdSample.first] = owdSample.second; // Update latency bound uint32_t estimatedOWD = 0; for (uint32_t i = 0; i < replicaOWDs.size(); i++) { if (estimatedOWD < replicaOWDs[i]) { estimatedOWD = replicaOWDs[i]; } } if (estimatedOWD > maxOWD_) { estimatedOWD = maxOWD_; } latencyBound_.store(estimatedOWD); VLOG(1) << "Update bound " << latencyBound_; } usleep(5000); } } void Proxy::LogTd() { Log litem; std::ofstream ofs("Proxy-Stats-" + std::to_string(proxyConfig_.proxyId) + ".csv"); ofs << "ReplicaId,ClientId,RequestId,ClientTime,ProxyTime," "ProxyEndProcessTime,RecvTime,Deadline," "FastReplyTime," "SlowReplyTime," "ProxyRecvTime,CommitType" << std::endl; uint32_t logCnt = 0; while (running_) { 
if (logQu_.try_dequeue(litem)) { ofs << litem.ToString() << std::endl; logCnt++; if (logCnt % 10000 == 0) { ofs.flush(); } } } } void Proxy::CheckQuorumTd(const int id) { // ConcurrentMap& committedReply = committedReplyMap_[id]; std::unordered_map& committedReply = committedReplyMap_[id]; ConcurrentMap& logs = logMap_[id]; std::map> replyQuorum; std::map uncommittedReply; // Key: logId, value: reqKey uint32_t currentView = 0; int sz = 0; char buffer[UDP_BUFFER_SIZE]; MessageHeader* msgHdr = (MessageHeader*)(void*)buffer; struct sockaddr_in recvAddr; socklen_t sockLen = sizeof(recvAddr); Reply reply; Reply* committedAck = NULL; uint32_t replyNum = 0; uint64_t startTime, endTime; std::vector& replicaSyncedPoint = replicaSyncedPoints_[id]; while (running_) { if ((sz = recvfrom(forwardFds_[id], buffer, UDP_BUFFER_SIZE, 0, (struct sockaddr*)(&recvAddr), &sockLen)) > 0) { if ((uint32_t)sz < sizeof(MessageHeader) || (uint32_t)sz < msgHdr->msgLen + sizeof(MessageHeader)) { continue; } if (reply.ParseFromArray(buffer + sizeof(MessageHeader), msgHdr->msgLen)) { uint64_t reqKey = CONCAT_UINT32(reply.clientid(), reply.reqid()); if (reply.owd() > 0) { owdQu_.enqueue( std::pair(reply.replicaid(), reply.owd())); } uint64_t syncPoint = CONCAT_UINT32(reply.view(), reply.maxsyncedlogid()); if (replicaSyncedPoint[reply.replicaid()] < syncPoint) { replicaSyncedPoint[reply.replicaid()] = syncPoint; } if (reply.clientid() == 0 && reply.reqid() == 0) { // Dummy reply, just used to update continue; } // committedAck = committedReply.get(reqKey); // if (committedAck != NULL) { // // already committed; ignore // continue; // } auto iter = committedReply.find(reqKey); if (iter != committedReply.end()) { // already committed; ignore continue; } if (reply.view() < currentView) { LOG(INFO) << "Replied from old view"; continue; } if (currentView < reply.view()) { // Replicas have upgraded to a new view // Reset current state currentView = reply.view(); uncommittedReply.clear(); 
replyQuorum.clear(); for (int i = 0; i < replicaNum_; i++) { replicaSyncedPoint[i] = replicaSyncedPoint[reply.replicaid()]; } currentView = reply.view(); } // LOG(INFO) << reply.DebugString(); if (reply.replytype() == (uint32_t)MessageType::COMMIT_REPLY) { committedAck = new Reply(reply); // committedReply.assign(reqKey, committedAck); } else if (replyQuorum[reqKey].find(reply.replicaid()) == replyQuorum[reqKey].end()) { replyQuorum[reqKey][reply.replicaid()] = reply; committedAck = isQuorumReady(replicaSyncedPoint, replyQuorum[reqKey]); } else if (reply.view() > replyQuorum[reqKey].begin()->second.view()) { // New view has come, clear existing replies for this request uncommittedReply.clear(); replyQuorum[reqKey].clear(); replyQuorum[reqKey][reply.replicaid()] = reply; for (int i = 0; i < replicaNum_; i++) { replicaSyncedPoint[i] = replicaSyncedPoint[reply.replicaid()]; } committedAck = isQuorumReady(replicaSyncedPoint, replyQuorum[reqKey]); } else if (reply.view() == replyQuorum[reqKey].begin()->second.view()) { const Reply& existedReply = replyQuorum[reqKey][reply.replicaid()]; if (existedReply.view() < reply.view()) { replyQuorum[reqKey][reply.replicaid()] = reply; } else if (existedReply.view() == reply.view() && existedReply.replytype() < reply.replytype()) { // FAST_REPLY < SLOW_REPLY < COMMIT_REPLY replyQuorum[reqKey][reply.replicaid()] = reply; } committedAck = isQuorumReady(replicaSyncedPoint, replyQuorum[reqKey]); } // else: reply.view()< replyQuorum[reqKey].begin()->second.view(), // ignore it if (committedAck != NULL && committedAck->replytype() > 0) { // Ack to client struct sockaddr_in* clientAddr = clientAddrs_.get(committedAck->clientid()); std::string replyMsg = committedAck->SerializeAsString(); msgHdr->msgType = MessageType::COMMIT_REPLY; msgHdr->msgLen = replyMsg.length(); memcpy(buffer + sizeof(MessageHeader), replyMsg.c_str(), replyMsg.length()); sendto(replyFds_[id], buffer, replyMsg.length() + sizeof(MessageHeader), 0, (struct 
sockaddr*)clientAddr, sizeof(sockaddr)); // Add to cache // committedReply.assign(reqKey, committedAck); committedReply[reqKey] = committedAck; replyQuorum.erase(reqKey); // Disable Log // Log* litem = logs.get(reqKey); // if (litem) { // litem->proxyRecvTime_ = GetMicrosecondTimestamp(); // litem->commitType_ = committedAck->replytype(); // logQu_.enqueue(*litem); // } // Check whether some uncommittedReply can be committed while ((!uncommittedReply.empty()) && uncommittedReply.begin()->first <= committedAck->logid()) { Reply* ack = uncommittedReply.begin()->second; ack->set_replytype(MessageType::COMMIT_REPLY); if (uncommittedReply.begin()->first < committedAck->logid()) { const Reply* ack = uncommittedReply.begin()->second; struct sockaddr_in* clientAddr = clientAddrs_.get(ack->clientid()); std::string replyMsg = ack->SerializeAsString(); msgHdr->msgType = MessageType::COMMIT_REPLY; msgHdr->msgLen = replyMsg.length(); memcpy(buffer + sizeof(MessageHeader), replyMsg.c_str(), replyMsg.length()); sendto(replyFds_[id], buffer, replyMsg.length() + sizeof(MessageHeader), 0, (struct sockaddr*)clientAddr, sizeof(sockaddr)); } uint64_t reqKey = CONCAT_UINT32(ack->clientid(), ack->reqid()); // committedReply.assign(reqKey, ack); committedReply[reqKey] = ack; replyQuorum.erase(reqKey); uncommittedReply.erase(uncommittedReply.begin()); delete ack; } // LOG(INFO) << "reqId=" << committedAck->reqid() // << "\t type=" << committedAck->replytype(); // replyNum++; // if (replyNum == 1) { // startTime = GetMicrosecondTimestamp(); // } else if (replyNum % 100000 == 0) { // endTime = GetMicrosecondTimestamp(); // float rate = 100000 / ((endTime - startTime) * 1e-6); // LOG(INFO) << "id=" << id << "\t" // << "replyNum=" << replyNum << "\t" // << "rate = " << rate << "\t" // << "uncommittedLen = " << uncommittedReply.size(); // startTime = endTime; // } } else if (committedAck != NULL && committedAck->replytype() == 0) { // record in uncommittedRequests if (committedAck->replicaid() 
== currentView % replicaNum_) { // This is a leader's reply, cache it if (uncommittedReply.find(committedAck->logid()) == uncommittedReply.end()) { uncommittedReply[committedAck->logid()] = committedAck; } } else { delete committedAck; } } } } } } Reply* Proxy::isQuorumReady(std::vector& replicaSyncedPoint, std::map& quorum) { // These replies are of the same view for sure (we have previously forbidden // inconsistency) uint32_t view = quorum.begin()->second.view(); uint32_t leaderId = view % replicaNum_; if (quorum.find(leaderId) == quorum.end()) { return NULL; } Reply& leaderReply = quorum[leaderId]; uint32_t fastOrSlowReplyNum = 0; // slowReply can be used as fastReply uint32_t slowReplyNum = 0; // But fastReply cannot be used as slowReply for (const auto& kv : quorum) { bool fastSatisfied = (kv.second.replytype() == MessageType::FAST_REPLY && kv.second.view() == leaderReply.view() && kv.second.hash() == leaderReply.hash()); bool slowSatisfied = (HIGH_32BIT(replicaSyncedPoint[kv.first]) == leaderReply.view() && LOW_32BIT(replicaSyncedPoint[kv.first]) >= leaderReply.logid()); // if (kv.second.replytype() == MessageType::FAST_REPLY && // kv.second.hash() != leaderReply.hash()) { // LOG(INFO) << kv.second.DebugString() // << "\t\t\nLeader: " << leaderReply.DebugString(); // } if (fastSatisfied || slowSatisfied) { fastOrSlowReplyNum++; } if (slowSatisfied) { slowReplyNum++; } // if( (!fastSatisfied) && (!slowSatisfied) && quorum.size()==3) { // LOG(INFO) <<"Wrong "<= leaderReply.logid()) ; // } } Reply* committedReply = new Reply(leaderReply); if (fastOrSlowReplyNum >= (uint32_t)fastQuorum_) { // Fast Commit committedReply->set_replytype(MessageType::FAST_REPLY); } else if (slowReplyNum >= (uint32_t)f_ + 1) { // Slow Commit: Together with the leader reply, it forms the simple quorum // of f+1 committedReply->set_replytype(MessageType::SLOW_REPLY); } else { // Uncommitted // if(quorum.size()==3) { // LOG(INFO) <<"fastOrSlowReplyNum="<set_replytype(0); } return 
committedReply; } void Proxy::ForwardRequestsTd(const int id) { // ConcurrentMap& committedReply = committedReplyMap_[id]; ConcurrentMap& logs = logMap_[id]; char buffer[UDP_BUFFER_SIZE]; MessageHeader* msgHdr = (MessageHeader*)(void*)buffer; int sz = -1; struct sockaddr_in receiverAddr; socklen_t len = sizeof(receiverAddr); Request request; uint32_t forwardCnt = 0; uint64_t startTime, endTime; while (running_) { if ((sz = recvfrom(requestReceiveFds_[id], buffer, UDP_BUFFER_SIZE, 0, (struct sockaddr*)&receiverAddr, &len)) > 0) { if ((uint32_t)sz < sizeof(MessageHeader) || (uint32_t)sz < msgHdr->msgLen + sizeof(MessageHeader)) { continue; } if (msgHdr->msgType == MessageType::CLIENT_REQUEST && request.ParseFromArray(buffer + sizeof(MessageHeader), msgHdr->msgLen)) { uint64_t reqKey = CONCAT_UINT32(request.clientid(), request.reqid()); request.set_bound(latencyBound_); request.set_proxyid(proxyIds_[id]); request.set_sendtime(GetMicrosecondTimestamp()); std::string msg = request.SerializeAsString(); msgHdr->msgType = MessageType::CLIENT_REQUEST; msgHdr->msgLen = msg.length(); memcpy(buffer + sizeof(MessageHeader), msg.c_str(), msg.length()); if (clientAddrs_.get(request.clientid()) == NULL) { struct sockaddr_in* addr = new sockaddr_in(receiverAddr); clientAddrs_.assign(request.clientid(), addr); } // Send to every replica for (int i = 0; i < replicaNum_; i++) { // uint32_t generateProxyId = (uint32_t)(proxyIds_[id] >> 32u); // struct sockaddr_in* replicaAddr = // replicaAddrs_[i][generateProxyId % replicaAddrs_[i].size()]; struct sockaddr_in* replicaAddr = replicaAddrs_[i][proxyIds_[id] % replicaAddrs_[i].size()]; sendto(forwardFds_[id], buffer, msgHdr->msgLen + sizeof(MessageHeader), 0, (struct sockaddr*)replicaAddr, sizeof(sockaddr_in)); } // Log* litem = new Log(); // litem->clientId_ = request.clientid(); // litem->reqId_ = request.reqid(); // litem->clientTime_ = request.clienttime(); // litem->proxyTime_ = request.sendtime(); // litem->deadline_ = 
request.sendtime() + request.bound(); // logs.assign(reqKey, litem); // litem->proxyEndProcessTime_ = GetMicrosecondTimestamp(); // LOG(INFO) << "id=" << id << "\t" // << "cid=" << request.clientid() << "\t" << request.reqid(); // forwardCnt++; // if (forwardCnt == 1) { // startTime = GetMicrosecondTimestamp(); // } else if (forwardCnt % 100 == 0) { // endTime = GetMicrosecondTimestamp(); // float rate = 100 / ((endTime - startTime) * 1e-6); // LOG(INFO) << "Forward-Id=" << id << "\t" // << "count =" << forwardCnt << "\t" // << "rate=" << rate << " req/sec" // << "\t" // << "req is <" << request.clientid() << "," // << request.reqid() << ">"; // startTime = endTime; // } } } } } void Proxy::CreateContext() { running_ = true; int shardNum = proxyConfig_.proxyShardNum; uint32_t proxyId = proxyConfig_.proxyId; forwardFds_.resize(shardNum, -1); requestReceiveFds_.resize(shardNum, -1); replyFds_.resize(shardNum, -1); proxyIds_.resize(shardNum, proxyId); latencyBound_ = proxyConfig_.replicaInitialOwd; maxOWD_ = proxyConfig_.proxyMaxOwd; for (int i = 0; i < shardNum; i++) { forwardFds_[i] = CreateSocketFd(proxyConfig_.proxyIp, proxyConfig_.proxyReplyPortBase + i); requestReceiveFds_[i] = CreateSocketFd( proxyConfig_.proxyIp, proxyConfig_.proxyRequestPortBase + i); replyFds_[i] = CreateSocketFd("", -1); proxyIds_[i] = ((proxyIds_[i] << 32) | (uint32_t)i); } committedReplyMap_.resize(shardNum); logMap_.resize(shardNum); replicaNum_ = proxyConfig_.replicaIps.size(); assert(replicaNum_ % 2 == 1); f_ = replicaNum_ / 2; replicaSyncedPoints_.resize(shardNum); for (int i = 0; i < shardNum; i++) { replicaSyncedPoints_[i].assign(replicaNum_, CONCURRENT_MAP_START_INDEX); } fastQuorum_ = (f_ % 2 == 1) ? 
(f_ + (f_ + 1) / 2 + 1) : (f_ + f_ / 2 + 1); replicaAddrs_.resize(replicaNum_); for (int i = 0; i < replicaNum_; i++) { std::string replicaIP = proxyConfig_.replicaIps[i]; for (int j = 0; j < proxyConfig_.replicaReceiverShards; j++) { struct sockaddr_in* addr = new sockaddr_in(); bzero(addr, sizeof(struct sockaddr_in)); addr->sin_family = AF_INET; addr->sin_port = htons(proxyConfig_.replicaReceiverPort + j); addr->sin_addr.s_addr = inet_addr(replicaIP.c_str()); replicaAddrs_[i].push_back(addr); } } } } // namespace nezha ================================================ FILE: proxy/proxy.h ================================================ #include #include #include "lib/utils.h" #include "proto/nezha_proto.pb.h" #include "proxy_config.h" namespace nezha { using namespace nezha::proto; /** * Refer to proxy_run.cc, the runnable program only needs to instantiate a * Proxy object with a configuration file. Then it calls Run() method to run * and calls Terminate() method to stop */ class Proxy { private: /** All the configuration parameters for this proxy are included in * proxyConfig_*/ ProxyConfig proxyConfig_; /** Each thread is given a unique name (key) */ std::map threadPool_; /** Launch all the threads, these threads are mainly categorized into three * classes: * (1) ForwardRequestsTd, which receives client requests and * multicast to replicas; * (2) CheckQuorumTd, which receives replica replies and * check whether the corresponding request has been committed (use * isQuorumReady), if so, send a reply to the client; * (3) CalculateLatencyBoundTd, which caluldates the latency bound * * (1) and (2) handles most workload and is parallelized, and the parallism * degree is decided by the parameter defined in proxyConfig_ (i.e., * shard-num). * * (1) and (2) are paired, i.e., we launch equal number of * ForwardRequestsTds and CheckQuorumTds. 
The requests multicast by * ForwardRequestsTd-i will be tracked and quorum-checked by CheckQuorumTd-i */ void LaunchThreads(); void ForwardRequestsTd(const int id = -1); void CheckQuorumTd(const int id = -1); void CalculateLatencyBoundTd(); /** LogTd is just used to collect some performance stats. It is not necessary * in the release version */ void LogTd(); /** Create/Initialize all the necessary variables */ void CreateContext(); /** Check whether a quorum has been formed for the request to be committed. * If the request has been committed, it returns the reply message, which will * be delievered to the client; otherwise, it returns NULL */ Reply* isQuorumReady(std::vector& repliedSyncPoint, std::map& quorum); /** Tools function: given ip and port, create a socket fd. If ip is not empty, * the socket will be binded to the */ int CreateSocketFd(const std::string& ip = "", const int port = -1); /** Flag to Run/Terminate threads */ std::atomic running_; /** Each CheckQuorumTd thread uses the socket fd in replyFds_, based on its * id, to send reply to clients */ std::vector replyFds_; /** Each ForwardRequestsTd thread uses the socket fd in forwardFds_, based on * its id, to multicast requests to replicas */ std::vector forwardFds_; /** Each ForwardRequestsTd thread uses the socket fd in requestReceiveFds_, * based on its id, to receive requests from clients */ std::vector requestReceiveFds_; /** We create a unique id for each ForwardRequestsTd, so that replicas can * derive which CheckQuorumTd should receive the reply messages */ std::vector proxyIds_; /** CalculateLatencyBoundTd updates latencyBound_ and concurrently * ForwardRequestsTds read it and included in request messages */ std::atomic latencyBound_; /** Upper bound of the estimated latencyBound_, used to clamp the bound, * details in ``Adapative Latency Bound`` para of Sec 4 of our paper */ uint32_t maxOWD_; /** CheckQuorumTd threads pass samples to * CalculateLatencyBoundTd */ ConcurrentQueue> owdQu_; // 
int replicaNum_; int f_; /** replicaNum_ =2f_+1 */ int fastQuorum_; /** fastQuorum_ = f_+ceiling(f_/2)+1 */ /** Just used to collect logs, can be deleted in the release version*/ struct Log { uint32_t replicaId_; uint32_t clientId_; uint32_t reqId_; uint64_t clientTime_; uint64_t proxyTime_; uint64_t proxyEndProcessTime_; uint64_t recvTime_; uint64_t deadline_; uint64_t fastReplyTime_; uint64_t slowReplyTime_; uint64_t proxyRecvTime_; uint32_t commitType_; Log(uint32_t rid = 0, uint32_t cId = 0, uint32_t reqId = 0, uint64_t ctime = 0, uint64_t ptime = 0, uint64_t pedtime = 0, uint64_t rtime = 0, uint64_t ddl = 0, uint64_t fttime = 0, uint64_t swtime = 0, uint64_t prcvt = 0, uint32_t cmtt = 0) : replicaId_(rid), clientId_(cId), reqId_(reqId), clientTime_(ctime), proxyTime_(ptime), recvTime_(rtime), deadline_(ddl), fastReplyTime_(fttime), slowReplyTime_(swtime), proxyRecvTime_(prcvt), commitType_(cmtt) {} std::string ToString() { return std::to_string(replicaId_) + "," + std::to_string(clientId_) + "," + std::to_string(reqId_) + "," + std::to_string(clientTime_) + "," + std::to_string(proxyTime_) + "," + std::to_string(proxyEndProcessTime_) + "," + std::to_string(recvTime_) + "," + std::to_string(deadline_) + "," + std::to_string(fastReplyTime_) + "," + std::to_string(slowReplyTime_) + "," + std::to_string(proxyRecvTime_) + "," + std::to_string(commitType_); } }; ConcurrentQueue logQu_; /** Vector of replica's addresses * Since replicas can have multiple receiver shards, we use a two-dimensional * vector. * * replicaAddrs_[i] records the addresses of replica-i, which can receive * requests replicaAddrs_[i][j] is the address of the jth receiver shard of * replica-i. */ std::vector> replicaAddrs_; /** * After ForwardRequestTd receives client request, it records the address of * the client, so that later the correspoinding CheckQuorumTd can know which * address should recieve the commit reply. 
*/ ConcurrentMap clientAddrs_; /** * As an optimization, proxies also mantain a cache to record the commit reply * messages for those already-commited requests. In this way, when clients * retry the request which has already been committed, the proxy can direct * resend the reply, instead of adding additional burden to the replicas */ std::vector> committedReplyMap_; std::vector> sendTimeMap_; std::vector> logMap_; public: /** Proxy accept a config file, which contains all the necessary information * to instantiate the object, then it can call Run method * */ Proxy(const std::string& configFile = "../configs/nezha-proxy-config.yaml"); ~Proxy(); void Run(); void Terminate(); /** Tentative */ std::vector> replicaSyncedPoints_; }; } // namespace nezha ================================================ FILE: proxy/proxy_config.h ================================================ #include #include #include #include #include struct ProxyConfig { int proxyId; std::string proxyIp; int proxyShardNum; uint32_t proxyMaxOwd; int proxyRequestPortBase; int proxyReplyPortBase; std::vector replicaIps; uint32_t replicaInitialOwd; int replicaReceiverPort; int replicaReceiverShards; // Parses yaml file configFilename and fills in fields of ProxyConfig // accordingly. Returns an error message or "" if there are no errors. 
std::string parseConfig(std::string configFilename) { YAML::Node config; try { config = YAML::LoadFile(configFilename); } catch (const YAML::BadFile& e) { return "Error loading config file:" + e.msg + "."; } LOG(INFO) << "Using config:\n " << config; std::string key; // Keep track of current key for better error messages try { key = "replica-ips"; for (uint32_t i = 0; i < config[key].size(); i++) { replicaIps.push_back(config[key][i].as()); } key = "replica-receiver-shards"; replicaReceiverShards = config[key].as(); key = "replica-initial-owd"; replicaInitialOwd = config[key].as(); key = "replica-receiver-port"; replicaReceiverPort = config[key].as(); key = "proxy-id"; proxyId = config[key].as(); key = "proxy-ip"; proxyIp = config[key].as(); key = "proxy-shard-num"; proxyShardNum = config[key].as(); key = "proxy-max-owd"; proxyMaxOwd = config[key].as(); key = "proxy-request-port-base"; proxyRequestPortBase = config[key].as(); key = "proxy-reply-port-base"; proxyReplyPortBase = config[key].as(); return ""; } catch (const YAML::BadConversion& e) { if (config[key]) { return "Error parsing config field " + key + ": " + e.msg + "."; } else { return "Error parsing config field " + key + ": key not found."; } } catch (const std::exception& e) { return "Error parsing config field " + key + ": " + e.what() + "."; } } }; ================================================ FILE: proxy/proxy_run.cc ================================================ #include "proxy/proxy.h" DEFINE_string(config, "nezhav2/config/nezha-proxy-config-0.yaml", "The config file for the proxy"); nezha::Proxy* proxy = NULL; void Terminate(int para) { proxy->Terminate(); } int main(int argc, char* argv[]) { gflags::ParseCommandLineFlags(&argc, &argv, true); google::InitGoogleLogging(argv[0]); FLAGS_logtostderr = 1; signal(SIGINT, Terminate); proxy = new nezha::Proxy(FLAGS_config); proxy->Run(); delete proxy; } ================================================ FILE: replica/BUILD 
================================================ cc_library( name = "replica_config", hdrs = ["replica_config.h"], deps = [ "@com_github_jbeder_yaml_cpp//:yaml-cpp", ], ) cc_library( name = "replica_class", srcs = ["replica.cc"], hdrs = [ "replica.h", ], deps = [ ":replica_config", "//proto:nezha_cc_proto", "//lib:utils", "@com_github_preshing_junction//:libjunction", "@com_github_enki_libev//:libev", "@boost//:uuid", ], ) cc_binary( name = "nezha_replica", srcs = ["replica_run.cc"], deps = [ ":replica_class", ], ) ================================================ FILE: replica/replica.cc ================================================ #include "replica/replica.h" namespace nezha { // #define GJK_DEBUG #ifdef GJK_DEBUG #define ASSERT(x) assert(x) #else #define ASSERT(x) \ {} #endif Replica::Replica(const std::string& configFile, bool isRecovering) : viewId_(0), lastNormalView_(0) { repliedSyncPoint_ = new std::atomic[maxProxyNum_]; for (uint32_t i = 0; i < maxProxyNum_; i++) { repliedSyncPoint_[i] = CONCURRENT_MAP_START_INDEX - 1; } LOG(INFO) << maxProxyNum_ << " proxy replied sync point has been initialized"; lastAskMissedIndexTime_ = 0; lastAskMissedRequestTime_ = 0; syncedLogEntryHead_ = new LogEntry(); syncedLogEntryHead_->logId = CONCURRENT_MAP_START_INDEX - 1; syncedLogEntryHead_->body.deadline = 0; syncedLogEntryHead_->body.reqKey = 0; unSyncedLogEntryHead_ = new LogEntry(); unSyncedLogEntryHead_->logId = CONCURRENT_MAP_START_INDEX - 1; unSyncedLogEntryHead_->body.deadline = 0; unSyncedLogEntryHead_->body.reqKey = 0; // Load Config std::string error = replicaConfig_.parseConfig(configFile); if (error != "") { LOG(ERROR) << "Error loading replica config. 
" << error << " Exiting"; exit(1); } if (isRecovering) { status_ = ReplicaStatus::RECOVERING; LOG(INFO) << "Recovering ..."; } else { status_ = ReplicaStatus::NORMAL; } LOG(INFO) << "Replica Status " << status_; CreateContext(); LOG(INFO) << "viewId_=" << viewId_ << "\treplicaId=" << replicaId_ << "\treplicaNum=" << replicaNum_ << "\tkeyNum=" << keyNum_; } Replica::~Replica() { status_ = ReplicaStatus::TERMINATED; for (auto& kv : threadPool_) { delete kv.second; VLOG(2) << "Deleted\t" << kv.first; } // TODO: A more elegant way is to reclaim or dump all logs before exit // For now, it is fine because all the memory is freed after the process is // terminated } void Replica::Run() { // Master thread run masterContext_->Register(endPointType_); if (status_ == ReplicaStatus::RECOVERING) { masterContext_->endPoint_->RegisterTimer(crashVectorRequestTimer_); } else if (status_ == ReplicaStatus::NORMAL) { if (!AmLeader()) { masterContext_->endPoint_->RegisterTimer(heartbeatCheckTimer_); } masterContext_->endPoint_->RegisterTimer(periodicSyncTimer_); } // Launch worker threads (based on config) LaunchThreads(); masterContext_->endPoint_->LoopRun(); VLOG(2) << "Break LoopRun"; // Wait until all threads return for (auto& kv : threadPool_) { VLOG(2) << "Joining " << kv.first; kv.second->join(); VLOG(2) << "Join Complete \t" << kv.first; } } void Replica::Terminate() { do { status_ = ReplicaStatus::TERMINATED; waitVar_.notify_all(); // LOG(INFO) << "activeWorkerNum_=" << activeWorkerNum_; } while (activeWorkerNum_ > 0); } void Replica::CreateContext() { endPointType_ = replicaConfig_.endpointType; replicaId_ = replicaConfig_.replicaId; replicaNum_ = replicaConfig_.replicaIps.size(); keyNum_ = replicaConfig_.keyNum; lastReleasedEntryByKeys_.assign(keyNum_, {0ul, 0ul}); // Since ConcurrentMap reserves 0 and 1, log-id starts from from 2 // So these variables are initialized as 2-1=1 maxSyncedLogEntry_ = syncedLogEntryHead_; maxUnSyncedLogEntry_ = unSyncedLogEntryHead_; 
minUnSyncedLogEntry_ = unSyncedLogEntryHead_; maxSyncedLogEntryByKey_.assign(keyNum_, NULL); maxUnSyncedLogEntryByKey_.assign(keyNum_, NULL); minUnSyncedLogEntryByKey_.assign(keyNum_, NULL); committedLogId_ = CONCURRENT_MAP_START_INDEX - 1; toCommitLogId_ = CONCURRENT_MAP_START_INDEX - 1; // Create master endpoints and context std::string ip = replicaConfig_.replicaIps[replicaId_.load()]; int port = replicaConfig_.masterPort; int monitorPeriodMs = replicaConfig_.monitorPeriodMs; Endpoint* masterEP = CreateEndpoint(endPointType_, ip, port, true); auto masterCallBack = [](MessageHeader* msgHeader, char* msgBuffer, Address* sender, void* ctx) { ((Replica*)ctx)->ReceiveMasterMessage(msgHeader, msgBuffer); }; // Register a timer to monitor replica status Timer* masterMonitorTimer = new Timer( [](void* ctx, void* receiverEP) { if (((Replica*)ctx)->status_ == ReplicaStatus::TERMINATED) { // Master thread will only break its loop when status comes to // TERMINATED ((Endpoint*)receiverEP)->LoopBreak(); } }, monitorPeriodMs, this); masterContext_ = new ReceiverContext(masterEP, this, masterCallBack, masterMonitorTimer); LOG(INFO) << "Master Created"; // Create request-receiver endpoints and context requestContext_.resize(replicaConfig_.receiverShards); for (int i = 0; i < replicaConfig_.receiverShards; i++) { int port = replicaConfig_.receiverPort + i; Endpoint* requestEP = CreateEndpoint(endPointType_, ip, port); // Register a request handler to this endpoint auto requestHandlerFunc = [](MessageHeader* msgHeader, char* msgBuffer, Address* sender, void* ctx) { ((Replica*)ctx)->ReceiveClientRequest(msgHeader, msgBuffer, sender); }; // Register a timer to monitor replica status Timer* requestEPMonitorTimer = new Timer( [](void* ctx, void* receiverEP) { if (((Replica*)ctx)->status_ != ReplicaStatus::NORMAL) { ((Endpoint*)receiverEP)->LoopBreak(); } }, monitorPeriodMs, this); requestContext_[i] = new ReceiverContext( requestEP, this, requestHandlerFunc, requestEPMonitorTimer); } 
LOG(INFO) << "requestContext_ Created"; // (Leader) Use these endpoints to broadcast indices to followers for (int i = 0; i < replicaConfig_.indexSyncShards; i++) { indexSender_.push_back(new UDPSocketEndpoint()); } indexAcker_ = CreateEndpoint(endPointType_); indexRequester_ = CreateEndpoint(endPointType_); reqRequester_ = CreateEndpoint(endPointType_); for (uint32_t i = 0; i < replicaNum_; i++) { std::string ip = replicaConfig_.replicaIps[i]; int indexPort = replicaConfig_.indexSyncPort; indexReceiver_.push_back(new Address(ip, indexPort)); int indexAskPort = replicaConfig_.indexAskPort; indexAskReceiver_.push_back(new Address(ip, indexAskPort)); int requestAskPort = replicaConfig_.requestAskPort; requestAskReceiver_.push_back(new Address(ip, requestAskPort)); int masterPort = replicaConfig_.masterPort; masterReceiver_.push_back(new Address(ip, masterPort)); } // (Followers:) Create index-sync endpoint to receive indices port = replicaConfig_.indexSyncPort; Endpoint* idxSyncEP = CreateEndpoint(endPointType_, ip, port); // Register a msg handler to this endpoint to handle index sync messages auto idxHandleFunc = [](MessageHeader* msgHeader, char* msgBuffer, Address* sender, void* ctx) { ((Replica*)ctx)->ReceiveIndexSyncMessage(msgHeader, msgBuffer); }; // Register a timer to monitor replica status Timer* idxSyncMonitorTimer = new Timer( [](void* ctx, void* receiverEP) { if (((Replica*)ctx)->status_ != ReplicaStatus::NORMAL) { ((Endpoint*)receiverEP)->LoopBreak(); } }, monitorPeriodMs, this); indexSyncContext_ = new ReceiverContext(idxSyncEP, this, idxHandleFunc, idxSyncMonitorTimer); LOG(INFO) << "indexSyncContext_ Created"; // Create an endpoint to handle others' requests for missed index port = replicaConfig_.indexAskPort; Endpoint* missedIdxEP = CreateEndpoint(endPointType_, ip, port); // Register message handler auto missedIdxHandleFunc = [](MessageHeader* msgHeader, char* msgBuffer, Address* sender, void* ctx) { ((Replica*)ctx)->ReceiveAskMissedIdx(msgHeader, 
msgBuffer); }; // Register a timer to monitor replica status Timer* missedIdxAckMonitorTimer = new Timer( [](void* ctx, void* receiverEP) { if (((Replica*)ctx)->status_ != ReplicaStatus::NORMAL) { ((Endpoint*)receiverEP)->LoopBreak(); } }, monitorPeriodMs, this); missedIndexAckContext_ = new ReceiverContext( missedIdxEP, this, missedIdxHandleFunc, missedIdxAckMonitorTimer); LOG(INFO) << "missedIndexAckContext_ Created"; // Create an endpoint to handle others' requests for missed req port = replicaConfig_.requestAskPort; Endpoint* missedReqAckEP = CreateEndpoint(endPointType_, ip, port); // Register message handler auto missedReqAckHandleFunc = [](MessageHeader* msgHeader, char* msgBuffer, Address* sender, void* ctx) { ((Replica*)ctx)->ReceiveAskMissedReq(msgHeader, msgBuffer); }; // Register a timer to monitor replica status Timer* missedReqAckMonitorTimer = new Timer( [](void* ctx, void* receiverEP) { if (((Replica*)ctx)->status_ != ReplicaStatus::NORMAL) { ((Endpoint*)receiverEP)->LoopBreak(); } }, monitorPeriodMs, this); missedReqAckContext_ = new ReceiverContext( missedReqAckEP, this, missedReqAckHandleFunc, missedReqAckMonitorTimer); LOG(INFO) << "missedReqAckContext_ Created"; // Create Record Qus and Maps recordMap_.resize(replicaConfig_.recordShards); recordQu_.resize(replicaConfig_.recordShards); // Create track entry for trackThread trackedEntry_.assign(replicaConfig_.trackShards, maxSyncedLogEntry_); // Create reply endpoints int replyShardNum = replicaConfig_.replyShards; for (int i = 0; i < replyShardNum; i++) { fastReplySender_.push_back(CreateEndpoint(endPointType_)); slowReplySender_.push_back(CreateEndpoint(endPointType_)); } // Create reply queues (one queue per fast/slow reply thread) fastReplyQu_.resize(replyShardNum); slowReplyQu_.resize(replyShardNum); // Create CrashVector Context std::vector cvVec(replicaNum_, 0); CrashVectorStruct* cv = new CrashVectorStruct(cvVec, 2); crashVector_.assign(cv->version_, cv); /** Thw related threads using 
crash vectors are: * (1) master (1 thread) * (2) FastReplyThread(s) (replyShardNum threads) */ crashVectorVecSize_ = 1 + replyShardNum; crashVectorInUse_ = new std::atomic[crashVectorVecSize_]; for (uint32_t i = 0; i < crashVectorVecSize_; i++) { crashVectorInUse_[i] = cv; } // Create other useful timers heartbeatCheckTimer_ = new Timer( [](void* ctx, void* receiverEP) { // Followers use this timer to check leader's heartbeat ((Replica*)ctx)->CheckHeartBeat(); }, monitorPeriodMs, this); indexAskTimer_ = new Timer( [](void* ctx, void* receiverEP) { ((Replica*)ctx)->AskMissedIndex(); }, replicaConfig_.indexAskPeriodMs, this); roundRobinIndexAskIdx_ = 0; // Initially, no missed indices, so we make first > second missedIndices_ = {1, 0}; requestAskTimer_ = new Timer( [](void* ctx, void* receiverEP) { ((Replica*)ctx)->AskMissedRequest(); }, replicaConfig_.requestAskPeriodMs, this); roundRobinRequestAskIdx_ = 0; missedReqKeys_.clear(); viewChangeTimer_ = new Timer( [](void* ctx, void* receiverEP) { ((Replica*)ctx)->BroadcastViewChange(); }, replicaConfig_.viewChangePeriodMs, this); roundRobinProcessIdx_ = 0; periodicSyncTimer_ = new Timer( [](void* ctx, void* receiverEP) { ((Replica*)ctx)->SendSyncStatusReport(); }, replicaConfig_.syncReportPeriodMs, this); requestTrasnferBatch_ = replicaConfig_.requestTransferBatch; indexTransferBatch_ = replicaConfig_.indexTransferBatch; requestKeyTransferBatch_ = replicaConfig_.requestKeyTransferBatch; stateTransferTimer_ = new Timer( [](void* ctx, void* receiverEP) { ((Replica*)ctx)->SendStateTransferRequest(); }, replicaConfig_.stateTransferPeriodMs, this); stateTransferTimeout_ = replicaConfig_.stateTransferTimeoutMs; crashVectorRequestTimer_ = new Timer( [](void* ctx, void* receiverEP) { ((Replica*)ctx)->BroadcastCrashVectorRequest(); }, replicaConfig_.crashVectorRequestPeriodMs, this); recoveryRequestTimer_ = new Timer( [](void* ctx, void* receiverEP) { ((Replica*)ctx)->BroadcastRecoveryRequest(); }, 
replicaConfig_.recoveryRequestPeriodMs, this);

  // Parameters used by OWDCalcThread, copied from the configuration
  movingPercentile_ = replicaConfig_.movingPercentile;
  slidingWindowLen_ = replicaConfig_.owdEstimationWindow;

  // Signal variable for garbage collection (of followers)
  reclaimTimeout_ = replicaConfig_.reclaimTimeoutMs;
  // One counter per reply shard plus one extra slot, i.e. indices
  // 0..replyShardNum (ReceiveIndexSyncMessage writes the last slot).
  // NOTE(review): `std::atomic` carries no template argument here; the element
  // type appears to be missing -- confirm against the member declaration.
  safeToClearUnSyncedLogId_ = new std::atomic[replyShardNum + 1];
  safeToClearLateBufferLogId_ = CONCURRENT_MAP_START_INDEX - 1;
  for (int i = 0; i <= replyShardNum; i++) {
    safeToClearUnSyncedLogId_[i] = CONCURRENT_MAP_START_INDEX - 1;
  }
  prepareToClearLateBufferLogId_ = CONCURRENT_MAP_START_INDEX - 1;
  prepareToClearUnSyncedLogId_ = CONCURRENT_MAP_START_INDEX - 1;
}

// Drains the work queues and the early buffer, and resets the per-key,
// index-sync, state-transfer, OWD and garbage-collection bookkeeping to their
// initial values. Finally re-registers the master endpoint's timers.
void Replica::ResetContext() {
  // Clear queues
  for (uint32_t i = 0; i < fastReplyQu_.size(); i++) {
    LogEntry* entry;
    while (fastReplyQu_[i].try_dequeue(entry)) {
    }
    while (slowReplyQu_[i].try_dequeue(entry)) {
    }
    // Don't worry about memory leakage, the memory pointed to by these in-queue
    // pointers has already been cleaned or will be cleaned according to their
    // concurrent maps
  }
  // NOTE(review): unlike the reply queues above, entries taken from processQu_
  // and earlyBuffer_ are deleted here. RecordThread also stores every new
  // entry in recordMap_ and re-enqueues the same pointer for duplicate
  // requests, so this may free a pointer that is still referenced (or queued
  // twice) -- confirm that recordMap_ is cleared elsewhere.
  LogEntry* entry;
  while (processQu_.try_dequeue(entry)) {
    delete entry;
  }
  for (uint32_t i = 0; i < recordQu_.size(); i++) {
    RequestBody* rb;
    while (recordQu_[i].try_dequeue(rb)) {
      delete rb;
    }
  }
  // TODO: Clear LateBuffer
  // Clear Early Buffer
  while (earlyBuffer_.empty() == false) {
    LogEntry* entry = earlyBuffer_.begin()->second;
    delete entry;
    earlyBuffer_.erase(earlyBuffer_.begin());
  }
  // Reset lastReleasedEntryByKeys_, no need to care about UnSyncedLogs, because
  // they are all cleared
  for (uint32_t key = 0; key < keyNum_; key++) {
    if (maxSyncedLogEntryByKey_[key]) {
      // Resume from the <deadline, reqKey> of the last synced entry of this key
      lastReleasedEntryByKeys_[key] = {
          maxSyncedLogEntryByKey_[key]->body.deadline,
          maxSyncedLogEntryByKey_[key]->body.reqKey};
    } else {
      lastReleasedEntryByKeys_[key] = {0ul, 0ul};
    }
  }
  // Clear UnSyncedLogs
  minUnSyncedLogEntry_ = unSyncedLogEntryHead_;
  maxUnSyncedLogEntry_ = unSyncedLogEntryHead_;
  minUnSyncedLogEntryByKey_.clear();
  maxUnSyncedLogEntryByKey_.clear();
  minUnSyncedLogEntryByKey_.assign(keyNum_, NULL);
  maxUnSyncedLogEntryByKey_.assign(keyNum_, NULL);
  // Reset Index-Sync related stuff
  roundRobinIndexAskIdx_ = 0;
  // first > second encodes "no missed indices"
  missedIndices_ = {1, 0};
  roundRobinRequestAskIdx_ = 0;
  missedReqKeys_.clear();
  roundRobinProcessIdx_ = 0;
  pendingIndexSync_.clear();
  // Reset stateTransfer related stuff
  stateTransferIndices_.clear();
  viewChangeSet_.clear();
  crashVectorReplySet_.clear();
  recoveryReplySet_.clear();
  syncStatusSet_.clear();
  // Reset trackedEntry
  trackedEntry_.assign(trackedEntry_.size(), maxSyncedLogEntry_);
  // Reset OWD-Calc Related stuff
  slidingWindow_.clear();
  owdSampleNum_.clear();
  // Reset Master's timers
  // No need to worry about other timers: worker threads will unregister their
  // timers and msg handlers during LoopBreak
  masterContext_->endPoint_->UnRegisterAllTimers();
  masterContext_->endPoint_->RegisterTimer(masterContext_->monitorTimer_);
  if (!AmLeader()) {
    // Start checking leader's heartbeat from now on
    lastHeartBeatTime_ = GetMicrosecondTimestamp();
    masterContext_->endPoint_->RegisterTimer(heartbeatCheckTimer_);
  }
  masterContext_->endPoint_->RegisterTimer(periodicSyncTimer_);
  // Reset signal variable for garbage collection (of followers)
  safeToClearLateBufferLogId_ = CONCURRENT_MAP_START_INDEX - 1;
  for (uint32_t i = 0; i <= fastReplyQu_.size(); i++) {
    // The number of such counters is number of FastReplyThread_ + 1 (IndexRecv)
    safeToClearUnSyncedLogId_[i] = CONCURRENT_MAP_START_INDEX - 1;
  }
  prepareToClearLateBufferLogId_ = CONCURRENT_MAP_START_INDEX - 1;
  prepareToClearUnSyncedLogId_ = CONCURRENT_MAP_START_INDEX - 1;
}

// Spawns every worker thread of the replica and stores each std::thread in
// threadPool_ under a descriptive key. totalWorkerNum_ ends up as the number of
// threads launched; activeWorkerNum_ starts at 0 and is incremented by each
// worker itself when it begins running.
void Replica::LaunchThreads() {
  activeWorkerNum_ = 0;  // Dynamic variable, used as semaphore
  totalWorkerNum_ = 0;   // Static variable to count number of workers
  // RequestReceive
  for (int i = 0; i < replicaConfig_.receiverShards; i++) {
    totalWorkerNum_++;
    std::thread* td = new std::thread(&Replica::ReceiveThread, this, i);
    std::string key("ReceiveThread-" + std::to_string(i));
    threadPool_[key] = td;
    LOG(INFO) << "Launched " << key << "\t" << td->native_handle();
  }
  // RequestRecord
  for (int i = 0; i < replicaConfig_.recordShards; i++) {
    totalWorkerNum_++;
    std::thread* td = new std::thread(&Replica::RecordThread, this, i);
    std::string key("RecordThread-" + std::to_string(i));
    threadPool_[key] = td;
    LOG(INFO) << "Launched " << key << "\t" << td->native_handle();
  }
  // RequestProcess
  // ProcessThread mutates earlyBuffer_ without locking, so exactly one
  // instance is allowed.
  if (replicaConfig_.processShards != 1) {
    LOG(ERROR) << "ProcessThread parallelization is not supported. "
                  "replicaConfig_->processShards must be 1.";
    exit(1);
  }
  for (int i = 0; i < replicaConfig_.processShards; i++) {
    totalWorkerNum_++;
    std::thread* td = new std::thread(&Replica::ProcessThread, this, i);
    std::string key("ProcessThread-" + std::to_string(i));
    threadPool_[key] = td;
    LOG(INFO) << "Launched " << key << "\t" << td->native_handle();
  }
  // RequestReply
  int replyShardNum = replicaConfig_.replyShards;
  for (int i = 0; i < replyShardNum; i++) {
    totalWorkerNum_++;
    // Second argument is the thread's slot in crashVectorInUse_; slot 0 is
    // the one FastReplyThread reads as the master's crash vector.
    std::thread* td =
        new std::thread(&Replica::FastReplyThread, this, i, i + 1);
    std::string key("FastReplyThread-" + std::to_string(i));
    threadPool_[key] = td;
    LOG(INFO) << "Launched " << key << "\t" << td->native_handle();
  }
  for (int i = 0; i < replyShardNum; i++) {
    totalWorkerNum_++;
    std::thread* td = new std::thread(&Replica::SlowReplyThread, this, i);
    std::string key("SlowReplyThread-" + std::to_string(i));
    threadPool_[key] = td;
    LOG(INFO) << "Launched " << key << "\t" << td->native_handle();
  }
  // Track
  for (int i = 0; i < replicaConfig_.trackShards; i++) {
    totalWorkerNum_++;
    std::thread* td = new std::thread(&Replica::TrackThread, this, i);
    std::string key("TrackThread-" + std::to_string(i));
    threadPool_[key] = td;
    LOG(INFO) << "Launched " << key << "\t" << td->native_handle();
  }
  // IndexSync
  for (int i = 0; i < replicaConfig_.indexSyncShards; i++) {
    totalWorkerNum_++;
    // NOTE(review): the cvId passed here (i + replyShardNum + 1) is beyond the
    // 1 + replyShardNum slots allocated for crashVectorInUse_; the
    // IndexSendThread body in this file does not read cvId -- confirm it
    // stays unused.
    std::thread* td = new std::thread(&Replica::IndexSendThread, this, i,
                                      i + replyShardNum + 1);
    std::string key("IndexSendThread-" + std::to_string(i));
    threadPool_[key] = td;
    LOG(INFO) << "Launched " << key << "\t" << td->native_handle();
    if (!AmLeader()) {
      // follower only needs one sync thread
      break;
    }
  }
  totalWorkerNum_++;
  threadPool_["IndexRecvThread"] =
      new std::thread(&Replica::IndexRecvThread, this);
  LOG(INFO) << "Launched IndexRecvThread\t"
            << threadPool_["IndexRecvThread"]->native_handle();
  totalWorkerNum_++;
  threadPool_["IndexProcessThread"] =
      new std::thread(&Replica::IndexProcessThread, this);
  LOG(INFO) << "Launched IndexProcessThread\t"
            << threadPool_["IndexProcessThread"]->native_handle();
  totalWorkerNum_++;
  threadPool_["MissedIndexAckThread"] =
      new std::thread(&Replica::MissedIndexAckThread, this);
  LOG(INFO) << "Launched MissedIndexAckThread\t"
            << threadPool_["MissedIndexAckThread"]->native_handle();
  totalWorkerNum_++;
  threadPool_["MissedReqAckThread"] =
      new std::thread(&Replica::MissedReqAckThread, this);
  LOG(INFO) << "Launched MissedReqAckThread\t"
            << threadPool_["MissedReqAckThread"]->native_handle();
  // The garbage-collection thread is currently disabled:
  // totalWorkerNum_++;
  // threadPool_["GarbageCollectThread"] =
  //     new std::thread(&Replica::GarbageCollectThread, this);
  // LOG(INFO) << "Launch GarbageCollectThread "
  //           << threadPool_["GarbageCollectThread"]->native_handle();
  totalWorkerNum_++;
  threadPool_["OWDCalcThread"] = new std::thread(&Replica::OWDCalcThread, this);
  LOG(INFO) << "Launch OWDCalcThread "
            << threadPool_["OWDCalcThread"]->native_handle();
  // The log-hash thread is currently disabled:
  // totalWorkerNum_++;
  // threadPool_["LogHash"] = new std::thread(&Replica::LogHash, this);
  // LOG(INFO) << "Launched IndexRecvThread\t"
  //           << threadPool_["LogHash"]->native_handle();
  LOG(INFO) << "Master Thread " << pthread_self();
  LOG(INFO) << "totalWorkerNum_=" << totalWorkerNum_;
}

// Handles one CLIENT_REQUEST message: records a one-way-delay sample,
// remembers the sending proxy's address on first contact, and hands the
// request to the record queue selected by its reqKey. Messages of any other
// type, or that fail to parse, are logged and dropped.
void Replica::ReceiveClientRequest(MessageHeader* msgHdr, char* msgBuffer,
                                   Address* sender) {
  if (msgHdr->msgType == MessageType::CLIENT_REQUEST) {
    Request request;
    if (request.ParseFromArray(msgBuffer, msgHdr->msgLen)) {
      // tagQu_.enqueue(request.tagid());
      // Collect OWD sample
      uint64_t recvTime = GetMicrosecondTimestamp();
      // Only a receive time later than the send time gives a usable sample
      if (recvTime >
request.sendtime()) { owdQu_.enqueue(std::pair( request.proxyid(), GetMicrosecondTimestamp() - request.sendtime())); } if (proxyAddressMap_.get(request.proxyid()) == 0) { Address* addr = new Address(*sender); /** When one proxy sends the request, it needs to specify a proper **unique* proxyid related to one specific receiver thread on the *replica, so that this replica's different receiver threads will not *insert the same entry concurrently (otherwise, it may cause memory *leakage) * * In our proxy Implemention, each proxy machine has a unique id, with multiple shard. The machine-id concats shard-id becomes a unqiue *proxy-id, modulo replica-shard-num and then send to the replica *receiver **/ proxyAddressMap_.assign(request.proxyid(), addr); } uint64_t reqKey = CONCAT_UINT32(request.clientid(), request.reqid()); uint64_t deadline = request.sendtime() + request.bound(); RequestBody* rb = new RequestBody(deadline, reqKey, request.key(), request.proxyid(), request.command(), request.iswrite()); uint32_t quId = (reqKey) % recordQu_.size(); recordQu_[quId].enqueue(rb); } else { LOG(WARNING) << "Parse request fail"; } } else { LOG(WARNING) << "Invalid Message Type " << (uint32_t)(msgHdr->msgType); } } void Replica::BlockWhenStatusIsNot(char targetStatus) { if (status_ != targetStatus) { activeWorkerNum_.fetch_sub(1); std::unique_lock lk(waitMutext_); waitVar_.wait(lk, [this, targetStatus] { if (status_ == ReplicaStatus::TERMINATED || status_ == targetStatus) { // Unblock activeWorkerNum_.fetch_add(1); return true; } else { return false; } }); } } void Replica::OWDCalcThread() { activeWorkerNum_.fetch_add(1); std::pair owdSample; // uint32_t logCnt = 0; while (status_ != ReplicaStatus::TERMINATED) { BlockWhenStatusIsNot(ReplicaStatus::NORMAL); while (owdQu_.try_dequeue(owdSample)) { uint64_t proxyId = owdSample.first; uint32_t owd = owdSample.second; owdSampleNum_[proxyId]++; if (slidingWindow_[proxyId].size() < slidingWindowLen_) { 
slidingWindow_[proxyId].push_back(owd); } else { slidingWindow_[proxyId][owdSampleNum_[proxyId] % slidingWindowLen_] = owd; } if (owdSampleNum_[proxyId] >= slidingWindowLen_) { std::vector tmpSamples(slidingWindow_[proxyId]); sort(tmpSamples.begin(), tmpSamples.end()); uint32_t movingEstimate = tmpSamples[slidingWindowLen_ * movingPercentile_]; owdMap_.assign(proxyId, movingEstimate); } } // reduce CPU cost nanosleep((const struct timespec[]){{0, 1000000L}}, NULL); } uint32_t preVal = activeWorkerNum_.fetch_sub(1); LOG(INFO) << "OWDCalcThread Terminated: " << preVal - 1 << " worker remaining"; } void Replica::ReceiveThread(int id) { activeWorkerNum_.fetch_add(1); while (status_ != ReplicaStatus::TERMINATED) { BlockWhenStatusIsNot(ReplicaStatus::NORMAL); requestContext_[id]->Register(endPointType_); requestContext_[id]->endPoint_->LoopRun(); } uint32_t preVal = activeWorkerNum_.fetch_sub(1); LOG(INFO) << "ReceiveThread Terminated:" << preVal - 1 << " worker remaining"; } void Replica::RecordThread(int id) { activeWorkerNum_.fetch_add(1); RequestBody* rb; // uint64_t sta, ed, cnt; // cnt = 0; while (status_ != ReplicaStatus::TERMINATED) { BlockWhenStatusIsNot(ReplicaStatus::NORMAL); if (recordQu_[id].try_dequeue(rb)) { // cnt++; // if (cnt == 1) { // sta = GetMicrosecondTimestamp(); // } // if (cnt % 100000 == 0) { // ed = GetMicrosecondTimestamp(); // float rate = 100000.0 / ((ed - sta) * 1e-6); // sta = ed; // LOG(INFO) << "id=" << id << " record rate = " << rate << "\t" // << "recordQuLen=" << recordQu_[id].size_approx() << "\t" // << "processQuLen=" << processQu_.size_approx() << "\t" // << "gap sample =" << ed - rb->deadline // << " \t deadline=" << rb->deadline; // } /** The map is sharded by reqKey */ LogEntry* duplicate = recordMap_[id].get(rb->reqKey); if (duplicate == NULL) { SHA_HASH dummy; LogEntry* newEntry = new LogEntry(*rb, dummy, dummy); recordMap_[id].assign(rb->reqKey, newEntry); processQu_.enqueue(newEntry); } else { // Duplicate requests 
processQu_.enqueue(duplicate);
      }
      // The request body was copied into the LogEntry (or was a duplicate)
      delete rb;
    }
  }
  uint32_t preVal = activeWorkerNum_.fetch_sub(1);
  LOG(INFO) << "RecordThread-" << id << " Terminated: " << preVal - 1
            << " worker remaining";
}

// Worker loop of track shard `id`: follows the `next` links starting from
// trackedEntry_[id] and publishes every entry whose logId modulo the number of
// track shards equals `id` into syncedLogEntryByLogId_ and
// syncedLogEntryByReqKey_.
void Replica::TrackThread(int id) {
  activeWorkerNum_.fetch_add(1);
  while (status_ != ReplicaStatus::TERMINATED) {
    BlockWhenStatusIsNot(ReplicaStatus::NORMAL);
    if (trackedEntry_[id]->next) {
      LogEntry* next = trackedEntry_[id]->next;
      // LOG(INFO) << "next logId = " << next->logId;
      if (next->logId % trackedEntry_.size() == (uint32_t)id) {
        if (trackedEntry_[id]->logId >= CONCURRENT_MAP_START_INDEX) {
          // Sanity check: log the details before the ASSERT below fires.
          // NOTE(review): trackedEntry_[id] advances by one entry per
          // iteration (see the assignment at the end of the loop body), so
          // this equality looks like it can only hold when there is a single
          // track shard -- confirm the intended behaviour for trackShards > 1.
          uint32_t a = trackedEntry_[id]->logId;
          uint32_t b = next->logId;
          if (a + trackedEntry_.size() != b) {
            LOG(ERROR) << "myId = " << trackedEntry_[id]->logId << "\t"
                       << "sz = " << trackedEntry_.size() << "\t"
                       << "next=" << next->logId << "\t"
                       << trackedEntry_[id]->logId + trackedEntry_.size()
                       << "\t"
                       << (trackedEntry_[id]->logId + trackedEntry_.size() !=
                           next->logId)
                       << "\t"
                       << "a=" << a << "\t"
                       << "b=" << b;
          }
          ASSERT(trackedEntry_[id]->logId + trackedEntry_.size() ==
                 next->logId);
        }
        syncedLogEntryByLogId_.assign(next->logId, next);
        syncedLogEntryByReqKey_.assign(next->body.reqKey, next);
      }
      trackedEntry_[id] = next;
    }
    if (status_ == ReplicaStatus::TERMINATED) {
      LOG(INFO) << "Track Thread terminate ";
    }
  }
  uint32_t preVal = activeWorkerNum_.fetch_sub(1);
  LOG(INFO) << "TrackThread-" << id << " Terminated: " << preVal - 1
            << " worker remaining";
}

// The single processing worker. Each iteration
//  (1) takes at most one entry from processQu_ and, depending on its status,
//      places it into the early buffer, marks it as late-buffered, or routes
//      it to a fast/slow reply queue; and
//  (2) releases every early-buffer entry whose deadline has passed by calling
//      ProcessRequest on it in <deadline, reqKey> order.
void Replica::ProcessThread(int id) {
  activeWorkerNum_.fetch_add(1);
  LogEntry* entry;
  // NOTE(review): `tags` is never used in this function, and its declaration
  // carries no template argument -- confirm whether it can be removed.
  std::set tags;
  while (status_ != ReplicaStatus::TERMINATED) {
    BlockWhenStatusIsNot(ReplicaStatus::NORMAL);
    bool amLeader = AmLeader();
    if (processQu_.try_dequeue(entry)) {
      if (entry->status == EntryStatus::INITIAL) {
        // Entries are ordered in the early buffer by <deadline, reqKey>
        std::pair earlyBufferRank(entry->body.deadline, entry->body.reqKey);
        if (earlyBufferRank > lastReleasedEntryByKeys_[entry->body.opKey]) {
          earlyBuffer_[earlyBufferRank] = entry;
          entry->status = EntryStatus::IN_PROCESS;
        } else {
          // (A disabled debug log of the rejected rank used to be here.)
          // This entry cannot enter early buffer
          if (amLeader) {
            // Leader modifies its deadline
            entry->body.deadline =
                lastReleasedEntryByKeys_[entry->body.opKey].first + 1;
            earlyBufferRank.first = entry->body.deadline;
            earlyBuffer_[earlyBufferRank] = entry;
            entry->status = EntryStatus::IN_PROCESS;
          } else {
            // Followers leave it in late buffer
            entry->status = EntryStatus::IN_LATEBUFFER;
          }
        }
      } else if (entry->status == EntryStatus::IN_PROCESS ||
                 entry->status == EntryStatus::IN_LATEBUFFER) {
        // Duplicate of a request that is still pending: nothing to do yet.
        // Note that this also skips the early-buffer poll for this iteration.
        continue;
      } else if (entry->status == EntryStatus::PROCESSED) {
        uint32_t quId = (entry->body.reqKey) % fastReplyQu_.size();
        fastReplyQu_[quId].enqueue(entry);
      } else if (entry->status == EntryStatus::TO_SLOW_REPLY) {
        uint32_t quId = (entry->body.reqKey) % slowReplyQu_.size();
        slowReplyQu_[quId].enqueue(entry);
      } else {
        LOG(WARNING) << "Unexpected Entry Status " << (int)(entry->status);
      }
    }
    // Polling early-buffer
    uint64_t nowTime = GetMicrosecondTimestamp();
    // This while loop is safe because there is only one processThread.
    // Parallelization of this thread is not supported.
    while (!earlyBuffer_.empty()) {
      LogEntry* nextEntry = earlyBuffer_.begin()->second;
      if (nowTime < nextEntry->body.deadline) {
        // The earliest entry is not due yet, so none of the others are
        break;
      }
      if (nextEntry->body.isWrite) {
        // Only writes advance the per-key release watermark
        lastReleasedEntryByKeys_[nextEntry->body.opKey] =
            earlyBuffer_.begin()->first;
      }
      ProcessRequest(nextEntry, amLeader, true, amLeader);
      earlyBuffer_.erase(earlyBuffer_.begin());
    }
  }
  uint32_t preVal = activeWorkerNum_.fetch_sub(1);
  LOG(INFO) << "ProcessThread Terminated: " << preVal - 1
            << " worker remaining";
}

// Appends `entry` to the synced log (isSyncedReq == true) or to the unsynced
// log (isSyncedReq == false): computes its hashes, links it to the previous
// entry and to the previous entries on the same opKey, assigns its logId and
// marks it PROCESSED.
//   sendReply  - also enqueue the entry for a fast reply.
//   canExecute - together with isSyncedReq, decides whether the request is
//                executed through ApplicationExecute.
void Replica::ProcessRequest(LogEntry* entry, const bool isSyncedReq,
                             const bool sendReply, const bool canExecute) {
  RequestBody& rb = entry->body;
  // Read Request do not contribute to hash
  entry->logHash = entry->entryHash =
      rb.isWrite ? CalculateHash(rb.deadline, rb.reqKey) : SHA_HASH();
  // NOTE(review): `std::vector&` carries no template argument here; the
  // element type appears to be missing -- confirm against the declarations
  // of maxSyncedLogEntryByKey_ / maxUnSyncedLogEntryByKey_.
  std::vector& maxEntryByKey = isSyncedReq ?
maxSyncedLogEntryByKey_ : maxUnSyncedLogEntryByKey_; std::atomic& maxEntry = isSyncedReq ? maxSyncedLogEntry_ : maxUnSyncedLogEntry_; // The log id of the previous non-commutative entry in the synced logs entry->prevNonCommutative = maxEntryByKey[rb.opKey]; if (entry->prevNonCommutative) { if (entry->prevNonCommutative->body.isWrite) { entry->prevNonCommutativeWrite = entry->prevNonCommutative; } else { entry->prevNonCommutativeWrite = entry->prevNonCommutative->prevNonCommutativeWrite; } } entry->prev = maxEntry; entry->result = (isSyncedReq && canExecute) ? ApplicationExecute(rb) : ""; if (entry->prevNonCommutativeWrite) { entry->logHash.XOR(entry->prevNonCommutativeWrite->logHash); } ASSERT(entry->prev != NULL); entry->logId = entry->prev->logId + 1; entry->status = EntryStatus::PROCESSED; if (entry->prevNonCommutative) { entry->prevNonCommutative->nextNonCommutative = entry; } if (entry->prevNonCommutativeWrite && rb.isWrite) { entry->prevNonCommutativeWrite->nextNonCommutativeWrite = entry; } if (isSyncedReq == false && minUnSyncedLogEntryByKey_[rb.opKey] == NULL) { minUnSyncedLogEntryByKey_[rb.opKey] = entry; } entry->prev->next = entry; maxEntryByKey[rb.opKey] = entry; maxEntry = entry; if (sendReply) { uint32_t quId = (entry->body.reqKey) % fastReplyQu_.size(); fastReplyQu_[quId].enqueue(entry); } } void Replica::FastReplyThread(int id, int cvId) { activeWorkerNum_.fetch_add(1); Reply reply; reply.set_replytype(MessageType::FAST_REPLY); reply.set_replicaid(replicaId_); CrashVectorStruct* cv = crashVectorInUse_[cvId]; uint32_t replyNum = 0; // uint64_t startTime, endTime; while (status_ != ReplicaStatus::TERMINATED) { BlockWhenStatusIsNot(ReplicaStatus::NORMAL); bool amLeader = AmLeader(); safeToClearUnSyncedLogId_[id].store(prepareToClearUnSyncedLogId_.load()); // Before encoding crashVector into hash, check whether the crashVector // (cv) is the freshest one CrashVectorStruct* masterCV = crashVectorInUse_[0].load(); if (cv->version_ < masterCV->version_) { 
// My crash vector is stale, update it crashVectorInUse_[cvId] = masterCV; cv = masterCV; } LogEntry* entry = NULL; if (fastReplyQu_[id].try_dequeue(entry)) { reply.set_iswrite(entry->body.isWrite); reply.set_opkey(entry->body.opKey); replyNum++; // if (replyNum % 500000 == 0) { // LOG(INFO) << id << "QuLen=" << fastReplyQu_[id].size_approx(); // } Address* addr = proxyAddressMap_.get(entry->body.proxyId); if (!addr) { // The replica cannot find the address to send reply // This can happen in very trivial edge cases, e.g., // Step 1: This replica misses the entry // Step 2: The other replica gives this replica the missing entry // Step 3: This replica has not received any entries from that proxy, // so it does not have any addr info Step 4: This replica wants to // send reply for this entry LOG(ERROR) << "Cannot find the address of the proxy " << HIGH_32BIT(entry->body.proxyId) << "-" << LOW_32BIT(entry->body.proxyId); continue; } reply.set_view(viewId_); reply.set_clientid(HIGH_32BIT(entry->body.reqKey)); reply.set_reqid(LOW_32BIT(entry->body.reqKey)); reply.set_result(entry->result); // If the owdMap_ does not have the proxyId (i.e. 
the owd for this // proxyId has not been estimated), it will return 0 (0 happens to be // the dummy value of protobuf, and the proxy will not consider it as an // estimated owd) reply.set_owd(owdMap_.get(entry->body.proxyId)); SHA_HASH hash(entry->logHash); hash.XOR(cv->cvHash_); if (amLeader) { // Leader's logic is very easy: after XORing the crashVector and the // log entry hash together, it can directly reply reply.set_hash(hash.hash, SHA_DIGEST_LENGTH); reply.set_logid(entry->logId); reply.set_maxsyncedlogid(maxSyncedLogEntry_.load()->logId); uint32_t proxyMachineId = HIGH_32BIT(entry->body.proxyId); repliedSyncPoint_[proxyMachineId] = reply.maxsyncedlogid(); fastReplySender_[id]->SendMsgTo(*addr, reply, MessageType::FAST_REPLY); // replyLogQu_.enqueue(reply); // LOG(INFO) << "Leader reply=" << reply.reqid() << "\t" // << "opKey=" << entry->opKey << "\t" // << "hash=" << hash.toString(); } else { // But follower's hash is a bit complicated, because it needs to // consider both synced entries and unsynced entries, i.e. We need to // (1) eliminate the part to the left of sync-point and (2) use the // remaining part (to the right of sync-point) to XOR the part that // has already been synced // Let's first get the boundary, i.e. 
minUnSyncedLogId_ and // maxSyncedLogId_ maxSynced is always updated earlier than // minUnSynced, so we first get minUnSynced, and then get maxSynced, // this ensures minUnSynced is no fresher than maxSynced By contrast, // if we get the two variables in the reverse order, then we cannot be // sure which variable is fresher, that can lead to the missing of // some entries during hash calculation LogEntry* unsyncedEntry = minUnSyncedLogEntryByKey_[entry->body.opKey]; LogEntry* syncedEntry = maxSyncedLogEntryByKey_[entry->body.opKey]; if (syncedEntry && syncedEntry->body.isWrite == false) { // Only Write matters syncedEntry = syncedEntry->prevNonCommutativeWrite; assert(syncedEntry == NULL || syncedEntry->body.isWrite); } if (syncedEntry == NULL) { // The index sync process may have not been started, or may have not // catch up; Or the unsynced logs have been reclaimed by // GarbageCollectionThread (we have advanced // safeToClearUnSyncedLogId_) We cannot decide the sync-point, so // we directly reply with the XORed hash (similar to the leader) reply.set_hash(hash.hash, SHA_DIGEST_LENGTH); reply.set_maxsyncedlogid(maxSyncedLogEntry_.load()->logId); uint32_t proxyMachineId = HIGH_32BIT(entry->body.proxyId); repliedSyncPoint_[proxyMachineId] = reply.maxsyncedlogid(); fastReplySender_[id]->SendMsgTo(*addr, reply, MessageType::FAST_REPLY); // replyLogQu_.enqueue(reply); } else { // The follower already gets some synced non-commutative logs (via // index sync process) // Log entries up to syncedEntry are all synced // syncedEntry->hash represents them if (entry->LessOrEqual(*syncedEntry)) { // No need to send fast replies, because this entry has already // been covered by index sync process, just give it a dummy reply, // which includes the max-synced-log-id uint32_t proxyMachineId = HIGH_32BIT(entry->body.proxyId); if (repliedSyncPoint_[proxyMachineId] < maxSyncedLogEntry_.load()->logId) { reply.set_clientid(0); reply.set_reqid(0); reply.set_logid(0); 
reply.set_maxsyncedlogid(maxSyncedLogEntry_.load()->logId); fastReplySender_[id]->SendMsgTo(*addr, reply, MessageType::FAST_REPLY); } } else { // Beyond syncedEntry, we need to find the boundary in the unsynced // logs // TODO: Check the following // Since unsyncedLogId is no fresher (maybe older) than syncedLogId, // then unsyncedEntry may have already been surpasssed by // syncedEntry, we need to remove the (potential) overlap while (unsyncedEntry->LessOrEqual(*syncedEntry)) { if (unsyncedEntry->body.isWrite) { if (unsyncedEntry->nextNonCommutative) { unsyncedEntry = unsyncedEntry->nextNonCommutative; } else { break; } } else { if (unsyncedEntry->nextNonCommutative) { unsyncedEntry = unsyncedEntry->nextNonCommutative; } else { break; } } } // LogStruct log; // log.originalHash = hash.toString(); // hash encodes all the (unsynced) entries up to entry hash.XOR(unsyncedEntry->logHash); // Remove all previous hash // before unsyncedEntry [included] // log.unsynced = unsyncedEntry; // log.addback = false; if (syncedEntry->LessThan(*unsyncedEntry)) { // add itself back (read request is 0) hash.XOR(unsyncedEntry->entryHash); // log.addback = true; } // Now hash only encodes [unsyncedEntry, entry] // Let's add the synced part // log.synced = syncedEntry; hash.XOR(syncedEntry->logHash); // log.finalE = entry; // entryQu_.enqueue(log); reply.set_hash(hash.hash, SHA_DIGEST_LENGTH); reply.set_maxsyncedlogid(maxSyncedLogEntry_.load()->logId); uint32_t proxyMachineId = HIGH_32BIT(entry->body.proxyId); repliedSyncPoint_[proxyMachineId] = reply.maxsyncedlogid(); fastReplySender_[id]->SendMsgTo(*addr, reply, MessageType::FAST_REPLY); // replyLogQu_.enqueue(reply); } } } } } uint32_t preVal = activeWorkerNum_.fetch_sub(1); LOG(INFO) << "Fast Reply Terminated " << preVal - 1 << " worker remaining"; } void Replica::SlowReplyThread(int id) { activeWorkerNum_.fetch_add(1); Reply reply; reply.set_replicaid(replicaId_); reply.set_hash(""); // uint32_t replyNum = 0; // uint64_t 
startTime, endTime; while (status_ != ReplicaStatus::TERMINATED) { BlockWhenStatusIsNot(ReplicaStatus::NORMAL); if (AmLeader()) { // Leader does not send slow replies nanosleep((const struct timespec[]){{0, 1000000L}}, NULL); continue; } LogEntry* entry = NULL; if (slowReplyQu_[id].try_dequeue(entry)) { uint32_t logId = entry->logId; reply.set_view(viewId_); reply.set_clientid((entry->body.reqKey) >> 32); reply.set_reqid((uint32_t)(entry->body.reqKey)); // Optimize: SLOW_REPLY => COMMIT_REPLY if (logId <= committedLogId_) { reply.set_replytype(MessageType::COMMIT_REPLY); reply.set_result(entry->result); } else { reply.set_replytype(MessageType::SLOW_REPLY); reply.set_result(""); } reply.set_owd(owdMap_.get(entry->body.proxyId)); reply.set_maxsyncedlogid(maxSyncedLogEntry_.load()->logId); Address* addr = proxyAddressMap_.get(entry->body.proxyId); if (addr) { slowReplySender_[id]->SendMsgTo(*addr, reply, MessageType::SLOW_REPLY); } // replyNum++; // if (replyNum == 1) { // startTime = GetMicrosecondTimestamp(); // } else if (replyNum % 100000 == 0) { // endTime = GetMicrosecondTimestamp(); // float rate = 100000 / ((endTime - startTime) * 1e-6); // LOG(INFO) << "id=" << id << "\t Slow Reply Rate=" << rate // << "\t QuLen=" << slowReplyQu_[id].size_approx() << "\t" // << "pendingIndexSync_ qu =" << pendingIndexSync_.size(); // startTime = endTime; // } } } uint32_t preVal = activeWorkerNum_.fetch_sub(1); LOG(INFO) << "SlowReplyThread Terminated " << preVal - 1 << " worker remaining "; } void Replica::IndexSendThread(int id, int cvId) { activeWorkerNum_.fetch_add(1); LogEntry* lastSyncedEntry = syncedLogEntryHead_; IndexSync indexSyncMsg; uint32_t syncPeriod = replicaConfig_.indexSyncPeriodUs; struct timespec sleepIntval({0, syncPeriod * 1000}); while (status_ != ReplicaStatus::TERMINATED) { BlockWhenStatusIsNot(ReplicaStatus::NORMAL); if (!AmLeader()) { // Although this replica is not leader currently, // we still keep this thread. 
When it becomes the leader // we can immediately use the thread instead of launching extra threads // (slowly) nanosleep((const struct timespec[]){{0, 1000000L}}, NULL); continue; } if (maxSyncedLogEntry_ == NULL) { continue; } // (1) Leader has some indices to sync // (2) There is noting to send, but we still send an indexSync msg every // 10ms (to serve as leader's heartbeat) indexSyncMsg.set_view(viewId_); indexSyncMsg.set_logidbegin(lastSyncedEntry->logId + 1); uint32_t logEnd = maxSyncedLogEntry_.load()->logId; logEnd = std::min(indexSyncMsg.logidbegin() + indexTransferBatch_, logEnd); indexSyncMsg.set_logidend(logEnd); indexSyncMsg.clear_deadlines(); indexSyncMsg.clear_reqkeys(); for (uint32_t i = indexSyncMsg.logidbegin(); i <= indexSyncMsg.logidend(); i++) { LogEntry* entry = lastSyncedEntry->next; ASSERT(entry != NULL); ASSERT(entry->logId == i); indexSyncMsg.add_deadlines(entry->body.deadline); indexSyncMsg.add_reqkeys(entry->body.reqKey); lastSyncedEntry = entry; } indexSyncMsg.set_sendtime(GetMicrosecondTimestamp()); // Send to all followers for (uint32_t r = 0; r < replicaNum_; r++) { if (r != replicaId_) { indexSender_[id]->SendMsgTo(*(indexReceiver_[r]), indexSyncMsg, MessageType::SYNC_INDEX); } } nanosleep(&sleepIntval, NULL); } uint32_t preVal = activeWorkerNum_.fetch_sub(1); LOG(INFO) << "IndexSendThread Terminated " << preVal - 1 << " worker remaining"; } void Replica::IndexRecvThread() { activeWorkerNum_.fetch_add(1); while (status_ != ReplicaStatus::TERMINATED) { BlockWhenStatusIsNot(ReplicaStatus::NORMAL); indexSyncContext_->Register(endPointType_); indexSyncContext_->endPoint_->LoopRun(); } uint32_t preVal = activeWorkerNum_.fetch_sub(1); LOG(INFO) << "IndexRecvThread Terminated " << preVal - 1 << " worker remaining"; } void Replica::ReceiveIndexSyncMessage(MessageHeader* msgHdr, char* msgBuffer) { // Promise to the GarbageCollectThread, that I will not use the data before // safeToClearLateBufferLogId_ and safeToClearUnSyncedLogId_, so that 
// GarbageCollectThread can safely reclaim them safeToClearLateBufferLogId_.store(prepareToClearLateBufferLogId_.load()); safeToClearUnSyncedLogId_[fastReplyQu_.size()].store( prepareToClearUnSyncedLogId_.load()); MessageHeader* newMsgHdr = new MessageHeader(msgHdr->msgType, msgHdr->msgLen); char* newBuffer = new char[msgHdr->msgLen]; memcpy(newBuffer, msgBuffer, msgHdr->msgLen); indexQu_.enqueue({newMsgHdr, newBuffer}); } void Replica::IndexProcessThread() { activeWorkerNum_.fetch_add(1); std::pair ele; while (status_ != ReplicaStatus::TERMINATED) { BlockWhenStatusIsNot(ReplicaStatus::NORMAL); while (indexQu_.try_dequeue(ele)) { MessageHeader* msgHdr = ele.first; char* msgBuffer = ele.second; if (msgHdr->msgType == MessageType::SYNC_INDEX) { IndexSync idxSyncMsg; if (idxSyncMsg.ParseFromArray(msgBuffer, msgHdr->msgLen)) { if (!CheckView(idxSyncMsg.view(), false)) { delete msgHdr; delete[] msgBuffer; break; } lastHeartBeatTime_ = GetMicrosecondTimestamp(); if (idxSyncMsg.logidbegin() > idxSyncMsg.logidend()) { // Pure heart beat continue; } if (idxSyncMsg.logidend() > maxSyncedLogEntry_.load()->logId) { std::pair key(idxSyncMsg.logidbegin(), idxSyncMsg.logidend()); pendingIndexSync_[key] = idxSyncMsg; } // Process pendingIndexSync, if any while (!pendingIndexSync_.empty()) { if (ProcessIndexSync(pendingIndexSync_.begin()->second)) { pendingIndexSync_.erase(pendingIndexSync_.begin()); } else { break; } } } } else if (msgHdr->msgType == MessageType::MISSED_REQ) { MissedReq missedReqMsg; if (missedReqMsg.ParseFromArray(msgBuffer, msgHdr->msgLen)) { for (int i = 0; i < missedReqMsg.reqs().size(); i++) { const RequestBodyMsg& rbMsg = missedReqMsg.reqs(i); if (missedReqKeys_.find(rbMsg.reqkey()) != missedReqKeys_.end()) { RequestBody* rb = new RequestBody( rbMsg.deadline(), rbMsg.reqkey(), rbMsg.key(), rbMsg.proxyid(), rbMsg.command(), rbMsg.iswrite()); // We must handle it to ProcessThread instead of processing it // here, to avoid data race (and further memroy 
leakage), although // it is a trivial possibility uint32_t quId = rbMsg.reqkey() % recordQu_.size(); recordQu_[quId].enqueue(rb); missedReqKeys_.erase(rbMsg.reqkey()); fetchTime_.push_back(GetMicrosecondTimestamp() - askTimebyReqKey_[rbMsg.reqkey()]); askTimebyReqKey_.erase(rbMsg.reqkey()); } } } } else { LOG(WARNING) << "Unexpected msg type " << (int)(msgHdr->msgType); } delete msgHdr; delete[] msgBuffer; } } uint32_t preVal = activeWorkerNum_.fetch_sub(1); LOG(INFO) << "IndexProcessThread Terminated: " << preVal - 1 << " worker remaining"; } bool Replica::ProcessIndexSync(const IndexSync& idxSyncMsg) { uint32_t maxSyncedLogId = maxSyncedLogEntry_.load()->logId; if (idxSyncMsg.logidend() <= maxSyncedLogId) { // This idxSyncMsg is useless return true; } if (idxSyncMsg.logidbegin() > maxSyncedLogId + 1) { // Missing some indices missedIndices_ = {maxSyncedLogId + 1, idxSyncMsg.logidbegin() - 1}; AskMissedIndex(); return false; } // Coming here means, no index is missing if (indexSyncContext_->endPoint_->isTimerRegistered(indexAskTimer_)) { indexSyncContext_->endPoint_->UnRegisterTimer(indexAskTimer_); } for (uint32_t logId = maxSyncedLogId + 1; logId <= idxSyncMsg.logidend(); logId++) { uint32_t offset = logId - idxSyncMsg.logidbegin(); uint64_t reqKey = idxSyncMsg.reqkeys(offset); uint64_t deadline = idxSyncMsg.deadlines(offset); uint32_t quId = reqKey % recordMap_.size(); LogEntry* entry = recordMap_[quId].get(reqKey); if (entry && missedReqKeys_.empty()) { SHA_HASH myHash; SHA_HASH hash; if (entry->body.isWrite) { myHash = CalculateHash(deadline, reqKey); hash = myHash; } LogEntry* prevNonCommutative = maxSyncedLogEntryByKey_[entry->body.opKey]; LogEntry* prevNonCommutativeWrite = NULL; if (prevNonCommutative) { if (prevNonCommutative->body.isWrite) { prevNonCommutativeWrite = prevNonCommutative; } else { prevNonCommutativeWrite = prevNonCommutative->prevNonCommutativeWrite; } } assert(prevNonCommutativeWrite == NULL || prevNonCommutativeWrite->body.isWrite); if 
(prevNonCommutativeWrite) { // This request has some pre non-commutative ones // In that way, XOR the previous accumulated hash hash.XOR(prevNonCommutativeWrite->logHash); } LogEntry* newEntry = new LogEntry(entry->body, myHash, hash, prevNonCommutative, NULL, prevNonCommutativeWrite, NULL, maxSyncedLogEntry_, NULL); newEntry->status = EntryStatus::TO_SLOW_REPLY; newEntry->logId = logId; ASSERT(logId == maxSyncedLogEntry_.load()->logId + 1); maxSyncedLogEntry_.load()->next = newEntry; if (prevNonCommutative) { prevNonCommutative->nextNonCommutative = newEntry; } if (newEntry->body.isWrite && prevNonCommutativeWrite) { prevNonCommutativeWrite->nextNonCommutativeWrite = newEntry; } // uint32_t prevMaxLogId = maxSyncedLogEntry_.load()->logId; maxSyncedLogEntry_ = newEntry; ASSERT(maxSyncedLogEntry_.load()->logId == logId); ASSERT(prevMaxLogId + 1 == logId); maxSyncedLogEntryByKey_[newEntry->body.opKey] = newEntry; uint32_t quId = (newEntry->body.reqKey) % slowReplyQu_.size(); slowReplyQu_[quId].enqueue(newEntry); ASSERT(newEntry->prev->logId + 1 == newEntry->logId); // TODO: Think about the order above // Chunk UnSynced logs if (minUnSyncedLogEntryByKey_[newEntry->body.opKey]) { // Try to advance minUnSyncedLogIdByKey_[opKey] LogEntry* unSyncedEntry = minUnSyncedLogEntryByKey_[newEntry->body.opKey]; while (unSyncedEntry->LessOrEqual(*entry)) { if (unSyncedEntry->body.isWrite) { if (unSyncedEntry->nextNonCommutativeWrite) { unSyncedEntry = unSyncedEntry->nextNonCommutativeWrite; } else { break; } } else { if (unSyncedEntry->nextNonCommutative) { unSyncedEntry = unSyncedEntry->nextNonCommutative; } else { break; } } } minUnSyncedLogEntryByKey_[newEntry->body.opKey] = unSyncedEntry; } } else { missedReqKeys_.insert(reqKey); } } if (missedReqKeys_.empty()) { return true; } else { AskMissedRequest(); return false; } } void Replica::MissedIndexAckThread() { activeWorkerNum_.fetch_add(1); while (status_ != ReplicaStatus::TERMINATED) { 
BlockWhenStatusIsNot(ReplicaStatus::NORMAL); missedIndexAckContext_->Register(endPointType_); missedIndexAckContext_->endPoint_->LoopRun(); } uint32_t preVal = activeWorkerNum_.fetch_sub(1); LOG(INFO) << "MissedIndexAckThread Terminated " << preVal - 1 << " worker remaining"; } void Replica::ReceiveAskMissedIdx(MessageHeader* msgHdr, char* msgBuffer) { AskIndex askIndex; if (msgHdr->msgType == MessageType::MISSED_INDEX_ASK && askIndex.ParseFromArray(msgBuffer, msgHdr->msgLen)) { uint32_t logBegin = askIndex.logidbegin(); uint32_t logEnd = std::min(maxSyncedLogEntry_.load()->logId, askIndex.logidend()); for (uint32_t i = logBegin; i <= logEnd; i += indexTransferBatch_) { IndexSync indexSyncMsg; indexSyncMsg.set_view(viewId_); indexSyncMsg.set_logidbegin(i); uint32_t end = std::min(i + indexTransferBatch_ - 1, logEnd); indexSyncMsg.set_logidend(end); uint32_t logid = i; LogEntry* entryStart = syncedLogEntryByLogId_.get(logid); if (!entryStart) { // Since the update of syncedLogEntryByLogId_ may lag a bit behind // maxSyncedLogEntry_. entryStart may be NULL. 
In that case, we // terminate here break; } ASSERT(entryStart->logId == logid); while (entryStart->logId <= end) { indexSyncMsg.add_deadlines(entryStart->body.deadline); indexSyncMsg.add_reqkeys(entryStart->body.reqKey); entryStart = entryStart->next; } indexAcker_->SendMsgTo(*(indexReceiver_[askIndex.replicaid()]), indexSyncMsg, MessageType::SYNC_INDEX); } } } void Replica::MissedReqAckThread() { activeWorkerNum_.fetch_add(1); while (status_ != ReplicaStatus::TERMINATED) { BlockWhenStatusIsNot(ReplicaStatus::NORMAL); missedReqAckContext_->Register(endPointType_); missedReqAckContext_->endPoint_->LoopRun(); } uint32_t preVal = activeWorkerNum_.fetch_sub(1); LOG(INFO) << "MissedReqAckThread Terminated " << preVal - 1 << " worker remaining"; } void Replica::ReceiveAskMissedReq(MessageHeader* msgHdr, char* msgBuffer) { AskReq askReqMsg; if (msgHdr->msgType == MessageType::MISSED_REQ_ASK && askReqMsg.ParseFromArray(msgBuffer, msgHdr->msgLen)) { MissedReq missedReqMsg; missedReqMsg.set_replicaid(this->replicaId_); for (int i = 0; i < askReqMsg.missedreqkeys_size(); i++) { uint64_t reqKey = askReqMsg.missedreqkeys(i); uint32_t quId = reqKey % recordMap_.size(); LogEntry* entry = recordMap_[quId].get(reqKey); if (entry) { RequestBodyToMessage(entry->body, missedReqMsg.add_reqs()); } if ((uint32_t)(missedReqMsg.reqs_size()) >= requestTrasnferBatch_) { missedReqAckContext_->endPoint_->SendMsgTo( *(indexReceiver_[askReqMsg.replicaid()]), missedReqMsg, MessageType::MISSED_REQ); missedReqMsg.clear_reqs(); } } if (missedReqMsg.reqs_size() > 0) { // This ack is useful because it really contains some missed requests, // so send it missedReqAckContext_->endPoint_->SendMsgTo( *(indexReceiver_[askReqMsg.replicaid()]), missedReqMsg, MessageType::MISSED_REQ); } } } void Replica::RequestBodyToMessage(const RequestBody& rb, RequestBodyMsg* rbMsg) { rbMsg->set_deadline(rb.deadline); rbMsg->set_reqkey(rb.reqKey); rbMsg->set_proxyid(rb.proxyId); rbMsg->set_command(rb.command); 
rbMsg->set_key(rb.opKey); rbMsg->set_iswrite(rb.isWrite); } void Replica::AskMissedIndex() { if (missedIndices_.first > missedIndices_.second) { // indexSyncContext_->endPoint_->UnRegisterTimer(indexAskTimer_); return; } uint64_t nowTime = GetMicrosecondTimestamp(); if (lastAskMissedIndexTime_ + 50 > nowTime) { return; } AskIndex askIndexMsg; askIndexMsg.set_replicaid(this->replicaId_); askIndexMsg.set_logidbegin(missedIndices_.first); askIndexMsg.set_logidend(missedIndices_.second); // roundRobinIndexAskIdx_ = 0;// Debug // Do not ask leader every time, choose random replica to ask to avoid // leader bottleneck indexRequester_->SendMsgTo( *(indexAskReceiver_[roundRobinIndexAskIdx_ % replicaNum_]), askIndexMsg, MessageType::MISSED_INDEX_ASK); roundRobinIndexAskIdx_++; if (roundRobinIndexAskIdx_ % replicaNum_ == replicaId_) { roundRobinIndexAskIdx_++; } lastAskMissedIndexTime_ = GetMicrosecondTimestamp(); } void Replica::AskMissedRequest() { if (missedReqKeys_.empty()) { // no need to start timer return; } uint64_t nowTime = GetMicrosecondTimestamp(); if (lastAskMissedIndexTime_ + 50 > nowTime) { return; } AskReq askReqMsg; askReqMsg.set_replicaid(this->replicaId_); for (const uint64_t& reqKey : missedReqKeys_) { askReqMsg.add_missedreqkeys(reqKey); if ((uint32_t)(askReqMsg.missedreqkeys_size()) >= requestKeyTransferBatch_) { reqRequester_->SendMsgTo( *(requestAskReceiver_[roundRobinRequestAskIdx_ % replicaNum_]), askReqMsg, MessageType::MISSED_REQ_ASK); roundRobinRequestAskIdx_++; if (roundRobinRequestAskIdx_ % replicaNum_ == replicaId_) { roundRobinRequestAskIdx_++; } askReqMsg.clear_missedreqkeys(); } askTimebyReqKey_[reqKey] = GetMicrosecondTimestamp(); } if (askReqMsg.missedreqkeys_size() > 0) { reqRequester_->SendMsgTo(*(requestAskReceiver_[viewId_ % replicaNum_]), askReqMsg, MessageType::MISSED_REQ_ASK); roundRobinRequestAskIdx_++; if (roundRobinRequestAskIdx_ % replicaNum_ == replicaId_) { roundRobinRequestAskIdx_++; } lastAskMissedRequestTime_ = 
GetMicrosecondTimestamp(); } } void Replica::GarbageCollectThread() { activeWorkerNum_.fetch_add(1); while (status_ != ReplicaStatus::TERMINATED) { BlockWhenStatusIsNot(ReplicaStatus::NORMAL); // Reclaim stale crashVector ReclaimStaleCrashVector(); // Reclaim (unsynced) stale logs ReclaimStaleLogs(); // Check LateBuffer and UnSyncedLog items and try to advance // prepareToClearLateBufferLogId_ and prepareToClearUnSyncedLogId_ PrepareNextReclaim(); } uint32_t preVal = activeWorkerNum_.fetch_sub(1); LOG(INFO) << "GarbageCollectThread Terminated " << preVal - 1 << " worker remaining"; } void Replica::ReclaimStaleCrashVector() { uint32_t masterCVVersion = crashVectorInUse_[0].load()->version_; while (cvVersionToClear_ <= masterCVVersion) { bool canDelete = true; for (uint32_t i = 0; i < crashVectorVecSize_; i++) { if (crashVectorInUse_[i].load()->version_ <= cvVersionToClear_) { canDelete = false; break; } } if (canDelete) { CrashVectorStruct* cvToClear = crashVector_.get(cvVersionToClear_); crashVector_.erase(cvVersionToClear_); delete cvToClear; cvVersionToClear_++; } else { break; } } } void Replica::ReclaimStaleLogs() { uint32_t safePoint = prepareToClearUnSyncedLogId_; for (uint32_t shardIdx = 0; shardIdx < fastReplyQu_.size() + 1; shardIdx++) { safePoint = std::min(safePoint, safeToClearUnSyncedLogId_[shardIdx].load()); } // Reclaim UnSynced Entries // Reclaim Entries in late-buffer safePoint = safeToClearLateBufferLogId_; } void Replica::PrepareNextReclaim() {} void Replica::CheckHeartBeat() { if (status_ == ReplicaStatus::TERMINATED) { masterContext_->endPoint_->LoopBreak(); return; } if (AmLeader()) { return; } if (status_ != ReplicaStatus::NORMAL) { // Some worker threads have detected viewchange and switch status_ to // VIEWCHANGE But workers have no priviledge to increment viewId_ and // initiate view change process, so the master will do that VLOG(2) << "InitiateViewChange-10"; InitiateViewChange(viewId_ + 1); return; } uint64_t nowTime = 
GetMicrosecondTimestamp(); uint64_t threashold = replicaConfig_.heartbeatThresholdMs * 1000; if (lastHeartBeatTime_ + threashold < nowTime) { // I haven't heard from the leader for too long, it probably has died // Before start view change, clear context VLOG(2) << "InitiateViewChange-1"; InitiateViewChange(viewId_ + 1); } } void Replica::ReceiveMasterMessage(MessageHeader* msgHdr, char* msgBuffer) { VLOG(4) << "msgType " << (uint32_t)(msgHdr->msgType); if (msgHdr->msgType == MessageType::VIEWCHANGE_REQ) { ViewChangeRequest viewChangeReq; if (viewChangeReq.ParseFromArray(msgBuffer, msgHdr->msgLen)) { ProcessViewChangeReq(viewChangeReq); } } else if (msgHdr->msgType == MessageType::VIEWCHANGE_MSG) { ViewChange viewChangeMsg; if (viewChangeMsg.ParseFromArray(msgBuffer, msgHdr->msgLen)) { ProcessViewChange(viewChangeMsg); } } else if (msgHdr->msgType == MessageType::STATE_TRANSFER_REQUEST) { StateTransferRequest stateTransferReq; if (stateTransferReq.ParseFromArray(msgBuffer, msgHdr->msgLen)) { ProcessStateTransferRequest(stateTransferReq); } } else if (msgHdr->msgType == MessageType::STATE_TRANSFER_REPLY) { StateTransferReply stateTransferRep; if (stateTransferRep.ParseFromArray(msgBuffer, msgHdr->msgLen)) { ProcessStateTransferReply(stateTransferRep); } } else if (msgHdr->msgType == MessageType::START_VIEW) { StartView startView; if (startView.ParseFromArray(msgBuffer, msgHdr->msgLen)) { ProcessStartView(startView); } } else if (msgHdr->msgType == MessageType::CRASH_VECTOR_REQUEST) { CrashVectorRequest request; if (request.ParseFromArray(msgBuffer, msgHdr->msgLen)) { ProcessCrashVectorRequest(request); } } else if (msgHdr->msgType == MessageType::CRASH_VECTOR_REPLY) { CrashVectorReply reply; if (reply.ParseFromArray(msgBuffer, msgHdr->msgLen)) { VLOG(2) << "CrashVectorReply = " << reply.DebugString(); ProcessCrashVectorReply(reply); } } else if (msgHdr->msgType == MessageType::RECOVERY_REQUEST) { RecoveryRequest request; if (request.ParseFromArray(msgBuffer, 
msgHdr->msgLen)) { ProcessRecoveryRequest(request); } } else if (msgHdr->msgType == MessageType::RECOVERY_REPLY) { RecoveryReply reply; if (reply.ParseFromArray(msgBuffer, msgHdr->msgLen)) { ProcessRecoveryReply(reply); } } else if (msgHdr->msgType == MessageType::SYNC_STATUS_REPORT) { SyncStatusReport report; if (report.ParseFromArray(msgBuffer, msgHdr->msgLen)) { ProcessSyncStatusReport(report); } } else if (msgHdr->msgType == MessageType::COMMIT_INSTRUCTION) { CommitInstruction commit; if (commit.ParseFromArray(msgBuffer, msgHdr->msgLen)) { ProcessCommitInstruction(commit); } } else { LOG(WARNING) << "Unexpected message type " << (int)msgBuffer[0]; } } void Replica::SendViewChangeRequest(const int toReplicaId) { ViewChangeRequest viewChangeReq; viewChangeReq.set_view(viewId_); viewChangeReq.set_replicaid(replicaId_); CrashVectorStruct* cv = crashVectorInUse_[0].load(); viewChangeReq.mutable_cv()->Add(cv->cv_.begin(), cv->cv_.end()); if (toReplicaId < 0) { // send to all for (uint32_t i = 0; i < replicaNum_; i++) { if (i != replicaId_) { // no need to send to myself masterContext_->endPoint_->SendMsgTo( *(masterReceiver_[i]), viewChangeReq, MessageType::VIEWCHANGE_REQ); } } } else { masterContext_->endPoint_->SendMsgTo(*(masterReceiver_[toReplicaId]), viewChangeReq, MessageType::VIEWCHANGE_REQ); } } void Replica::SendViewChange() { if (AmLeader()) { // I am the leader of this new view, no need to send to myself return; } ViewChange viewChangeMsg; viewChangeMsg.set_view(viewId_); viewChangeMsg.set_replicaid(replicaId_); CrashVectorStruct* cv = crashVectorInUse_[0].load(); viewChangeMsg.mutable_cv()->Add(cv->cv_.begin(), cv->cv_.end()); viewChangeMsg.set_syncpoint(maxSyncedLogEntry_.load()->logId); if (filteredUnSyncedEntries_.size() > 1) { viewChangeMsg.set_unsynclogbegin(1); viewChangeMsg.set_unsynclogend(filteredUnSyncedEntries_.size() - 1); } else { viewChangeMsg.set_unsynclogbegin(0); viewChangeMsg.set_unsynclogend(0); } 
viewChangeMsg.set_lastnormalview(lastNormalView_); masterContext_->endPoint_->SendMsgTo( *(masterReceiver_[viewId_ % replicaNum_]), viewChangeMsg, MessageType::VIEWCHANGE_MSG); } void Replica::InitiateViewChange(const uint32_t view) { if (viewId_ > view) { LOG(ERROR) << "Invalid view change initiation currentView=" << viewId_ << "\ttargetView=" << view; return; } if (viewId_ == view && status_ == ReplicaStatus::VIEWCHANGE) { // Already in viewchange return; } status_ = ReplicaStatus::VIEWCHANGE; LOG(INFO) << "status =" << (int)status_ << "\t" << " view=" << viewId_ << "\t" << " targeting view=" << view; // Wait until every worker stop while (activeWorkerNum_ > 0) { usleep(1000); } /** Since the update of syncedLogEntryByReqKey_ and syncedLogEntryByLogId_ * may have not been completed when they encounter view change, let's first * complete (flush) them */ LogEntry* minTrackedEntry = trackedEntry_[0]; for (uint32_t i = 0; i < trackedEntry_.size(); i++) { if (minTrackedEntry->logId > trackedEntry_[i]->logId) { minTrackedEntry = trackedEntry_[i]; } } while (minTrackedEntry->next) { LogEntry* next = minTrackedEntry->next; if (syncedLogEntryByLogId_.get(next->logId) == NULL) { syncedLogEntryByLogId_.assign(next->logId, next); syncedLogEntryByReqKey_.assign(next->body.reqKey, next); } minTrackedEntry = next; } trackedEntry_.assign(trackedEntry_.size(), minTrackedEntry); LogEntry* entryStart = minUnSyncedLogEntry_; if (entryStart->logId < CONCURRENT_MAP_START_INDEX) { // This is dummy, move to its next; entryStart = entryStart->next; } filteredUnSyncedEntries_.clear(); filteredUnSyncedEntries_.resize( 1); // Reserve 1 slot as dummy value [because 0 has special use] while (entryStart) { LogEntry* entry = syncedLogEntryByReqKey_.get(entryStart->body.reqKey); if (!entry) { // Has not been synced filteredUnSyncedEntries_.push_back(entryStart); } entryStart = entryStart->next; } viewId_ = view; // Unregister all timers, except the monitorTimer (so as the master thread // can 
break when status=Terminated) masterContext_->endPoint_->UnRegisterAllTimers(); masterContext_->endPoint_->RegisterTimer(masterContext_->monitorTimer_); LOG(INFO) << "Monitor Timer Registered " << "viewId=" << viewId_ << "\t" << "maxSyncedLogId=" << maxSyncedLogEntry_.load()->logId << "\t" << "committedLogId=" << committedLogId_ << "\t" << "filteredUnSyncedEntries_.size()=" << filteredUnSyncedEntries_.size() << "\t" << "currentTime=" << GetMicrosecondTimestamp() << "\t"; // Launch viewChange timer masterContext_->endPoint_->RegisterTimer(viewChangeTimer_); } void Replica::BroadcastViewChange() { if (status_ == ReplicaStatus::NORMAL) { // Can stop the timer masterContext_->endPoint_->UnRegisterTimer(viewChangeTimer_); return; } // Broadcast VIEW-CHANGE-REQ to all replicas SendViewChangeRequest(-1); // Send VIEW-CHANGE to the leader in this view SendViewChange(); } void Replica::SendStartView(const int toReplicaId) { StartView startView; startView.set_replicaid(replicaId_); startView.set_view(viewId_); CrashVectorStruct* cv = crashVectorInUse_[0]; startView.mutable_cv()->Add(cv->cv_.begin(), cv->cv_.end()); // startView.set_syncedlogid(maxSyncedLogId_); startView.set_syncedlogid(maxSyncedLogEntry_.load()->logId); if (toReplicaId >= 0) { // send to one masterContext_->endPoint_->SendMsgTo(*(masterReceiver_[toReplicaId]), startView, MessageType::START_VIEW); } else { // send to all for (uint32_t i = 0; i < replicaNum_; i++) { if (i == replicaId_) { // No need to send to self continue; } masterContext_->endPoint_->SendMsgTo(*(masterReceiver_[i]), startView, MessageType::START_VIEW); VLOG(2) << "Send StartView to " << i << "\t" << masterReceiver_[i]->GetIPAsString() << ":" << masterReceiver_[i]->GetPortAsInt(); } } } void Replica::SendSyncStatusReport() { SyncStatusReport report; report.set_view(viewId_); report.set_replicaid(replicaId_); CrashVectorStruct* cv = crashVectorInUse_[0].load(); report.mutable_cv()->Add(cv->cv_.begin(), cv->cv_.end()); // 
report.set_syncedlogid(maxSyncedLogId_); report.set_syncedlogid(maxSyncedLogEntry_.load()->logId); if (AmLeader()) { // leader directly process its own report ProcessSyncStatusReport(report); } else { // send to leader masterContext_->endPoint_->SendMsgTo( *(masterReceiver_[viewId_ % replicaNum_]), report, MessageType::SYNC_STATUS_REPORT); } } void Replica::SendCommit() { CommitInstruction commit; commit.set_view(viewId_); commit.set_replicaid(replicaId_); CrashVectorStruct* cv = crashVectorInUse_[0].load(); commit.mutable_cv()->Add(cv->cv_.begin(), cv->cv_.end()); commit.set_committedlogid(committedLogId_); // LOG(INFO) << "commit " << commit.DebugString(); for (uint32_t i = 0; i < replicaNum_; i++) { if (i != replicaId_) { masterContext_->endPoint_->SendMsgTo(*(masterReceiver_[i]), commit, MessageType::COMMIT_INSTRUCTION); } } } void Replica::ProcessViewChangeReq(const ViewChangeRequest& viewChangeReq) { if (status_ == ReplicaStatus::RECOVERING) { // Recovering replicas do not participate in view change return; } if (!CheckCV(viewChangeReq.replicaid(), viewChangeReq.cv())) { // stray message return; } if (Aggregated(viewChangeReq.cv())) { // If cv is updated, then it is likely that some messages in // viewChangeSet_ become stray, so remove them for (uint32_t i = 0; i < replicaNum_; i++) { auto iter = viewChangeSet_.find(i); if (iter != viewChangeSet_.end() && (!CheckCV(i, iter->second.cv()))) { viewChangeSet_.erase(i); } } } if (viewChangeReq.view() > viewId_) { VLOG(2) << "InitiateViewChange-2"; InitiateViewChange(viewChangeReq.view()); } else { if (status_ == ReplicaStatus::NORMAL) { SendStartView(viewChangeReq.replicaid()); } else { SendViewChange(); } } } void Replica::ProcessViewChange(const ViewChange& viewChange) { // LOG(INFO) << "viewChange: " << viewChange.DebugString(); if (status_ == ReplicaStatus::RECOVERING) { // Recovering replicas do not participate in view change return; } if (!CheckCV(viewChange.replicaid(), viewChange.cv())) { // stray message 
LOG(WARNING) << "Stray Message"; return; } Aggregated(viewChange.cv()); if (status_ == ReplicaStatus::NORMAL) { if (viewChange.view() > viewId_) { VLOG(2) << "InitiateViewChange-3"; InitiateViewChange(viewChange.view()); } else { // The sender lags behind SendStartView(viewChange.replicaid()); } } else if (status_ == ReplicaStatus::VIEWCHANGE) { if (viewChange.view() > viewId_) { VLOG(2) << "InitiateViewChange-4"; InitiateViewChange(viewChange.view()); } else if (viewChange.view() < viewId_) { SendViewChangeRequest(viewChange.replicaid()); } // viewChange.view() == viewId else if (viewChangeSet_.size() >= replicaNum_ / 2 + 1) { // We have got enough valid viewchange messages, no need for this one return; } else { ASSERT(AmLeader()); viewChangeSet_[viewChange.replicaid()] = viewChange; VLOG(3) << "viewChangeSet Size=" << viewChangeSet_.size(); // If cv is updated, then it is likely that some messages in // viewChangeSet_ become stray, so remove them for (uint32_t i = 0; i < replicaNum_; i++) { auto iter = viewChangeSet_.find(i); if (iter != viewChangeSet_.end() && (!CheckCV(i, iter->second.cv()))) { viewChangeSet_.erase(i); } } if (viewChangeSet_.size() >= replicaNum_ / 2) { ASSERT(viewChangeSet_.find(replicaId_) == viewChangeSet_.end()); // Got f viewChange // Plus myself, got f+1 viewChange messages ViewChange myvc; CrashVectorStruct* masterCV = crashVectorInUse_[0].load(); myvc.mutable_cv()->Add(masterCV->cv_.begin(), masterCV->cv_.end()); myvc.set_view(viewId_); myvc.set_replicaid(replicaId_); // myvc.set_syncpoint(maxSyncedLogId_); // myvc.set_unsynclogbegin(minUnSyncedLogId_); // myvc.set_unsynclogend(maxUnSyncedLogId_); myvc.set_syncpoint(maxSyncedLogEntry_.load()->logId); if (filteredUnSyncedEntries_.size() > 1) { myvc.set_unsynclogbegin(1); myvc.set_unsynclogend(filteredUnSyncedEntries_.size() - 1); } else { myvc.set_unsynclogbegin(0); myvc.set_unsynclogend(0); } myvc.set_lastnormalview(lastNormalView_); viewChangeSet_[replicaId_] = myvc; // Has got enough 
viewChange messages, stop viewChangeTimer masterContext_->endPoint_->UnRegisterTimer(viewChangeTimer_); TransferSyncedLog(); } } } else { LOG(WARNING) << "Unexpected Status " << status_; } } void Replica::TransferSyncedLog() { uint32_t largestNormalView = lastNormalView_; uint32_t maxSyncedLogId = maxSyncedLogEntry_.load()->logId; uint32_t largestSyncPoint = maxSyncedLogId; uint32_t targetReplicaId = replicaId_; transferSyncedEntry_ = true; for (auto& kv : viewChangeSet_) { if (largestNormalView < kv.second.lastnormalview()) { largestNormalView = kv.second.lastnormalview(); } } for (auto& kv : viewChangeSet_) { if (kv.second.lastnormalview() == largestNormalView && largestSyncPoint < kv.second.syncpoint()) { largestSyncPoint = kv.second.syncpoint(); targetReplicaId = kv.second.replicaid(); } } stateTransferIndices_.clear(); VLOG(3) << "maxSyncedLogId_=" << maxSyncedLogId << "\t" << "largestSyncPoint=" << largestSyncPoint << "\t" << "largestNormalView = " << largestNormalView << "\t" << "lastNormalView_=" << lastNormalView_; // Directly copy the synced entries if (largestNormalView == lastNormalView_) { if (maxSyncedLogId < largestSyncPoint) { stateTransferIndices_[targetReplicaId] = {maxSyncedLogId + 1, largestSyncPoint}; } // Else: no need to do state transfer, because this replica has all synced // entries } else { stateTransferIndices_[targetReplicaId] = {committedLogId_ + 1, largestSyncPoint}; } if (!stateTransferIndices_.empty()) { // Start state transfer // After this state transfer has been completed, continue to execute the // callback (MergeUnsyncedLog) stateTransferCallback_ = std::bind(&Replica::TransferUnSyncedLog, this); stateTransferTerminateTime_ = GetMicrosecondTimestamp() + stateTransferTimeout_ * 1000; stateTransferTerminateCallback_ = std::bind(&Replica::RollbackToViewChange, this); LOG(INFO) << "Start state transfer targetReplica " << targetReplicaId << "\t" << "seg=" << stateTransferIndices_[targetReplicaId].first << "\t" << 
stateTransferIndices_[targetReplicaId].second; // Start the state tranfer timer masterContext_->endPoint_->RegisterTimer(stateTransferTimer_); } else { // Directly go to the second stage: transfer unsynced log TransferUnSyncedLog(); } } void Replica::TransferUnSyncedLog() { // Get the unsynced logs from the f+1 remaining replicas // If this process cannot be completed, rollback to view change uint32_t largestNormalView = lastNormalView_; transferSyncedEntry_ = false; for (auto& kv : viewChangeSet_) { if (largestNormalView < kv.second.lastnormalview()) { largestNormalView = kv.second.lastnormalview(); } } VLOG(3) << "TransferUnSyncedLog largestNormalView=" << largestNormalView; stateTransferIndices_.clear(); for (auto& kv : viewChangeSet_) { if (kv.second.lastnormalview() < largestNormalView) { // No need to transfer log, this guy's unsynced logs do not contribute // to committed logs continue; } if (kv.first == replicaId_) { // No need to transfer log entries from self continue; } if (kv.second.unsynclogbegin() == 0 && kv.second.unsynclogend() == 0) { // This replica has no unsynced logs continue; } // request transfer of the filteredUnSyncedRequests vec stateTransferIndices_[kv.first] = {kv.second.unsynclogbegin(), kv.second.unsynclogend()}; } if (stateTransferIndices_.empty()) { // No need to do state transfer for unsynced logs // Directly go to new view EnterNewView(); return; } // After this state transfer is completed, this replica will enter the new // view stateTransferCallback_ = std::bind(&Replica::MergeUnSyncedLog, this); stateTransferTerminateTime_ = GetMicrosecondTimestamp() + stateTransferTimeout_ * 1000; stateTransferTerminateCallback_ = std::bind(&Replica::RollbackToViewChange, this); masterContext_->endPoint_->RegisterTimer(stateTransferTimer_); } void Replica::MergeUnSyncedLog() { int f = replicaNum_ / 2; int quorum = (f % 2 == 0) ? 
(f / 2 + 1) : (f / 2 + 2); SHA_HASH dummy; for (auto& kv : requestsToMerge_) { uint64_t reqKey = kv.first.second; LogEntry* entry = kv.second.first; int count = kv.second.second; if (count >= quorum) { if (syncedLogEntryByReqKey_.get(reqKey)) { // at-most once delete entry; continue; } ProcessRequest(entry, true, false, true); syncedLogEntryByReqKey_.assign(reqKey, entry); syncedLogEntryByLogId_.assign(entry->logId, entry); } } requestsToMerge_.clear(); EnterNewView(); } void Replica::EnterNewView() { LOG(INFO) << "Enter New View " << viewId_ << " maxSyncedLog =" << maxSyncedLogEntry_.load()->logId << "\t" << GetMicrosecondTimestamp(); // Leader sends StartView to all the others if (AmLeader()) { SendStartView(-1); } // Else: followers directly start status_ = ReplicaStatus::NORMAL; lastNormalView_.store(viewId_); // Update crashVector, all synced with master CrashVectorStruct* masterCV = crashVectorInUse_[0].load(); for (uint32_t i = 1; i < crashVectorVecSize_; i++) { crashVectorInUse_[i] = masterCV; } crashVector_.assign(masterCV->version_, masterCV); // More lightweight than CreateContext ResetContext(); // Notify the blocking workers until all workers become active while (activeWorkerNum_ < totalWorkerNum_) { waitVar_.notify_all(); usleep(1000); } LOG(INFO) << "View=" << viewId_ << " Recovered worker number:" << activeWorkerNum_; } void Replica::SendStateTransferRequest() { if (GetMicrosecondTimestamp() >= stateTransferTerminateTime_) { // If statetransfer cannot be completed within a certain amount of time, // rollback to view change masterContext_->endPoint_->UnRegisterTimer(stateTransferTimer_); LOG(INFO) << "The state transfer takes too long, roll back to previous step "; stateTransferTerminateCallback_(); return; } StateTransferRequest request; request.set_view(viewId_); request.set_issynced(transferSyncedEntry_); request.set_replicaid(replicaId_); for (auto& stateTransferInfo : stateTransferIndices_) { // Do not request too many entries at one time, 
otherwise, UDP packet // cannot handle that uint32_t targetReplica = stateTransferInfo.first; uint32_t logBegin = stateTransferInfo.second.first; uint32_t logEnd = stateTransferInfo.second.second; request.set_logbegin(logBegin); if (logBegin + requestTrasnferBatch_ <= logEnd) { request.set_logend(logBegin + requestTrasnferBatch_); } else { request.set_logend(logEnd); } VLOG(3) << "I am asking stateTransferRequest from " << targetReplica << "\t" << request.logbegin() << "\t" << request.logend() << "\t" << "\tisSynced=" << request.issynced(); masterContext_->endPoint_->SendMsgTo(*(masterReceiver_[targetReplica]), request, MessageType::STATE_TRANSFER_REQUEST); } } void Replica::ProcessStateTransferRequest( const StateTransferRequest& stateTransferRequest) { VLOG(3) << "stateTransferRequest from Replica-" << stateTransferRequest.replicaid() << "\t||" << stateTransferRequest.logbegin() << "\t" << stateTransferRequest.logend() << "\tisSynced " << stateTransferRequest.issynced() << " view=" << stateTransferRequest.view(); if (stateTransferRequest.view() != viewId_) { if (stateTransferRequest.view() > viewId_) { VLOG(2) << "InitiateViewChange-5"; InitiateViewChange(stateTransferRequest.view()); } return; } StateTransferReply reply; CrashVectorStruct* cv = crashVectorInUse_[0].load(); const Address* requesterAddr = masterReceiver_[stateTransferRequest.replicaid()]; reply.set_replicaid(replicaId_); reply.set_view(viewId_); reply.mutable_cv()->Add(cv->cv_.begin(), cv->cv_.end()); reply.set_issynced(stateTransferRequest.issynced()); if (reply.issynced()) { reply.set_logbegin(stateTransferRequest.logbegin()); ASSERT(maxSyncedLogEntry_.load()->logId >= stateTransferRequest.logend()); for (uint32_t j = stateTransferRequest.logbegin(); j <= stateTransferRequest.logend(); j++) { LogEntry* entry = syncedLogEntryByLogId_.get(j); if (entry) { RequestBodyToMessage(entry->body, reply.add_reqs()); reply.set_logend(j); } else { LOG(WARNING) << "Maybe just due to lag " << 
stateTransferRequest.logend() << ">" << reply.logend(); break; } } VLOG(3) << "State Reply " << reply.logbegin() << "--" << reply.logend(); } else { reply.set_logbegin(stateTransferRequest.logbegin()); reply.set_logend(stateTransferRequest.logend()); ASSERT(filteredUnSyncedEntries_.size() > reply.logend()); for (uint32_t j = reply.logbegin(); j <= reply.logend(); j++) { LogEntry* entry = filteredUnSyncedEntries_[j]; ASSERT(entry != NULL); RequestBodyToMessage(entry->body, reply.add_reqs()); } VLOG(3) << "Give " << reply.logbegin() << "-" << reply.logend(); } if (reply.reqs_size() > 0) { masterContext_->endPoint_->SendMsgTo(*requesterAddr, reply, MessageType::STATE_TRANSFER_REPLY); } } void Replica::ProcessStateTransferReply( const StateTransferReply& stateTransferReply) { VLOG(3) << "Receive some state " << stateTransferReply.logbegin() << "--" << stateTransferReply.logend() << " view=" << stateTransferReply.view() << "--- " << transferSyncedEntry_ << "==" << stateTransferReply.issynced(); if (status_ == ReplicaStatus::NORMAL) { // Normal replicas do not need state transfer return; } if (!CheckCV(stateTransferReply.replicaid(), stateTransferReply.cv())) { return; } else { Aggregated(stateTransferReply.cv()); } if (!(masterContext_->endPoint_->isTimerRegistered(stateTransferTimer_))) { // We are not doing state transfer, so ignore this message return; } if (stateTransferReply.view() < viewId_) { // Old view: ignore return; } else if (stateTransferReply.view() > viewId_) { masterContext_->endPoint_->UnRegisterTimer(stateTransferTimer_); if (status_ == ReplicaStatus::RECOVERING) { // This state transfer is useless, stop it and restart recovery request masterContext_->endPoint_->RegisterTimer(recoveryRequestTimer_); } else if (status_ == ReplicaStatus::VIEWCHANGE) { VLOG(2) << "InitiateViewChange-6"; InitiateViewChange(stateTransferReply.view()); } else { LOG(ERROR) << "Unknown replica status " << (uint32_t)status_; } return; } // Else: Same view if 
(transferSyncedEntry_ != stateTransferReply.issynced()) { return; } const auto& iter = stateTransferIndices_.find(stateTransferReply.replicaid()); if (iter == stateTransferIndices_.end() || stateTransferReply.logend() < iter->second.first) { // We do not need these log entries return; } // So long as the state transfer is making progress, we should give it more // time instead of early termination // Only if the state transfer has not made progress within // stateTransferTimeout_. then we terminate it and rollback to some previous // function stateTransferTerminateTime_ = GetMicrosecondTimestamp() + +stateTransferTimeout_ * 1000; SHA_HASH dummy; if (stateTransferReply.issynced()) { // This is the state-transfer for synced requests for (uint32_t i = iter->second.first; i <= stateTransferReply.logend(); i++) { const RequestBodyMsg& rbMsg = stateTransferReply.reqs(i - iter->second.first); LogEntry* entry = new LogEntry( rbMsg.deadline(), rbMsg.reqkey(), rbMsg.key(), rbMsg.proxyid(), rbMsg.command(), rbMsg.iswrite(), dummy, dummy); ProcessRequest(entry, true, false, false); // LOG(INFO) << "Processed " << entry->logId << "\t" // << maxSyncedLogEntry_.load()->logId; // Register if (syncedLogEntryByReqKey_.get(entry->body.reqKey) == NULL) { syncedLogEntryByReqKey_.assign(entry->body.reqKey, entry); syncedLogEntryByLogId_.assign(entry->logId, entry); if (entry->logId > CONCURRENT_MAP_START_INDEX) { ASSERT(syncedLogEntryByLogId_.get(entry->logId - 1) != NULL); ASSERT(syncedLogEntryByLogId_.get(entry->logId - 1) == entry->prev); } } } } else { // This is the state-transfer for unsynced request (log merge) for (int i = 0; i < stateTransferReply.reqs_size(); i++) { const RequestBodyMsg& rbMsg = stateTransferReply.reqs(i); std::pair key(rbMsg.deadline(), rbMsg.reqkey()); if (requestsToMerge_.find(key) != requestsToMerge_.end()) { LogEntry* entry = new LogEntry( rbMsg.deadline(), rbMsg.reqkey(), rbMsg.key(), rbMsg.proxyid(), rbMsg.command(), rbMsg.iswrite(), dummy, dummy); 
requestsToMerge_[key] = {entry, 1}; } else { requestsToMerge_[key].second++; } } } iter->second.first = stateTransferReply.logend() + 1; VLOG(2) << "Transfer Synced? " << stateTransferReply.issynced() << "\t" << " In Progress: " << iter->first << ":" << iter->second.first << "-" << iter->second.second; uint32_t remainingPercent = stateTransferIndicesRef_[stateTransferReply.replicaid()].second; if (remainingPercent > 10) { uint32_t previousGap = stateTransferIndicesRef_[stateTransferReply.replicaid()].first; uint32_t remainingGap = iter->second.second - iter->second.first; if (remainingGap * 100 / previousGap < remainingPercent) { LOG(INFO) << "State Tranfer from Replica " << stateTransferReply.replicaid() << "\t" << remainingPercent << "\% of progress (i.e., " << remainingGap << " logs) remaining\t" << "Current committedLogId_=" << committedLogId_ << "\tmaxSyncedLogId=" << maxSyncedLogEntry_.load()->logId; ; stateTransferIndicesRef_[stateTransferReply.replicaid()].second -= 10; } } if (iter->second.first > iter->second.second) { // We have completed the state transfer for this target replica stateTransferIndices_.erase(iter->first); } if (stateTransferIndices_.empty()) { // This state transfer is completed, unregister the timer masterContext_->endPoint_->UnRegisterTimer(stateTransferTimer_); stateTransferIndices_.clear(); stateTransferIndicesRef_.clear(); // If we have a callback, then call it if (stateTransferCallback_) { stateTransferCallback_(); } } } void Replica::RewindSyncedLogTo(uint32_t rewindPoint) { LOG(INFO) << "maxSyncedLogId_=" << maxSyncedLogEntry_.load()->logId << "\t" << "rewindPoint=" << rewindPoint; LogEntry* entryStart = maxSyncedLogEntry_; while (entryStart->logId > rewindPoint) { LogEntry* entryToDel = entryStart; if (entryToDel->prevNonCommutative) { entryToDel->prevNonCommutative->nextNonCommutative = NULL; } if (entryToDel->prev) { entryToDel->prev->next = NULL; } ASSERT(entryStart->prev != NULL); 
syncedLogEntryByReqKey_.erase(entryToDel->body.reqKey); syncedLogEntryByLogId_.erase(entryToDel->logId); entryStart = entryStart->prev; delete entryToDel; } entryStart->next = NULL; entryStart->nextNonCommutative = NULL; maxSyncedLogEntry_ = entryStart; trackedEntry_.assign(trackedEntry_.size(), maxSyncedLogEntry_); } void Replica::ProcessStartView(const StartView& startView) { VLOG(3) << startView.DebugString(); if (!CheckCV(startView.replicaid(), startView.cv())) { return; } else { Aggregated(startView.cv()); } if (status_ == ReplicaStatus::VIEWCHANGE) { if (startView.view() > viewId_) { VLOG(2) << "InitiateViewChange-7"; InitiateViewChange(startView.view()); } else if (startView.view() == viewId_) { if (committedLogId_ < startView.syncedlogid()) { // Start StateTransfer if (masterContext_->endPoint_->isTimerRegistered(stateTransferTimer_)) { // LOG(INFO) << "StateTransfer In Progress:" // << stateTransferIndices_[startView.replicaid()].first // << "--" // << stateTransferIndices_[startView.replicaid()].second; return; } RewindSyncedLogTo(committedLogId_); stateTransferIndices_.clear(); stateTransferIndicesRef_[startView.replicaid()] = {committedLogId_ + 1, 100}; stateTransferIndices_[startView.replicaid()] = { committedLogId_ + 1, startView.syncedlogid()}; stateTransferIndicesRef_[startView.replicaid()] = {committedLogId_ + 1, 100}; stateTransferCallback_ = std::bind(&Replica::EnterNewView, this); stateTransferTerminateTime_ = GetMicrosecondTimestamp() + stateTransferTimeout_ * 1000; stateTransferTerminateCallback_ = std::bind(&Replica::RollbackToViewChange, this); transferSyncedEntry_ = true; masterContext_->endPoint_->RegisterTimer(stateTransferTimer_); } else { RewindSyncedLogTo(committedLogId_); EnterNewView(); } } // else: startView.view() viewId_) { VLOG(2) << "InitiateViewChange-8"; InitiateViewChange(startView.view()); } else if (startView.view() < viewId_) { // My view is fresher SendStartView(startView.replicaid()); } // Else: We are in the same view 
and this replica is normal, no need // startView } // If status == RECOVERING, it does not participate in view change } void Replica::BroadcastCrashVectorRequest() { CrashVectorRequest request; boost::uuids::random_generator generator; boost::uuids::uuid uuid = generator(); nonce_ = boost::uuids::to_string(uuid); request.set_nonce(nonce_); request.set_replicaid(replicaId_); crashVectorReplySet_.clear(); for (uint32_t i = 0; i < replicaNum_; i++) { if (i == replicaId_) { continue; } LOG(INFO) << "Ask CrashVector to Replica " << i; masterContext_->endPoint_->SendMsgTo(*(masterReceiver_[i]), request, MessageType::CRASH_VECTOR_REQUEST); } } void Replica::BroadcastRecoveryRequest() { RecoveryRequest request; CrashVectorStruct* cv = crashVectorInUse_[0].load(); request.mutable_cv()->Add(cv->cv_.begin(), cv->cv_.end()); request.set_replicaid(replicaId_); for (uint32_t i = 0; i < replicaNum_; i++) { if (i == replicaId_) { continue; } masterContext_->endPoint_->SendMsgTo(*(masterReceiver_[i]), request, MessageType::RECOVERY_REQUEST); } } void Replica::ProcessCrashVectorRequest(const CrashVectorRequest& request) { if (status_ != ReplicaStatus::NORMAL) { return; } CrashVectorReply reply; reply.set_nonce(request.nonce()); reply.set_replicaid(replicaId_); CrashVectorStruct* cv = crashVectorInUse_[0].load(); reply.mutable_cv()->Add(cv->cv_.begin(), cv->cv_.end()); masterContext_->endPoint_->SendMsgTo(*(masterReceiver_[request.replicaid()]), reply, MessageType::CRASH_VECTOR_REPLY); } void Replica::ProcessCrashVectorReply(const CrashVectorReply& reply) { if (status_ != ReplicaStatus::RECOVERING) { LOG(INFO) << "nolong Recovering " << status_; return; } if (nonce_ != reply.nonce()) { LOG(INFO) << "nonce inconistent " << crashVectorReplySet_.size(); return; } if (masterContext_->endPoint_->isTimerRegistered(crashVectorRequestTimer_) == false) { // We no longer request crash vectors LOG(INFO) << "no longer register crashVectorRequest " << crashVectorReplySet_.size(); return; } 
crashVectorReplySet_[reply.replicaid()] = reply; if (crashVectorReplySet_.size() >= replicaNum_ / 2 + 1) { // Got enough quorum CrashVectorStruct* oldCV = crashVectorInUse_[0].load(); CrashVectorStruct* newCV = new CrashVectorStruct(*oldCV); newCV->version_++; for (const auto& kv : crashVectorReplySet_) { for (uint32_t i = 0; i < replicaNum_; i++) { if (kv.second.cv(i) > newCV->cv_[i]) { newCV->cv_[i] = kv.second.cv(i); } } } // Increment self counter newCV->cv_[replicaId_]++; crashVector_.assign(newCV->version_, newCV); for (uint32_t i = 0; i < crashVectorVecSize_; i++) { crashVectorInUse_[i] = newCV; } masterContext_->endPoint_->UnRegisterTimer(crashVectorRequestTimer_); crashVectorReplySet_.clear(); // Start Recovery Request masterContext_->endPoint_->RegisterTimer(recoveryRequestTimer_); } } void Replica::ProcessRecoveryRequest(const RecoveryRequest& request) { if (status_ != ReplicaStatus::NORMAL) { return; } if (!CheckCV(request.replicaid(), request.cv())) { return; } else { Aggregated(request.cv()); } RecoveryReply reply; CrashVectorStruct* cv = crashVectorInUse_[0].load(); reply.set_replicaid(replicaId_); reply.set_view(viewId_); reply.mutable_cv()->Add(cv->cv_.begin(), cv->cv_.end()); reply.set_syncedlogid(maxSyncedLogEntry_.load()->logId); masterContext_->endPoint_->SendMsgTo(*(masterReceiver_[request.replicaid()]), reply, MessageType::RECOVERY_REPLY); } void Replica::ProcessRecoveryReply(const RecoveryReply& reply) { if (!CheckCV(reply.replicaid(), reply.cv())) { return; } else { if (Aggregated(reply.cv())) { // If cv is updated, then it is likely that some messages in // recoveryReplySet_ become stray, so remove them for (uint32_t i = 0; i < replicaNum_; i++) { auto iter = recoveryReplySet_.find(i); if (iter != recoveryReplySet_.end() && (!CheckCV(i, iter->second.cv()))) { recoveryReplySet_.erase(i); } } } } if (masterContext_->endPoint_->isTimerRegistered(recoveryRequestTimer_) == false) { // We no longer request recovery reply return; } 
recoveryReplySet_[reply.replicaid()] = reply; if (recoveryReplySet_.size() >= replicaNum_ / 2 + 1) { // Got enough quorum masterContext_->endPoint_->UnRegisterTimer(recoveryRequestTimer_); uint32_t maxView = 0; uint32_t syncedLogId = 0; for (const auto& kv : recoveryReplySet_) { if (kv.second.view() > maxView) { maxView = kv.second.view(); syncedLogId = kv.second.syncedlogid(); } } // Get the maxView, launch state transfer with the corresponding leader viewId_ = maxView; recoveryReplySet_.clear(); LOG(INFO) << "Replica intends to enter View " << viewId_ << " after recovery; the number of logs to recover is:" << syncedLogId; if (AmLeader()) { LOG(INFO) << "The recovered replica will become the leader in this view, " "skip it!"; // If the recoverying replica happens to be the leader of the new view, // don't participate. Wait until the healthy replicas elect a new leader usleep(1000); // sleep some time and restart the recovery process masterContext_->endPoint_->RegisterTimer(recoveryRequestTimer_); } else { // Launch state transfer for synced log entries stateTransferIndices_.clear(); if (syncedLogId >= CONCURRENT_MAP_START_INDEX) { // There are some synced log entries that should be transferred transferSyncedEntry_ = true; stateTransferIndices_[maxView % replicaNum_] = { CONCURRENT_MAP_START_INDEX, syncedLogId}; stateTransferIndicesRef_[maxView % replicaNum_] = { syncedLogId - CONCURRENT_MAP_START_INDEX + 1, 100}; LOG(INFO) << "Recover Logs from " << CONCURRENT_MAP_START_INDEX << "\t to\t" << syncedLogId; stateTransferCallback_ = std::bind(&Replica::EnterNewView, this); stateTransferTerminateTime_ = GetMicrosecondTimestamp() + stateTransferTimeout_ * 1000; stateTransferTerminateCallback_ = std::bind(&Replica::RollbackToRecovery, this); masterContext_->endPoint_->RegisterTimer(stateTransferTimer_); } else { // No log entries to recover, directly enter new view EnterNewView(); } } } } void Replica::ProcessSyncStatusReport(const SyncStatusReport& report) { if 
(!CheckCV(report.replicaid(), report.cv())) { // Stray message return; } else { if (Aggregated(report.cv())) { // Possibly make existing msg become stray for (uint32_t i = 0; i < replicaId_; i++) { auto iter = syncStatusSet_.find(i); if (iter != syncStatusSet_.end() && (!CheckCV(i, iter->second.cv()))) { syncStatusSet_.erase(i); } } } } if (!CheckView(report.view())) { return; } auto iter = syncStatusSet_.find(report.replicaid()); if (iter == syncStatusSet_.end() || iter->second.syncedlogid() < report.syncedlogid()) { syncStatusSet_[report.replicaid()] = report; } // LOG(INFO) << "sync size=" << syncStatusSet_.size(); if (syncStatusSet_.size() >= replicaNum_ / 2 + 1) { uint32_t minLogId = UINT32_MAX; for (const auto& kv : syncStatusSet_) { if (minLogId > kv.second.syncedlogid()) { minLogId = kv.second.syncedlogid(); } } // LOG(INFO) << "minLogId=" << minLogId << "\t" << committedLogId_; if (minLogId >= committedLogId_) { committedLogId_ = minLogId; // LOG(INFO) << "syncStauts " << report.DebugString(); SendCommit(); } } } void Replica::ProcessCommitInstruction(const CommitInstruction& commit) { if (!CheckCV(commit.replicaid(), commit.cv())) { return; } else { Aggregated(commit.cv()); } if (!CheckView(commit.view())) { return; } lastHeartBeatTime_ = GetMicrosecondTimestamp(); // LOG(INFO) << "commit " << commit.DebugString(); // Buggy: should compare with syncedLogId, to see whether log is missing if (commit.committedlogid() > committedLogId_) { // Don't assign committedLogId_ directly, because this replica may have // not get enough synced logs toCommitLogId_ = commit.committedlogid(); // LOG(INFO) << "committedLogId_=" << committedLogId_; } uint32_t nextCommitId = maxSyncedLogEntry_.load()->logId; if (toCommitLogId_ < nextCommitId) { nextCommitId = toCommitLogId_; } while (committedLogId_ < nextCommitId) { if (committedLogId_ < CONCURRENT_MAP_START_INDEX) { committedLogId_++; continue; } uint32_t preFetchTrackedLogId = trackedEntry_[0]->logId; LogEntry* entry = 
syncedLogEntryByLogId_.get(committedLogId_); if (entry == NULL) { if (committedLogId_ <= preFetchTrackedLogId) { LOG(INFO) << "committedLogId_=" << committedLogId_ << "\t" << "maxSyncedLogId_=" << maxSyncedLogEntry_.load()->logId << "\ttrackedLogId =" << preFetchTrackedLogId; for (uint32_t i = CONCURRENT_MAP_START_INDEX; i <= trackedEntry_[0]->logId; i++) { if (syncedLogEntryByLogId_.get(i) == NULL) { LOG(INFO) << "log " << i << " not recorded"; } } LOG(ERROR) << "abnormal exit"; exit(0); } if (viewId_ == 1) { LOG(INFO) << "committedLogId_=" << committedLogId_ << "\t" << "maxSyncedLogId_=" << maxSyncedLogEntry_.load()->logId << "\t" << "\ttrackedLogId =" << trackedEntry_[0]->logId; } break; } ASSERT(entry != NULL); entry->result = ApplicationExecute(entry->body); committedLogId_++; // if (committedLogId_ % 1000 == 0) { // LOG(INFO) << "committedLogId_=" << committedLogId_ << "\t" // << "maxSyncedLogId_=" << maxSyncedLogEntry_.load()->logId; // } } } bool Replica::CheckView(const uint32_t view, const bool isMaster) { if (view < viewId_) { // old message return false; } if (view > viewId_) { if (isMaster) { if (status_ != ReplicaStatus::RECOVERING) { // Recovering replicas do not participate in view change VLOG(2) << "InitiateViewChange-9: " << view << "\t currentView=" << viewId_ << "\t" << "td=" << pthread_self(); InitiateViewChange(view); } } else { // new view, update status and wait for master thread to handle the // situation status_ = ReplicaStatus::VIEWCHANGE; } return false; } return true; } bool Replica::CheckCV(const uint32_t senderId, const google::protobuf::RepeatedField& cv) { CrashVectorStruct* masterCV = crashVectorInUse_[0].load(); return (cv.at(senderId) >= masterCV->cv_[senderId]); } bool Replica::Aggregated(const google::protobuf::RepeatedField& cv) { CrashVectorStruct* masterCV = crashVectorInUse_[0].load(); std::vector maxCV(masterCV->cv_); bool needAggregate = false; for (uint32_t i = 0; i < replicaNum_; i++) { if (maxCV[i] < cv.at(i)) { // The 
incoming cv has fresher elements needAggregate = true; maxCV[i] = cv.at(i); } } if (needAggregate) { CrashVectorStruct* newCV = new CrashVectorStruct(maxCV, masterCV->version_ + 1); crashVector_.assign(newCV->version_, newCV); crashVectorInUse_[0] = newCV; if (status_ == ReplicaStatus::NORMAL) { // Wait until the reply threads has known the new cv while (true) { bool ready = true; for (uint32_t i = 1; i <= fastReplyQu_.size(); i++) { if (crashVectorInUse_[i].load()->version_ < newCV->version_) { ready = false; } } if (ready) { break; } else { usleep(1000); } } } // Else (status_=ViewChange), then there is only master thread alive, // no need to wait for reply thread } return needAggregate; } void Replica::RollbackToViewChange() { LOG(INFO) << "Rollback to restart view change"; status_ = ReplicaStatus::VIEWCHANGE; viewChangeSet_.clear(); if (false == masterContext_->endPoint_->isTimerRegistered(viewChangeTimer_)) { masterContext_->endPoint_->RegisterTimer(viewChangeTimer_); } } void Replica::RollbackToRecovery() { LOG(INFO) << "Rollback to restart recovery"; status_ = ReplicaStatus::RECOVERING; recoveryReplySet_.clear(); // Since we start a new round of recovery, the logs obtained from the // previous round (if any) will not count. 
Delete them (=clean state) and // restart LogEntry* entryStart = syncedLogEntryHead_->next; while (entryStart) { LogEntry* entryToDel = entryStart; entryStart = entryStart->next; delete entryToDel; } maxSyncedLogEntry_ = syncedLogEntryHead_; maxSyncedLogEntryByKey_.assign(keyNum_, NULL); if (false == masterContext_->endPoint_->isTimerRegistered(recoveryRequestTimer_)) { masterContext_->endPoint_->RegisterTimer(recoveryRequestTimer_); } } std::string Replica::ApplicationExecute(const RequestBody& request) { return ""; } bool Replica::AmLeader() { return (viewId_ % replicaNum_ == replicaId_); } } // namespace nezha ================================================ FILE: replica/replica.h ================================================ #ifndef NEZHA_REPLICA_H #define NEZHA_REPLICA_H #include #include #include #include #include #include #include "lib/utils.h" #include "proto/nezha_proto.pb.h" #include "replica_config.h" namespace nezha { using namespace nezha::proto; /** Receiver is more complex than sender. A sender only needs an endpoint. * But A Receiver needs an endpoint (endPoint_) to receive messages, and the * message should be handled bu an already-registered handler (msgHandlerFunc_). * Besides, in order to unblock the endpoint during view change, there is also a * timer (monitorTimer_) needed, to keep monitor the status of the replica. 
* * We package all the necessary components into ReceiverContext for brievity */ struct ReceiverContext { Endpoint* endPoint_; void* context_; MessageHandlerFunc msgHandlerFunc_; Timer* monitorTimer_; ReceiverContext(Endpoint* ep = NULL, void* ctx = NULL, MessageHandlerFunc msgFunc = nullptr, Timer* t = NULL) : endPoint_(ep), context_(ctx), msgHandlerFunc_(msgFunc), monitorTimer_(t) {} void Register(int endpointType = EndpointType::UDP_ENDPOINT) { if (endpointType == EndpointType::UDP_ENDPOINT) { // UDP Endpoint UDPMsgHandler* udpMsgHandler = new UDPMsgHandler(msgHandlerFunc_, context_); ((UDPSocketEndpoint*)endPoint_)->RegisterMsgHandler(udpMsgHandler); ((UDPSocketEndpoint*)endPoint_)->RegisterTimer(monitorTimer_); } else { // To support other types of endpoints later LOG(ERROR) << "unknown endpoint type " << (int)endpointType; } } }; /** * Refer to replica_run.cc, the runnable program only needs to instantiate a * Replica object with a configuration file. Then it calls Run() method to run * and calls Terminate() method to stop */ class Replica { private: /** All the configuration parameters for the replica are included in * replicaConfig_*/ ReplicaConfig replicaConfig_; /** 1 for UDP, 2 for GRPC (not supported yet) */ int endPointType_; /** viewId_ starts from 0 */ std::atomic viewId_; std::atomic lastNormalView_; /** replicaId_ starts from 0 */ std::atomic replicaId_; std::atomic replicaNum_; /** Worker threads check status_ to decide whether they should be blocked (for * view change) */ std::atomic status_; /** Every unique request, sharded across several maps for concurrency. * Before a request is processed, it is addded to one of these maps by * recordThread. Map from reqKey -> logEntry */ std::vector> recordMap_; /** TrackThread traverses the synced log list and record in * syncedLogEntryByReqKey_ and syncedLogEntryByLogId_ */ std::vector trackedEntry_; /** earlyBuffer_ uses the pair as key. 
std::map will sort * them in ascending order by default */ std::map, LogEntry*> earlyBuffer_; /** lastReleasedEntryByKeys_ is used to support communativity, we record the * last relased entry for each key. When new requests come, it compares with * the last released entry in the same key */ std::vector> lastReleasedEntryByKeys_; /** keyNum_ indicates the number of keys that requests will work on (to * support commutativity optimization). We assume one request will only work * on one key */ uint32_t keyNum_; /** * Log entries are organized as a list. * On the leader, it only needs to maintain one list, i.e., synced log list; * But on the follower, it maintains two lists, i.e., unsynced log list and * synced log list. * * syncedLogEntryHead_/unSyncedLogEntryHead_ are the starting point of the two * lists, which we crearte a dummy node for each list to serve as the head * * maxSyncedLogEntry_ and maxUnSyncedLogEntry_ are the tails of the two lists * respectively */ LogEntry* syncedLogEntryHead_; LogEntry* unSyncedLogEntryHead_; std::atomic maxSyncedLogEntry_; std::atomic maxUnSyncedLogEntry_; /** * minUnSyncedLogEntry_ is initialized as unSyncedLogEntryHead_, but our * garbage-collection thread can advance it (TODO). In this way, it can * accelerate the generation of filteredUnSyncedEntries_ */ LogEntry* minUnSyncedLogEntry_; /** * These three vecs can be cosnidered as finer-grained version of * maxSyncedLogEntry_,maxUnSyncedLogEntry_ and minUnSyncedLogEntry_. * They are mainly used to support commutativity optimization. * * maxSyncedLogEntryByKey_ and minUnSyncedLogEntryByKey_ combine to work as * the sync-point, as illustrated in Figure 5 of our paper. 
*/ std::vector maxSyncedLogEntryByKey_; std::vector maxUnSyncedLogEntryByKey_; std::vector minUnSyncedLogEntryByKey_; /** Index Map, facilate for entry look-up */ ConcurrentMap syncedLogEntryByReqKey_; ConcurrentMap syncedLogEntryByLogId_; /** Each thread is given a unique name (key) and stored in the pool */ std::map threadPool_; /** committedLogId_ and toCommitLogId_ are used for peridical synchronization * (to accelerate failure recovery) */ std::atomic committedLogId_; std::atomic toCommitLogId_; /** Context (including a message handler and a monitor timer) */ ReceiverContext* masterContext_; std::vector requestContext_; ReceiverContext* indexSyncContext_; ReceiverContext* missedIndexAckContext_; ReceiverContext* missedReqAckContext_; /** Timers * * Since message can be dropped after it is sent. For those messages which are * required to be eventually delivered, we register a timer to the endpoint, * which keeps sending the message, until the sender knows it is * delivered and unregister the timer */ Timer* heartbeatCheckTimer_; Timer* indexAskTimer_; Timer* requestAskTimer_; Timer* viewChangeTimer_; Timer* stateTransferTimer_; Timer* periodicSyncTimer_; Timer* crashVectorRequestTimer_; Timer* recoveryRequestTimer_; /** Endpoints * * These endpoints are only used as senders, so they do not need the complex * context struct as receivers */ std::vector indexSender_; // send indices (Sec 5.4) std::vector fastReplySender_; std::vector slowReplySender_; Endpoint* indexRequester_; /** In the slow path, when indices are missing, Follower uses this endpoint to send requests asking for the missing indices */ Endpoint* reqRequester_; /** Follower uses this endpoint to send requests asking for the missed requests */ Endpoint* indexAcker_; /** Leader uses this endpoint to reply the indices to the requested followers */ /** Addresses */ std::vector indexReceiver_; /** Leader will send indices to these addresses (each follower has such an address to receive index) */ 
std::vector indexAskReceiver_; /** Follower sends ask-requests to these addresses when it is missing some indices */ std::vector requestAskReceiver_; /** Followers send ask-requests to these addresses when it is missing some requests */ std::vector masterReceiver_; /** Each replica maintains a master thread, which sends/receives/processes different types of control messages, therefore, each replica matains such an address vector (size of replicaNum) to know the address of others' master thread */ /* Round robin indices are used to achieve load balance among threads of the * same functionality (e.g., multiple reply threads) */ uint32_t roundRobinProcessIdx_; uint32_t roundRobinIndexAskIdx_; uint32_t roundRobinRequestAskIdx_; /** Version-based CrashVector (version number as the key), to facilitate * garbage-collection */ ConcurrentMap crashVector_; /** Each related thread (i.e. fast reply threads + index recv thread + index * ack thread) will hold an atomic pointer, pointing to the crash vector they * are currently using. * * The garbage collect thread will check crashVectorInUse_ to decide which * CrashVectorStruct can be safely reclaimed. 
* */ std::atomic* crashVectorInUse_; /** The number of threads using crash vectors (i.e., the length of * crashVectorInUse_) */ uint32_t crashVectorVecSize_; /** The sync messages (for index sync process) which have not been processed */ std::map, IndexSync> pendingIndexSync_; /** Each key in missedReqKeys_ indicating a request is missing on this replica */ std::set missedReqKeys_; /** Each pair indicates a segment of indices is missing during index sync * process */ std::pair missedIndices_; /** The max number of indices/reqKeys/requests that can be carried in one * stateTransfer message */ uint32_t indexTransferBatch_; uint32_t requestKeyTransferBatch_; uint32_t requestTrasnferBatch_; /* State-Transfer related variables **/ uint64_t stateTransferTimeout_; bool transferSyncedEntry_; /** key: the target replica to ask for requests; value: the segment of requests that will be transferred */ std::map> stateTransferIndices_; std::map> stateTransferIndicesRef_; // Only serves as the references std::function stateTransferCallback_; /** The max amount of time that the state transfer can last */ std::uint64_t stateTransferTerminateTime_; /** If the state transfer cannot be completed within * stateTransferTerminateTime_, execute the following callback and terminate * the state transfer */ std::function stateTransferTerminateCallback_; /** Before transfer unsynced logs, the replica needs to first filter all the * unsynced logs, because most of them overlap with synced logs, which has * already been transferred, so we only need to transfer a small portion of * unsynced logs after filtering out those overlapped ones */ std::vector filteredUnSyncedEntries_; /** During leader election, the new leader use requestsToMerge_ to merge logs * collected from the quorum of replicas. * * Key: ; Value: */ std::map, std::pair> requestsToMerge_; // Recovery related variables std::string nonce_; /** Key: replicaId. 
These structuers are used to check whether a quorum has * been formed */ std::map crashVectorReplySet_; std::map recoveryReplySet_; std::map viewChangeSet_; std::map syncStatusSet_; /** Inserted by ReceiveThread, and looked up by * FastReplyThread/SlowReplyThread */ ConcurrentMap proxyAddressMap_; /** Followers periodically check lastHeartBeatTime_ to decide whether it * should issue view change * * lastHeartBeatTime_ is updated every time the follower receives a heartbeat * message (i.e. IndexSync and CommitInstruction) * */ std::atomic lastHeartBeatTime_; /** Tentative-- TODO: Add more explanation */ uint64_t lastAskMissedIndexTime_; uint64_t lastAskMissedRequestTime_; std::unordered_map askTimebyReqKey_; std::vector fetchTime_; /** Replicas use it to check whether every worker thread has stopped */ std::atomic activeWorkerNum_; /** The total number of worker threads. When terminating, replicas use this * variable to detect whether every thread has been terminated and exited */ uint32_t totalWorkerNum_; /** To implement blocking mechanism, see BlockWhenStatusIsNot function */ std::condition_variable waitVar_; std::mutex waitMutext_; ConcurrentQueue tagQu_; // For Debug, will be deleted /** To communicate between ReceiveThread and ProcessThread */ ConcurrentQueue processQu_; /** To communicate between ReceiveThread and RecordThread */ std::vector> recordQu_; /** To communicate between IndexRecvThread and IndexProcessThread */ ConcurrentQueue> indexQu_; /** To communinicate between ProcessThread and FastReplyThread */ std::vector> fastReplyQu_; /** To communinicate between ProcessThread and SlowReplyThread */ std::vector> slowReplyQu_; /** To communicate between ReceiveThread and OWDCalcThread (Transmit ) */ ConcurrentQueue> owdQu_; /** Record the one-way delay for each proxy. 
Updated by OWDCalcThread, read by * FastReplyThread/SlowReplyThread */ ConcurrentMap owdMap_; /** To window size used to estimate one-way delay */ uint32_t slidingWindowLen_; double movingPercentile_; std::map> slidingWindow_; // std::map owdSampleNum_; /** Garbage-Collection related variables */ uint32_t reclaimTimeout_; /** The old versions of crash vectors in crashVector_ that can be reclaimed */ uint32_t cvVersionToClear_; /** GarbageCollectThread use prepareToClearLateBufferLogId_ to tell * IndexSyncThread that it intends to clear the requests before this point * [included] */ std::atomic prepareToClearLateBufferLogId_; /** GarbageCollectThread use prepareToClearLateBufferLogId_ to tell * IndexSyncThread and FastReplyThread that it intends to clear the log * entries before this point [included] */ std::atomic prepareToClearUnSyncedLogId_; /** IndexSyncThread use safeToClearLateBufferLogId_ to tell * GarbageCollectThread that it can safely clear the requests in late buffer * up to this point [included] */ std::atomic safeToClearLateBufferLogId_; /** FastReplyThread(s) and IndexSyncThread use these atomic variables to tell * GarbageCollectThread, telling that it can safely clear unsynced log entries * up to this point [included] */ std::atomic* safeToClearUnSyncedLogId_; /** Create/Initialize all the necessary variables, it is only called once * during the lifetime of the replica */ void CreateContext(); /** Launch all the threads, only called once during the lifetime of the * replica */ void LaunchThreads(); /** After a view change or recovery is completed, the replica enters a new * view*/ void EnterNewView(); /** View Change (recovery) related */ /** Reset the necessary variables. It is called every time when we initiate a * view change, and this function is much more lightweight than CreateContext */ void ResetContext(); void InitiateViewChange(const uint32_t view); /** Send ViewChangeRequest to every replica and send ViewChange to the * leader. 
Used to instantiate viewChangeTimer_*/ void BroadcastViewChange(); /** Send ViewChangeRequest to a specific replica */ void SendViewChangeRequest(const int toReplicaId); /** Send ViewChange to the leader(i.e., whose replicaId = view % replicaNum) */ void SendViewChange(); /** A crashed replica needs to first call InitiateRecovery in order to join * the system */ void InitiateRecovery(); /** The RECOVERING replica asks every healthy replica for crash vector */ void BroadcastCrashVectorRequest(); /** The RECOVERING replica asks every healthy replica for necessary recovery * information (e.g., the current view, the synced logs on that replica) */ void BroadcastRecoveryRequest(); /** The new leader, after fully recovery, send StartView to others */ void SendStartView(const int toReplicaId); /** Replicas use state transfer to retrieve (large number of) log entries from * others. Used to instantiate stateTransferTimer_ */ void SendStateTransferRequest(); /** If the view change process takes too long and cannot be completed (this * can happen when the leader in the new view also fails), the replica will * terminate the current view change process and starts a new view change with * higher viewId */ void RollbackToViewChange(); /** If the recovery process takes too long and cannot be completed (this can * happen when the RECOVERING replica happens to be the leader in the new * view), this replica will terminate the in-progress recovery and starts a * new round of recovery, after the healthy replicas have elected a new leader among themseleves */ void RollbackToRecovery(); /** During view change, replicas may have some uncommitted requests, which * will not show in the new view, so replicas will rewind log list and * eliminate those uncommitted onces, and appended with the committed entries * from the leader */ void RewindSyncedLogTo(uint32_t rewindPoint); /** Periodic Sync related */ /** Followers periodically report their sync-point to the leader, so the * leader can 
decide the commit-point. * Used to instantiate periodicSyncTimer_ */ void SendSyncStatusReport(); /** Leader send commit-point to followers, so followers can safely execute the log entries up to commit-point. This is very useful to accelerate view change after the leader fails (details in para. ``Acceleration of Recovery'' of Sec 6 of our paper) */ void SendCommit(); /** Garbage-Collect related */ /** If the logs (on the followers) have not been added into synced log list * and has been stayed on the replica for too long, then the garbage-collect * (gc) thread will reclaim it and free its memory */ void ReclaimStaleLogs(); void PrepareNextReclaim(); /** If the crashVectorStruct is no longer used by any thread on this replica, * the gc-thread collects it */ void ReclaimStaleCrashVector(); /** Message handler */ bool ProcessIndexSync(const IndexSync& idxSyncMsg); void ProcessViewChangeReq(const ViewChangeRequest& viewChangeReq); void ProcessViewChange(const ViewChange& viewChange); void ProcessStateTransferRequest( const StateTransferRequest& stateTransferReq); void ProcessStateTransferReply(const StateTransferReply& stateTransferRep); void ProcessStartView(const StartView& startView); void ProcessCrashVectorRequest(const CrashVectorRequest& request); void ProcessCrashVectorReply(const CrashVectorReply& reply); void ProcessRecoveryRequest(const RecoveryRequest& request); void ProcessRecoveryReply(const RecoveryReply& reply); void ProcessSyncStatusReport(const SyncStatusReport& report); void ProcessCommitInstruction(const CommitInstruction& commit); void ProcessRequest(LogEntry* rb, const bool isSyncedReq = true, const bool sendReply = true, const bool canExecute = false); /** The interfaces to bridge specific applications with Nezha */ std::string ApplicationExecute(const RequestBody& request); /** Tools */ /** Check whether this replica is leader, return true if it is */ bool AmLeader(); /** During view change, BlockWhenStatusIsNot uses the conditional variable * 
(waitVar_) to block the worker threads. Finally only the master thread is * alive, so that it can run the related procedure without risks of data race */ void BlockWhenStatusIsNot(char targetStatus); /** * CheckView returns true if the message's view (Parameter-1) is consistent * with the replica's current view * * Master thread (isMaster) can initiate view change, non-master threads only * switch status to ViewChange */ bool CheckView(const uint32_t view, const bool isMaster = true); /** CheckCV checks the crashVector to decide whether the incoming message is * stray message. It returns true if the cv is valid (i.e., the message is not * stray message) */ bool CheckCV(const uint32_t senderId, const google::protobuf::RepeatedField& cv); /** Check whether the incoming message's crash vector (the passed-in cv) will * lead to the update of replica's crashVector (i.e., crashVector_[0]). * If it needs aggregation, this function will aggreate it and return true */ bool Aggregated(const google::protobuf::RepeatedField& cv); /** * During state transfer, the log transfer are divided into two parts, synced * log transfer and unsynced log transfer. Which are undertaken by the * following two functions */ void TransferSyncedLog(); void TransferUnSyncedLog(); /** * After enough unsynced logs have been collected by the leader, the leader * merges the unsynced logs to deice which logs can be includec in the * newly-built log list (details in Sec 6 and Appendix A.3 of our paper) */ void MergeUnSyncedLog(); /** Convert our self-defined message to proto message */ void RequestBodyToMessage(const RequestBody& rb, RequestBodyMsg* rbMsg); /** Threads * * Functions whose names are ended with ``Thread`` will be instianted with a * thread. Some functions are heavy and needed to be parallelized, so the * parallized threads with the same functionality are distinguished with the * first parameter, id. 
* * Some functions will also use crash vector, to distinguish the crash vectors * used by them, the functions also accept the second parameter, cvId. */ void ReceiveThread(int id = -1); void ProcessThread(int id = -1); void RecordThread(int id = -1); void TrackThread(int id = -1); void FastReplyThread(int id = -1, int cvId = -1); void SlowReplyThread(int id = -1); void IndexSendThread(int id = -1, int cvId = -1); void IndexRecvThread(); void IndexProcessThread(); void MissedIndexAckThread(); void MissedReqAckThread(); void OWDCalcThread(); void GarbageCollectThread(); /** Message handler functions * These message handler functions will be used to instantiate MessageHandlers * and attached to their related endpoints. */ void ReceiveClientRequest(MessageHeader* msgHdr, char* msgBuffer, Address* sender); void ReceiveIndexSyncMessage(MessageHeader* msgHdr, char* msgBuffer); void ReceiveAskMissedReq(MessageHeader* msgHdr, char* msgBuffer); void ReceiveAskMissedIdx(MessageHeader* msgHdr, char* msgBuffer); void ReceiveMasterMessage(MessageHeader* msgHdr, char* msgBuffer); /** Used to instantiate indexAskTimer_ */ void AskMissedIndex(); /** Used to instantiate requestAskTimer_*/ void AskMissedRequest(); /** Used to instantiate heartbeatCheckTimer_ */ void CheckHeartBeat(); public: /** Replica accepts a config file, which contains all the necessary * information to instantiate the object, then it can call Run method * * Specifically, if this replica has crashed before, it will recieve * isRecovering=true, then it first completes the recovery procedure before it * can join the system * */ Replica( const std::string& configFile = "../configs/nezha-replica-config.yaml", bool isRecovering = false); ~Replica(); void Run(); void Terminate(); /** Tentative */ std::atomic* repliedSyncPoint_; uint32_t maxProxyNum_ = 16; }; } // namespace nezha #endif ================================================ FILE: replica/replica_config.h ================================================ 
// NOTE(review): the five header names below, and the template arguments used
// in this file (on std::vector and on every .as() call), are missing from this
// copy -- it looks like angle-bracketed text was stripped during extraction.
// Restore them from the original file before building.
#include
#include
#include
#include
#include

/**
 * Plain configuration record for one replica process.
 *
 * Every field except processShards is assigned by parseConfig() from the YAML
 * key whose name is the kebab-case form of the field name (for example
 * receiverShards <- "receiver-shards"). Fields other than processShards have
 * no default value, so they are only meaningful after parseConfig() returned
 * "".
 */
struct ReplicaConfig {
  uint32_t endpointType;
  std::vector replicaIps;
  int replicaId;
  int receiverShards;
  int recordShards;
  int replyShards;
  int trackShards;
  int receiverPort;
  int indexSyncPort;
  int requestAskPort;
  int indexAskPort;
  int masterPort;
  int monitorPeriodMs;
  int heartbeatThresholdMs;
  int indexAskPeriodMs;
  int viewChangePeriodMs;
  int stateTransferPeriodMs;
  int stateTransferTimeoutMs;
  int indexTransferBatch;
  int requestKeyTransferBatch;
  int requestTransferBatch;
  int requestAskPeriodMs;
  int crashVectorRequestPeriodMs;
  int recoveryRequestPeriodMs;
  int syncReportPeriodMs;
  int indexSyncPeriodUs;
  double movingPercentile;
  int keyNum;
  uint32_t owdEstimationWindow;
  uint32_t reclaimTimeoutMs;
  int indexSyncShards;
  // The number of threads to process requests. For now process-shards
  // is fixed to 1, because the early-buffer enqueue/dequeue is hard to
  // parallelize. Maybe later we can find a high-performance **concurrent
  // priority queue** for early-buffer, then process-shards may be
  // parallelized for higher performance.
  // This field is not read from the config file.
  int processShards = 1;

  // Parses yaml file configFilename and fills in fields of ReplicaConfig
  // accordingly. Returns an error message or "" if there are no errors.
  //
  // The parsed document is also written to the INFO log.
  // NOTE(review): replicaIps is appended to without being cleared first, so
  // calling parseConfig() twice on the same object would accumulate entries.
  std::string parseConfig(std::string configFilename) {
    YAML::Node config;
    try {
      config = YAML::LoadFile(configFilename);
    } catch (const YAML::BadFile& e) {
      // NOTE(review): only BadFile is handled here; any other exception type
      // raised by LoadFile (presumably e.g. a syntax error in the document)
      // would propagate to the caller instead of becoming an error string.
      return "Error loading config file:" + e.msg + ".";
    }
    LOG(INFO) << "Using config:\n " << config;

    std::string key;  // Keep track of current key for better error messages
    try {
      key = "endpoint-type";
      endpointType = config[key].as();
      // "replica-ips" is a sequence; each element becomes one entry.
      key = "replica-ips";
      for (uint32_t i = 0; i < config[key].size(); i++) {
        replicaIps.push_back(config[key][i].as());
      }
      key = "replica-id";
      replicaId = config[key].as();
      key = "receiver-shards";
      receiverShards = config[key].as();
      key = "record-shards";
      recordShards = config[key].as();
      key = "reply-shards";
      replyShards = config[key].as();
      key = "index-sync-shards";
      indexSyncShards = config[key].as();
      key = "track-shards";
      trackShards = config[key].as();
      key = "receiver-port";
      receiverPort = config[key].as();
      key = "index-sync-port";
      indexSyncPort = config[key].as();
      key = "request-ask-port";
      requestAskPort = config[key].as();
      key = "index-ask-port";
      indexAskPort = config[key].as();
      key = "master-port";
      masterPort = config[key].as();
      key = "monitor-period-ms";
      monitorPeriodMs = config[key].as();
      key = "heartbeat-threshold-ms";
      heartbeatThresholdMs = config[key].as();
      key = "index-ask-period-ms";
      indexAskPeriodMs = config[key].as();
      key = "view-change-period-ms";
      viewChangePeriodMs = config[key].as();
      key = "request-ask-period-ms";
      requestAskPeriodMs = config[key].as();
      key = "state-transfer-period-ms";
      stateTransferPeriodMs = config[key].as();
      key = "state-transfer-timeout-ms";
      stateTransferTimeoutMs = config[key].as();
      key = "index-transfer-batch";
      indexTransferBatch = config[key].as();
      key = "request-key-transfer-batch";
      requestKeyTransferBatch = config[key].as();
      key = "request-transfer-batch";
      requestTransferBatch = config[key].as();
      key = "crash-vector-request-period-ms";
      crashVectorRequestPeriodMs = config[key].as();
      key = "recovery-request-period-ms";
      recoveryRequestPeriodMs =
          config[key].as();
      key = "sync-report-period-ms";
      syncReportPeriodMs = config[key].as();
      key = "key-num";
      keyNum = config[key].as();
      key = "owd-estimation-window";
      owdEstimationWindow = config[key].as();
      key = "index-sync-period-us";
      indexSyncPeriodUs = config[key].as();
      key = "moving-percentile";
      movingPercentile = config[key].as();
      key = "reclaim-timeout-ms";
      reclaimTimeoutMs = config[key].as();
      return "";
    } catch (const YAML::BadConversion& e) {
      // `key` still names the field being read when the conversion failed.
      // Distinguish "present but not convertible" from "absent".
      if (config[key]) {
        return "Error parsing config field " + key + ": " + e.msg + ".";
      } else {
        return "Error parsing config field " + key + ": " + key + " not found.";
      }
    } catch (const std::exception& e) {
      return "Error parsing config field " + key + ": " + e.what() + ".";
    }
  }
};

================================================
FILE: replica/replica_run.cc
================================================
#include "replica/replica.h"

// NOTE(review): the default path says "nezhav2/config/" while scripts/launch.py
// writes the generated files under "nezhav2/configs" -- confirm which is meant.
DEFINE_string(config, "nezhav2/config/nezha-replica-config-0.yaml", "The config file for the replica");
DEFINE_bool(isRecovering, false, "If this flag is true, then the replica will start recovery process once it is launched");

// The single replica instance of this process; created in main() and used by
// the SIGINT handler below.
nezha::Replica* replica = NULL;

// SIGINT handler: logs a message and asks the replica to terminate.
// NOTE(review): the handler is installed before `replica` is assigned, so a
// SIGINT arriving in that window dereferences NULL. Logging from a signal
// handler is presumably not async-signal-safe either -- confirm.
void Terminate(int para) {
  LOG(INFO) << "Terminating...";
  replica->Terminate();
}

// Entry point: parse flags, set up logging to stderr, install the SIGINT
// handler, then build the replica from the flags and run it. Run() returning
// is followed by destroying the replica.
int main(int argc, char* argv[]) {
  gflags::ParseCommandLineFlags(&argc, &argv, true);
  google::InitGoogleLogging(argv[0]);
  FLAGS_logtostderr = 1;
  signal(SIGINT, Terminate);
  replica = new nezha::Replica(FLAGS_config, FLAGS_isRecovering);
  replica->Run();
  LOG(INFO) << " Run Completed";
  delete replica;
  return 0;
}

================================================
FILE: scripts/analysis.py
================================================
import pandas as pd
from IPython import embed;
import argparse
import datetime

# Directory under which the "stats" folder is looked up.
LOGIN_PATH = "/home/steam1994"
# Values compared against the CommitType column of the stats files.
FAST_REPLY = 6
SLOW_REPLY = 7
COMMIT_REPLY = 8

# Per-bin aggregator used with groupby().apply(): returns the number of rows
# in the bin as 'AvgThroughput'; returns None (implicitly) for an empty bin.
def throughput_apply_func(group):
    if len(group):
        return pd.Series({
            'AvgThroughput':len(group),
        })

# Adds a "time" column derived from 'CommitTime' to merge_df (the frame is
# modified in place). The statement below continues on the next line of the file.
def ThroughputAnalysis(merge_df):
    merge_df.loc[:, "time"] =
merge_df['CommitTime'].apply( lambda us_ts: datetime.datetime.fromtimestamp(us_ts * 1e-6)) bin_interval_s = 1 grouped = merge_df.groupby( pd.Grouper(key='time', freq='{}s'.format(bin_interval_s))) grouped_apply_orders = grouped.apply(throughput_apply_func) grouped_apply_orders = grouped_apply_orders.dropna() grouped_apply_orders = grouped_apply_orders[5:-5] # print(grouped_apply_orders['AvgThroughput']) throughput = (grouped_apply_orders['AvgThroughput']/bin_interval_s).mean() return throughput if __name__ == '__main__': parser = argparse.ArgumentParser(description='Process some integers.') parser.add_argument('--num_replicas', type=int, default = 3, help='Specify the number of replicas ') parser.add_argument('--num_proxies', type=int, default = 2, help='Specify the number of proxies ') parser.add_argument('--num_clients', type=int, default = 10, help='Specify the number of clients ') args = parser.parse_args() num_replicas = args.num_replicas num_proxies = args.num_proxies num_clients = args.num_clients print("replicas: ", num_replicas) print("proxies: ", num_proxies) print("clients: ", num_clients) folder_name = "stats" stats_folder = "{login_path}/{folder_name}".format( login_path = LOGIN_PATH, folder_name = folder_name ) client_df_list = [] for i in range(num_clients): file_name = "Client-Stats-"+str(i+1) client_df = pd.read_csv(stats_folder+"/"+file_name) client_df_list.append(client_df) client_df = pd.concat(client_df_list) client_df['Latency'] = client_df['CommitTime']-client_df['SendTime'] stats = "" stats += "Num:"+str(len(client_df))+"\n" stats += "50p:\t"+str(client_df['Latency'].quantile(.5))+"\n" stats += "75p:\t"+str(client_df['Latency'].quantile(.75))+"\n" stats += "90p:\t"+str(client_df['Latency'].quantile(.9))+"\n" fast_num = len(client_df[client_df['CommitType']== FAST_REPLY]) stats += "Fast:\t"+str(fast_num/ len(client_df))+"\n" print(stats) throughput_stats = ThroughputAnalysis(client_df) print("Throughput ", throughput_stats) proxy_df_list = [] 
for i in range(num_proxies): file_name = "Proxy-Stats-"+str(i+1)+".csv" proxy_df = pd.read_csv(stats_folder+"/"+file_name) proxy_df_list.append(proxy_df) print("Proxy ", len(proxy_df)) proxy_df = pd.concat(proxy_df_list) proxy_df = proxy_df.sort_values(by=['ClientTime']) proxy_df["E2E"] = proxy_df["ProxyRecvTime"]-proxy_df["ProxyTime"] proxy_df["Bound"] = proxy_df["Deadline"]-proxy_df["ProxyTime"] fast_num = len(proxy_df[proxy_df["CommitType"]==6]) print("fast commit ratio ", fast_num/len(proxy_df)) print("Bound ", proxy_df["Bound"].quantile(.5)) print("Proxy-E2E 50p ", proxy_df["E2E"].quantile(.5), "\t75p:", proxy_df["E2E"].quantile(.75), "\t90p:", proxy_df["E2E"].quantile(.9), "\t95p:", proxy_df["E2E"].quantile(.95)) # fast_df = proxy_df[proxy_df["SlowReplyTime"]==0].copy() # slow_df = proxy_df[proxy_df["SlowReplyTime"]>0].copy() # proxy_df['H1']=proxy_df['ProxyTime']-proxy_df["ClientTime"] # proxy_df['H2']=proxy_df['RecvTime']-proxy_df["ProxyTime"] # fast_df['F1']=fast_df['FastReplyTime']-fast_df["RecvTime"] # slow_df['HF1']=slow_df['SlowReplyTime']-slow_df["RecvTime"] # slow_df['HF3']=slow_df['SlowReplyTime']-slow_df["FastReplyTime"] # fast_df['H3']=fast_df['ProxyRecvTime']-fast_df["FastReplyTime"] # slow_df['H3']=slow_df['ProxyRecvTime']-slow_df["SlowReplyTime"] # fast_df['total'] = fast_df["ProxyRecvTime"] - fast_df["ClientTime"] # slow_df['total'] = slow_df["ProxyRecvTime"] - slow_df["ClientTime"] embed() ================================================ FILE: scripts/launch.py ================================================ import os import subprocess from subprocess import PIPE, Popen import time import ruamel.yaml from termcolor import colored import argparse LOGIN_PATH = "/home/steam1994" TAG = "opensource-test" SSH_KEY = "/home/steam1994/.ssh/id_rsa" ssh_identity = '-i {}'.format(SSH_KEY) if SSH_KEY else '' # Prefix for SSH and SCP. 
SSH = 'ssh {} -q -o ConnectTimeout=2 -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no '.format( ssh_identity) SCP = 'scp -r {} -q -o ConnectTimeout=2 -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no '.format( ssh_identity) USERNAME = "steam1994" CMD_RETRY_TIMES = 3 def generate_ttcs_cfg_file(internal_ip, is_reference=False, use_ntp=False): if is_reference: content_str = '''management_address: "InternalIP" log_dir: "/var/opt/ttcs/log" subscription_mode: true coordinator_address: "c-gjk1994gjk1994-c89e.gcp.clockwork.io" coordinator_subscription_service_port: 6176 probe_address: "InternalIP" clock_quality: 10 correct_clock: false''' cfg_file = content_str.replace("InternalIP", internal_ip) cfg_file_name = "ttcs-agent.cfg" with open(cfg_file_name, "w") as f: f.write(cfg_file) f.close() return cfg_file_name else: if use_ntp: content_str = '''management_address: "InternalIP" log_dir: "/var/opt/ttcs/log" subscription_mode: true coordinator_address: "c-gjk1994gjk1994-c89e.gcp.clockwork.io" coordinator_subscription_service_port: 6176 probe_address: "InternalIP" clock_quality: 1 correct_clock: false''' else: content_str = '''management_address: "InternalIP" log_dir: "/var/opt/ttcs/log" subscription_mode: true coordinator_address: "c-gjk1994gjk1994-c89e.gcp.clockwork.io" coordinator_subscription_service_port: 6176 probe_address: "InternalIP" clock_quality: 1 correct_clock: true''' cfg_file = content_str.replace("InternalIP", internal_ip) cfg_file_name = "ttcs-agent.cfg" with open(cfg_file_name, "w") as f: f.write(cfg_file) f.close() return cfg_file_name def retry_proc_error(procs_list): procs_error = [] for server, proc, cmd in procs_list: output, err = proc.communicate() if proc.returncode != 0: proc = Popen(cmd.split(), stdout=PIPE, stderr=PIPE) procs_error.append((server, proc, cmd)) return procs_error def start_ttcs_node(internal_ip, is_reference, use_ntp=False): clean_prev_deb_cmd = "sudo dpkg -P ttcs-agent" run_command([internal_ip], 
clean_prev_deb_cmd, in_background=False) install_deb_cmd = "sudo dpkg -i /home/steam1994/ttcs-agent_1.0.21_amd64.deb" #install_deb_cmd = "sudo dpkg -i /root/ttcs-agent_1.0.12_amd64.deb" run_command([internal_ip], install_deb_cmd, in_background=False) cfg_file = generate_ttcs_cfg_file(internal_ip, is_reference, use_ntp) local_file_path = "./ttcs-agent.cfg" remote_dir = "/etc/opt/ttcs" remote_path = remote_dir + "/ttcs-agent.cfg" chmod_cmd = "sudo chmod -R 777 {remote_dir}".format(remote_dir=remote_dir) run_command([internal_ip], chmod_cmd, in_background=False) rm_cmd = "sudo rm -f {remote_path}".format(remote_path=remote_path) run_command([internal_ip], rm_cmd, in_background=False) scp_files([internal_ip], local_file_path, remote_path, to_remote=True) if is_reference is not True and use_ntp is False: stop_ntp_cmd = "sudo systemctl stop ntp" run_command([internal_ip], stop_ntp_cmd, in_background=False) disable_ntp_cmd = "sudo systemctl disable ntp" run_command([internal_ip], disable_ntp_cmd, in_background=False) stop_ntp_cmd = "sudo systemctl stop chronyd" run_command([internal_ip], stop_ntp_cmd, in_background=False) disable_ntp_cmd = "sudo systemctl disable chronyd" run_command([internal_ip], disable_ntp_cmd, in_background=False) else: enable_ntp_cmd = "sudo systemctl enable chronyd" run_command([internal_ip], enable_ntp_cmd, in_background=False) start_ntp_cmd = "sudo systemctl start chronyd" run_command([internal_ip], start_ntp_cmd, in_background=False) sys_start_ttcp_agent_cmd = "sudo systemctl start ttcs-agent" run_command([internal_ip], sys_start_ttcp_agent_cmd, in_background=False) def launch_ttcs(server_ip_list): stop_ntp_cmd = "sudo systemctl stop chronyd" run_command(server_ip_list, stop_ntp_cmd, in_background=False) disable_ntp_cmd = "sudo systemctl disable chronyd" run_command(server_ip_list, disable_ntp_cmd, in_background=False) stop_ntp_cmd = "sudo systemctl stop ntp" run_command(server_ip_list, stop_ntp_cmd, in_background=False) disable_ntp_cmd = "sudo 
systemctl disable ntp" run_command(server_ip_list, disable_ntp_cmd, in_background=False) sys_start_ttcp_agent_cmd = "sudo systemctl start ttcs-agent" run_command(server_ip_list, sys_start_ttcp_agent_cmd, in_background=False) def scp_files(server_ip_list, local_path_to_file, remote_dir, to_remote): ''' copies the file in 'local_path_to_file' to the 'remote_dir' in all servers whose external ip addresses are in 'server_ip_list' args server_ip_list: list of external IP addresses to communicate with local_path_to_file: e.g. ./script.py remote_dir: e.g. ~ to_remote: whether to copy to remote (true) or vice versa (false) returns boolean whether operation was succesful on all servers or not ''' src = remote_dir if not to_remote else local_path_to_file src_loc = 'remote' if not to_remote else 'local' dst = remote_dir if to_remote else local_path_to_file dst_loc = 'remote' if to_remote else 'local' message = 'from ({src_loc}) {src} to ({dst_loc}) {dst}'.format( src_loc=src_loc, src=src, dst_loc=dst_loc, dst=dst) print('---- started scp {}'.format(message)) procs = [] for server in server_ip_list: if to_remote: cmd = '{} {} {}@{}:{}'.format(SCP, local_path_to_file, USERNAME, server, remote_dir) proc = Popen(cmd.split(), stdout=PIPE, stderr=PIPE) else: cmd = '{} {}@{}:{} {}'.format(SCP, USERNAME, server, remote_dir, local_path_to_file) proc = Popen(cmd.split(), stdout=PIPE, stderr=PIPE) # print("scp cmd ", cmd) procs.append((server, proc, cmd)) success = True procs_error = retry_proc_error(procs) retries = 1 while retries < CMD_RETRY_TIMES and procs_error: procs_error = retry_proc_error(procs) retries += 1 if retries >= CMD_RETRY_TIMES and procs_error: success = False for server, proc, cmd in procs_error: output, err = proc.communicate() if proc.returncode != 0: print( colored('[{}]: FAIL SCP - [{}]'.format(server, cmd), 'yellow')) print(colored('Error Response:', 'blue', attrs=['bold']), proc.returncode, output, err) if success: print( colored('---- SUCCESS SCP {} on 
{}'.format(message, str(server_ip_list)), 'green', attrs=['bold'])) else: print( colored('---- FAIL SCP {}'.format(message), 'red', attrs=['bold'])) return success def run_command(server_ip_list, cmd, in_background=True): ''' runs the command 'cmd' in all servers whose external ip addresses are in 'server_ip_list' cfg server_ip_list: list of external IP addresses to communicate with cmd: command to run returns boolean whether operation was succesful on all servers or not ''' if not in_background: print('---- started to run command - [{}] on {}'.format( cmd, str(server_ip_list))) else: print( colored('---- started to run [IN BACKGROUND] command - [{}] on {}'. format(cmd, str(server_ip_list)), 'blue', attrs=['bold'])) procs = [] for server in server_ip_list: ssh_cmd = '{} {}@{} {}'.format(SSH, USERNAME, server, cmd) proc = Popen(ssh_cmd.split(), stdout=PIPE, stderr=PIPE) procs.append((server, proc, ssh_cmd)) success = True output = '' if not in_background: procs_error = retry_proc_error(procs) retries = 1 while retries < CMD_RETRY_TIMES and procs_error: procs_error = retry_proc_error(procs) retries += 1 if retries >= CMD_RETRY_TIMES and procs_error: success = False for server, proc, cmd in procs_error: output, err = proc.communicate() if proc.returncode != 0: print( colored( '[{}]: FAIL run command - [{}]'.format( server, cmd), 'yellow')) print(colored('Error Response:', 'blue', attrs=['bold']), proc.returncode, output, err) if success: print( colored('---- SUCCESS run command - [{}] on {}'.format( cmd, str(server_ip_list)), 'green', attrs=['bold'])) else: print( colored('---- FAIL run command - [{}]'.format(cmd), 'red', attrs=['bold'])) return success, output def create_instance(instance_name, image=None, machine_type = "n1-standard-4", customzedZone = "us-central1-a", customzedIp = None, require_external_ip=False, second_ip = False ): # Construct gcloud command to create instance. 
network_address_config = ("--network-interface no-address" if require_external_ip == False else "") if customzedIp is not None: network_address_config += ",private-network-ip="+customzedIp if second_ip: network_address_config += " --network-interface subnet=subnet-1,no-address" # scopes = "--scopes storage-full,https://www.googleapis.com/auth/bigtable.admin,https://www.googleapis.com/auth/bigtable.data,https://www.googleapis.com/auth/bigquery" # if full_access_to_cloud_apis: scopes = "--scopes=https://www.googleapis.com/auth/cloud-platform" create_instance_cmd = """gcloud beta compute instances create {inst} --zone {zone} --image-family {source_image} --machine-type {machine_type} {network} {scopes} --boot-disk-size 50GB""".format( inst=instance_name, zone=customzedZone, source_image=image, machine_type=machine_type, network=network_address_config, scopes=scopes, ) # print(create_instance_cmd) # Run gcloud command to create machine. proc = Popen(create_instance_cmd, stdout=PIPE, stderr=PIPE, shell=True) # Wait for the process end and print error in case of failure output, error = proc.communicate() if proc.returncode != 0: print(colored("Failed to create instance", color="red", attrs=["bold"])) print(colored("Error Response: ", color="blue", attrs=["bold"]), output, error) def del_instance_list(instance_list, zone="us-central1-a"): for machine in instance_list: print(colored("Deleting "+machine, "red", attrs=['bold'])) subprocess.Popen( 'gcloud -q compute instances delete {inst} --zone {zone}'.format( inst=machine, zone=zone).split()) def stop_instance_list(instance_list, zone="us-central1-a"): stop_cmd = 'gcloud compute instances stop {inst} --zone {zone}'.format( inst=' '.join(instance_list), zone = zone ) print(stop_cmd) os.system(stop_cmd) def start_instance_list(instance_list, zone="us-central1-a"): start_cmd = 'gcloud compute instances start {inst} --zone {zone}'.format( inst=' '.join(instance_list), zone = zone ) print(start_cmd) os.system(start_cmd) if 
__name__ == '__main__': parser = argparse.ArgumentParser(description='Process some integers.') parser.add_argument('--num_replicas', type=int, default = 3, help='Specify the number of replicas ') parser.add_argument('--num_proxies', type=int, default = 2, help='Specify the number of proxies ') parser.add_argument('--num_clients', type=int, default = 10, help='Specify the number of clients ') args = parser.parse_args() num_replicas = args.num_replicas num_proxies = args.num_proxies num_clients = args.num_clients print("replicas: ", num_replicas) print("proxies: ", num_proxies) print("clients: ", num_clients) # cfg_file_name = generate_ttcs_cfg_file("10.128.3.79", is_reference=True, use_ntp=False) replica_ips = ["10.128.2."+str(i+10) for i in range(3)] proxy_ips = ["10.128.2."+str(i+20) for i in range(3, 5) ] client_ips = ["10.128.2."+str(i+30) for i in range(5, 15) ] replica_ips = replica_ips[0:num_replicas] proxy_ips = proxy_ips[0:num_proxies] client_ips = client_ips[0:num_clients] replica_name_list = [TAG+"-replica-"+str(i) for i in range(num_replicas) ] proxy_name_list = [ TAG+"-proxy-"+str(i) for i in range(num_proxies) ] client_name_list = [ TAG+"-client-"+str(i) for i in range(num_clients) ] vm_ips = replica_ips + proxy_ips + client_ips vm_name_list = replica_name_list + proxy_name_list + client_name_list replica_vm_type = "n1-standard-16" proxy_vm_type = "n1-standard-32" client_vm_type = "n1-standard-4" binary_path = "{login_path}/nezhav2/bazel-bin/".format(login_path = LOGIN_PATH) config_path = "{login_path}/nezhav2/configs".format(login_path = LOGIN_PATH) yaml = ruamel.yaml.YAML() # for i in range(num_replicas): # create_instance(instance_name = replica_name_list[i], # image= "opensource-nezha", # machine_type = replica_vm_type, # customzedZone="us-central1-a", # customzedIp = replica_ips[i] ) # print(colored("Created "+replica_name_list[i], "green", attrs=['bold'])) # for i in range(num_proxies): # create_instance(instance_name = proxy_name_list[i], # 
image= "opensource-nezha", # machine_type = proxy_vm_type, # customzedZone="us-central1-a", # customzedIp = proxy_ips[i] ) # print(colored("Created "+proxy_name_list[i], "green", attrs=['bold'])) # for i in range(num_clients): # create_instance(instance_name = client_name_list[i], # image= "opensource-nezha", # machine_type = client_vm_type, # customzedZone="us-central1-a", # customzedIp = client_ips[i] ) # print(colored("Created "+client_name_list[i], "green", attrs=['bold'])) # time.sleep(120) # for i in range(len(vm_ips)): # start_ttcs_node(vm_ips[i],False) # exit(0) #### del_instance_list(instance_list=vm_name_list) # stop_instance_list(instance_list = vm_name_list) # exit(0) # start_instance_list(instance_list = vm_name_list) # time.sleep(60) # print(vm_ips) # launch_ttcs(vm_ips) # exit(0) # Generate configs for i in range(num_replicas): config_template = "{config_path}/nezha-replica-config-template.yaml".format(config_path = config_path) config_file = "{config_path}/nezha-replica-config-{idx}.yaml".format(config_path=config_path, idx =i) f = open(config_template, "r") yaml_data = yaml.load(f) yaml_data["replica-id"] = i yaml_data["replica-ips"] = replica_ips out_file = open(config_file, "w") yaml.indent(sequence=4, offset=2) yaml.dump(yaml_data, out_file) for i in range(num_proxies): config_template = "{config_path}/nezha-proxy-config-template.yaml".format(config_path = config_path) config_file = "{config_path}/nezha-proxy-config-{idx}.yaml".format(config_path=config_path, idx =i+1) f = open(config_template, "r") yaml_data = yaml.load(f) yaml_data["proxy-info"]["proxy-id"] = i + 1 yaml_data["proxy-info"]["proxy-ip"] = proxy_ips[i] yaml_data["replica-info"]["replica-ips"] = replica_ips out_file = open(config_file, "w") yaml.indent(sequence=4, offset=2) yaml.dump(yaml_data, out_file) for i in range(num_clients): config_template = "{config_path}/nezha-client-config-template.yaml".format(config_path = config_path) config_file = 
"{config_path}/nezha-client-config-{idx}.yaml".format(config_path = config_path, idx= i+1) f = open(config_template, "r") yaml_data = yaml.load(f) yaml_data["proxy-info"]["proxy-ips"] = proxy_ips yaml_data["client-info"]["client-id"] = i+1 yaml_data["client-info"]["client-ip"] = client_ips[i] out_file = open(config_file, "w") yaml.indent(sequence=4, offset=2) yaml.dump(yaml_data, out_file) # Copy config for i in range(num_replicas): config_file = "{config_path}/nezha-replica-config-{idx}.yaml".format(config_path=config_path, idx =i) scp_files([replica_ips[i]], config_file, config_file, to_remote = True) for i in range(num_proxies): config_file = "{config_path}/nezha-proxy-config-{idx}.yaml".format(config_path=config_path, idx =i+1) scp_files([proxy_ips[i]], config_file, config_file, to_remote = True) for i in range(num_clients): config_file = "{config_path}/nezha-client-config-{idx}.yaml".format(config_path = config_path, idx= i+1) scp_files([client_ips[i]], config_file, config_file, to_remote = True) # exit(0) remote_path = "{login_path}/nezhav2/bazel-bin/*".format(login_path = LOGIN_PATH) rm_cmd = "sudo rm -rf {remote_path}".format(remote_path=remote_path) run_command(vm_ips, rm_cmd, in_background=False) mkdir_cmd = "mkdir -p {binary_path}/replica".format(binary_path = binary_path) run_command(vm_ips, mkdir_cmd, in_background=False) mkdir_cmd = "mkdir -p {binary_path}/proxy".format(binary_path = binary_path) run_command(vm_ips, mkdir_cmd, in_background=False) mkdir_cmd = "mkdir -p {binary_path}/client".format(binary_path = binary_path) run_command(vm_ips, mkdir_cmd, in_background=False) binary_file = "{binary_path}/client/nezha_client".format(binary_path=binary_path) scp_files(vm_ips, binary_file, binary_file, to_remote = True) binary_file = "{binary_path}/replica/nezha_replica".format(binary_path=binary_path) scp_files(vm_ips, binary_file, binary_file, to_remote = True) binary_file = "{binary_path}/proxy/nezha_proxy".format(binary_path=binary_path) 
scp_files(vm_ips, binary_file, binary_file, to_remote = True) # Kill existing procs kill_cmd = "sudo pkill -9 replica" run_command(vm_ips, kill_cmd, in_background=False) kill_cmd = "sudo pkill -9 proxy" run_command(vm_ips, kill_cmd, in_background=False) kill_cmd = "sudo pkill -9 client" run_command(vm_ips, kill_cmd, in_background=False) ## Launch replicas (id starts from 0) for i in range(num_replicas): replica_cmd = "{binary_path}/replica/nezha_replica --config {config_path}/nezha-replica-config-{idx}.yaml > {log_file} 2>&1 &".format( binary_path = binary_path, config_path = config_path, idx =i, log_file = "replica-log-"+str(i) ) print(colored(replica_cmd, "yellow", attrs=['bold'])) run_command([replica_ips[i]], replica_cmd, in_background=False) # input("stop...") # Launch proxies (id starts from 1) for i in range(num_proxies): proxy_cmd = "{binary_path}/proxy/nezha_proxy --config {config_path}/nezha-proxy-config-{idx}.yaml > {log_file} 2>&1 &".format( binary_path = binary_path, config_path = config_path, idx = i+1, log_file = "proxy-log-"+str(i+1) ) print(colored(proxy_cmd, "yellow", attrs=['bold'])) run_command([proxy_ips[i]], proxy_cmd, in_background = False) # Launch clients (id starts from 2) for i in range(num_clients): client_cmd = "{binary_path}/client/nezha_client --config {config_path}/nezha-client-config-{idx}.yaml >{log_file} 2>&1 &".format( binary_path = binary_path, config_path = config_path, idx = i+1, log_file = "client-log-"+str(i+1) ) print(colored(client_cmd, "yellow", attrs=['bold'])) run_command([client_ips[i]], client_cmd, in_background = True) print("Sleep...") time.sleep(90) # Copy Stats File folder_name = "stats" stats_folder = "{login_path}/{folder_name}".format( login_path = LOGIN_PATH, folder_name = folder_name ) mkdir_cmd = "sudo mkdir -p -m 777 {stats_folder}".format(stats_folder = stats_folder) os.system(mkdir_cmd) for i in range(num_clients): file_name = "Client-Stats-"+str(i+1) local_file_path = "{stats_folder}/{file_name}".format( 
stats_folder = stats_folder, file_name = file_name ) remote_path = "{stats_folder}/{file_name}".format( stats_folder = LOGIN_PATH, file_name = file_name ) scp_files([client_ips[i]], local_file_path, remote_path, to_remote=False) for i in range(num_proxies): file_name = "Proxy-Stats-"+str(i+1)+".csv" local_file_path = "{stats_folder}/{file_name}".format( stats_folder = stats_folder, file_name = file_name ) remote_path = "{stats_folder}/{file_name}".format( stats_folder = LOGIN_PATH, file_name = file_name ) scp_files([proxy_ips[i]], local_file_path, remote_path, to_remote=False) ================================================ FILE: scripts/local_test.sh ================================================ #!/bin/bash export FLAGS_alsologtostderr=1 echo "Launching replica 0..." (./bazel-bin/replica/nezha_replica --config ./configs/local/nezha-replica-config-0.yam & ) echo "Launching replica 1..." (./bazel-bin/replica/nezha_replica --config ./configs/local/nezha-replica-config-1.yaml &) echo "Launching replica 2..." (./bazel-bin/replica/nezha_replica --config ./configs/local/nezha-replica-config-2.yaml &) echo "Launching proxy..." (./bazel-bin/proxy/nezha_proxy --config ./configs/local/nezha-proxy-config.yaml &) echo "Launching client..." ./bazel-bin/client/nezha_client --config ./configs/local/nezha-client-config.yaml # TODO(Katie): This is currently only checking if at least one request succeeded. # It does not check if the client/replica/proxy failed for some reason file="Client-Stats-1" if [ -e "$file" ]; then line_count=$(wc -l < "$file") if [ "$line_count" -le 1 ]; then echo "File '$file' exists but has only one line." echo "No successful requests." exit 1 else echo "Success. File '$file' exists and has more than one line." fi else echo "File '$file' does not exist." exit 1 fi # Exit gracefully for github actions. It's okay if there are stray replica processes. 
if [[ "$1" == "--github" ]]; then
    exit 0
fi

# Kill replicas
# The trap is registered as the last statement, so it fires on the script's
# own EXIT (or on SIGINT/SIGTERM) and signals the whole process group.
trap 'trap - SIGTERM && kill 0' SIGINT SIGTERM EXIT

================================================
FILE: scripts/ttcs-agent.cfg
================================================
management_address: "10.128.2.15"
log_dir: "/var/opt/ttcs/log"
subscription_mode: true
coordinator_address: "c-gjk1994gjk1994-c89e.gcp.clockwork.io"
coordinator_subscription_service_port: 6176
probe_address: "10.128.2.15"
clock_quality: 1
correct_clock: true

================================================
FILE: third_party/concurrentqueue/BUILD.bazel
================================================
# Header-only target. The header is listed under `hdrs` (not `srcs`) because
# it is meant to be included directly by the targets that depend on this one.
cc_library(
    name = "concurrentqueue",
    hdrs = ["concurrentqueue.h"],
    visibility = [
        "//visibility:public",
    ],
)

================================================
FILE: third_party/glog/BUILD.bazel
================================================


================================================
FILE: third_party/glog/BUILD.glog
================================================
licenses(['notice'])

load('@//third_party/glog:glog.bzl', 'glog_library')

# The name argument is unused by the macro, so an empty string is passed.
glog_library('')

================================================
FILE: third_party/glog/glog.bzl
================================================
"""glog library build rule."""

load("@rules_cc//cc:defs.bzl", "cc_library")

def glog_library(name, namespace = "google", with_gflags = 1):
    """Implement a macro glog_library() that the BUILD file can load.

    Declares a public `glog` cc_library plus the genrules that produce its
    generated inputs: `gen_sh` (a sed script), `config_h`
    (glog_internal/config.h) and one `<header>_h` rule for each public
    header generated from a `.h.in` template.

    By default, glog is built with gflags support.  You can change this
    behavior by using glog_library(with_gflags=0)

    This file is inspired by the following sample BUILD files:
        https://github.com/google/glog/issues/61
        https://github.com/google/glog/files/393474/BUILD.txt

    Args:
      name: The name of the rule (this is not used; it only exists to
        silence the linter).
      namespace: Namespace to use. It is passed to the compiler through
        -DGOOGLE_NAMESPACE; the sed script that generates the public headers
        always substitutes the literal `google`.
      with_gflags: Build with gflags support. When truthy, adds
        -DHAVE_LIB_GFLAGS and a dependency on
        @com_github_gflags_gflags//:gflags.
    """

    # Generated files live under external/<repo> when this macro is
    # evaluated from a repository other than the main one.
    if native.repository_name() != "@":
        gendir = "$(GENDIR)/external/" + native.repository_name().lstrip("@")
    else:
        gendir = "$(GENDIR)"

    cc_library(
        name = "glog",
        visibility = ["//visibility:public"],
        srcs = [
            ":config_h",
            "src/base/commandlineflags.h",
            "src/base/googleinit.h",
            "src/base/mutex.h",
            "src/demangle.cc",
            "src/demangle.h",
            "src/logging.cc",
            "src/raw_logging.cc",
            "src/signalhandler.cc",
            "src/stacktrace.h",
            "src/stacktrace_generic-inl.h",
            "src/stacktrace_libunwind-inl.h",
            "src/stacktrace_powerpc-inl.h",
            "src/stacktrace_windows-inl.h",
            "src/stacktrace_x86-inl.h",
            "src/stacktrace_x86_64-inl.h",
            "src/symbolize.cc",
            "src/symbolize.h",
            "src/utilities.cc",
            "src/utilities.h",
            "src/vlog_is_on.cc",
        ],
        hdrs = [
            ":logging_h",
            ":raw_logging_h",
            ":stl_logging_h",
            ":vlog_is_on_h",
            "src/glog/log_severity.h",
        ],
        strip_include_prefix = "src",
        copts = [
            # Disable warnings that exists in glog.
            "-Wno-sign-compare",
            "-Wno-unused-function",
            "-Wno-unused-local-typedefs",
            "-Wno-unused-variable",
            "-DGLOG_BAZEL_BUILD",
            # Inject a C++ namespace.
            "-DGOOGLE_NAMESPACE='%s'" % namespace,
            # Allows src/base/mutex.h to include pthread.h.
            "-DHAVE_PTHREAD",
            # Allows src/logging.cc to determine the host name.
            "-DHAVE_SYS_UTSNAME_H",
            # For src/utilities.cc.
            "-DHAVE_SYS_SYSCALL_H",
            "-DHAVE_SYS_TIME_H",
            "-DHAVE_STDINT_H",
            "-DHAVE_STRING_H",
            # Enable dumping stacktrace upon sigaction.
            "-DHAVE_SIGACTION",
            # For logging.cc.
            "-DHAVE_PREAD",
            "-DHAVE___ATTRIBUTE__",
            # Enable UNISTD_H for symlinking.
            "-DHAVE_UNISTD_H",
            # For stacktrace dumping.
            "-DHAVE_UNWIND_H",
            # Include generated header files.
            "-I%s/glog_internal" % gendir,
        ] + ([
            # Use gflags to parse CLI arguments.
            "-DHAVE_LIB_GFLAGS",
        ] if with_gflags else []),
        deps = [
            "@com_github_gflags_gflags//:gflags",
        ] if with_gflags else [],
    )

    # Emits gen.sh: a sed filter that replaces the @ac_...@ placeholders found
    # in the *.h.in templates. The catch-all '@ac_cv_have_.*@' -> 0 rule comes
    # after the specific '-> 1' rules, so only unlisted features become 0.
    native.genrule(
        name = "gen_sh",
        outs = [
            "gen.sh",
        ],
        cmd = r'''\
#!/bin/sh
cat > $@ <<"EOF"
sed -e 's/@ac_cv_cxx_using_operator@/1/g' \
    -e 's/@ac_cv_have_unistd_h@/1/g' \
    -e 's/@ac_cv_have_stdint_h@/1/g' \
    -e 's/@ac_cv_have_systypes_h@/1/g' \
    -e 's/@ac_cv_have_libgflags_h@/1/g' \
    -e 's/@ac_cv_have_uint16_t@/1/g' \
    -e 's/@ac_cv_have___builtin_expect@/1/g' \
    -e 's/@ac_cv_have_.*@/0/g' \
    -e 's/@ac_google_start_namespace@/namespace google {/g' \
    -e 's/@ac_google_end_namespace@/}/g' \
    -e 's/@ac_google_namespace@/google/g' \
    -e 's/@ac_cv___attribute___noinline@/__attribute__((noinline))/g' \
    -e 's/@ac_cv___attribute___noreturn@/__attribute__((noreturn))/g' \
    -e 's/@ac_cv___attribute___printf_4_5@/__attribute__((__format__ (__printf__, 4, 5)))/g'
EOF
''',
    )

    # Produces config.h from the CMake template by turning every leading
    # '#cmakedefine' into '//cmakedefine', i.e. commenting those lines out.
    native.genrule(
        name = "config_h",
        srcs = [
            "src/config.h.cmake.in",
        ],
        outs = [
            "glog_internal/config.h",
        ],
        cmd = "awk '{ gsub(/^#cmakedefine/, \"//cmakedefine\"); print; }' $< > $@",
    )

    # One genrule per public header: pipe the .h.in template through gen.sh.
    for f in ["vlog_is_on", "stl_logging", "raw_logging", "logging"]:
        native.genrule(
            name = "%s_h" % f,
            srcs = [
                "src/glog/%s.h.in" % f,
            ],
            outs = [
                "src/glog/%s.h" % f,
            ],
            cmd = "$(location :gen_sh) < $< > $@",
            tools = [":gen_sh"],
        )

================================================
FILE: third_party/junction/BUILD.bazel
================================================
load("@rules_foreign_cc//foreign_cc:cmake.bzl", "cmake")

# Every file of the repository, handed to the cmake() rule below as its source.
filegroup(
    name = "all_srcs",
    srcs = glob(["**"]),
    visibility = ["//visibility:public"],
)

# Builds junction with CMake. The turf sources are supplied through `data`,
# and both static archives are declared as outputs of this one rule.
cmake(
    name = "libjunction",
    lib_source = ":all_srcs",
    data = [ "@com_github_preshing_turf//:all_srcs"],
    visibility = ["//visibility:public"],
    out_static_libs = ["libjunction.a", "libturf.a"],
)

================================================
FILE: third_party/junction/junction.patch
================================================
diff --git a/CMakeLists.txt b/CMakeLists.txt index 93cf495..686aa50
100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -30,8 +30,9 @@ list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules") # Add turf targets and import its macros since we use them below get_filename_component(outerPath "${CMAKE_CURRENT_LIST_DIR}/.." ABSOLUTE) -set(TURF_ROOT "${outerPath}/turf" CACHE STRING "Path to Turf") +set(TURF_ROOT "${outerPath}/com_github_preshing_turf" CACHE STRING "Path to Turf") include("${TURF_ROOT}/cmake/Macros.cmake") +message(TURF_ROOT="${TURF_ROOT}") if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) # If this is the root project, apply build settings here so that # they're applied to all targets

================================================
FILE: third_party/libev/BUILD.bazel
================================================
load("@rules_foreign_cc//foreign_cc:configure.bzl", "configure_make")

# Every file of the repository, handed to configure_make() as its source.
filegroup(
    name = "all_srcs",
    srcs = glob(["**"]),
    visibility = ["//visibility:public"],
)

# Builds libev through rules_foreign_cc's configure_make rule; no options or
# output names are overridden here.
configure_make(
    name = "libev",
    lib_source = ":all_srcs",
    visibility = ["//visibility:public"],
)

================================================
FILE: third_party/openssl/BUILD.bazel
================================================
load("@rules_foreign_cc//foreign_cc:configure.bzl", "configure_make")

# Every file of the repository, handed to configure_make() as its source.
filegroup(
    name = "all_srcs",
    srcs = glob(["**"]),
    visibility = ["//visibility:public"],
)

# Builds OpenSSL through rules_foreign_cc. The configure step runs the script
# named `config` with `no-shared`, and the two static archives listed in
# out_static_libs are the declared outputs.
configure_make(
    name = "openssl",
    configure_command = "config",
    configure_options = [
        "no-shared",
    ],
    lib_source = ":all_srcs",
    out_static_libs = [
        "libssl.a",
        "libcrypto.a",
    ],
    visibility = ["//visibility:public"],
)

================================================
FILE: third_party/turf/BUILD.bazel
================================================
# This file only exports the sources: no cmake() rule is declared here, so the
# unused load of the `cmake` symbol has been dropped.
# NOTE(review): presumably consumed as @com_github_preshing_turf//:all_srcs by
# third_party/junction/BUILD.bazel -- confirm against WORKSPACE.
filegroup(
    name = "all_srcs",
    srcs = glob(["**"]),
    visibility = ["//visibility:public"],
)

================================================
FILE: ttcs-agent.cfg
================================================
management_address: "10.128.2.13"
log_dir: "/var/opt/ttcs/log"
subscription_mode: true
coordinator_address: "c-gjk1994gjk1994-c89e.gcp.clockwork.io"
coordinator_subscription_service_port: 6176
probe_address: "10.128.2.13"
clock_quality: 1
correct_clock: true