Full Code of Steamgjk/Nezha for AI

main 8db31f04af48 cached
78 files
416.5 KB
112.4k tokens
102 symbols
1 requests
Download .txt
Showing preview only (440K chars total). Download the full file or copy to clipboard to get everything.
Repository: Steamgjk/Nezha
Branch: main
Commit: 8db31f04af48
Files: 78
Total size: 416.5 KB

Directory structure:
gitextract_rknzum1x/

├── .github/
│   └── workflows/
│       └── build.yaml
├── .gitignore
├── .vscode/
│   └── settings.json
├── README.md
├── WORKSPACE
├── client/
│   ├── BUILD
│   ├── client.cc
│   ├── client.h
│   ├── client_config.h
│   └── client_run.cc
├── configs/
│   ├── dist/
│   │   ├── nezha-client-config.yaml
│   │   ├── nezha-proxy-config.yaml
│   │   ├── nezha-replica-config-0.yaml
│   │   ├── nezha-replica-config-1.yaml
│   │   ├── nezha-replica-config-2.yaml
│   │   └── nezha-replica-config.yaml
│   ├── local/
│   │   ├── nezha-client-config.yaml
│   │   ├── nezha-proxy-config.yaml
│   │   ├── nezha-replica-config-0.yaml
│   │   ├── nezha-replica-config-1.yaml
│   │   └── nezha-replica-config-2.yaml
│   ├── nezha-client-config-template.yaml
│   ├── nezha-proxy-config-template.yaml
│   └── nezha-replica-config-template.yaml
├── docs/
│   ├── Nezha.tla
│   ├── demo.md
│   └── tla-intro.md
├── external/
│   ├── gogoprotobuf.BUILD
│   └── googleapi.BUILD
├── lib/
│   ├── BUILD
│   ├── Rules.mk
│   ├── address.cc
│   ├── address.h
│   ├── common_struct.h
│   ├── common_type.h
│   ├── endpoint.cc
│   ├── endpoint.h
│   ├── message_handler.h
│   ├── message_type.cc
│   ├── message_type.h
│   ├── timer.h
│   ├── udp_socket_endpoint.cc
│   ├── udp_socket_endpoint.h
│   ├── utils.cc
│   ├── utils.h
│   └── zipfian.h
├── license.md
├── micro-bench/
│   ├── BUILD
│   ├── analysis.cc
│   ├── bench_receiver.cc
│   ├── bench_sender.cc
│   └── launch_micro.py
├── proto/
│   ├── BUILD
│   └── nezha_proto.proto
├── proxy/
│   ├── BUILD
│   ├── proxy.cc
│   ├── proxy.h
│   ├── proxy_config.h
│   └── proxy_run.cc
├── replica/
│   ├── BUILD
│   ├── replica.cc
│   ├── replica.h
│   ├── replica_config.h
│   └── replica_run.cc
├── scripts/
│   ├── analysis.py
│   ├── launch.py
│   ├── local_test.sh
│   └── ttcs-agent.cfg
├── third_party/
│   ├── concurrentqueue/
│   │   └── BUILD.bazel
│   ├── glog/
│   │   ├── BUILD.bazel
│   │   ├── BUILD.glog
│   │   └── glog.bzl
│   ├── junction/
│   │   ├── BUILD.bazel
│   │   └── junction.patch
│   ├── libev/
│   │   └── BUILD.bazel
│   ├── openssl/
│   │   └── BUILD.bazel
│   └── turf/
│       └── BUILD.bazel
└── ttcs-agent.cfg

================================================
FILE CONTENTS
================================================

================================================
FILE: .github/workflows/build.yaml
================================================
# Bazel action to build & test specific targets.
# Runs on every push to, and every pull request against, the main branch.
name: Bazel build

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]

# A single job: install a pinned Bazel release from the Bazel apt repository,
# build the replica/proxy/client packages, then run the local test script.
jobs:
  build:
    name: Bazel build and run local test
    # NOTE(review): confirm this runner image label is still offered by GitHub
    # Actions; older Ubuntu images are retired over time.
    runs-on: ubuntu-20.04
    steps:
      # Check out the repository so the build and scripts/ are available.
      - uses: actions/checkout@v3

      # Installs Bazel 5.2.0 (the same commands as README "Install Bazel") and
      # exposes it as /usr/bin/bazel.
      - name: Setup Bazel
        run: |
          sudo apt install -y apt-transport-https curl gnupg
          curl -fsSL https://bazel.build/bazel-release.pub.gpg | gpg --dearmor >bazel-archive-keyring.gpg
          sudo mv bazel-archive-keyring.gpg /usr/share/keyrings
          echo "deb [arch=amd64 signed-by=/usr/share/keyrings/bazel-archive-keyring.gpg] https://storage.googleapis.com/bazel-apt stable jdk1.8" | sudo tee /etc/apt/sources.list.d/bazel.list
          sudo apt update
          sudo apt install -y bazel-5.2.0
          sudo mv /usr/bin/bazel-5.2.0 /usr/bin/bazel

      - name: Build
        run: |
          bazel build //replica/... //proxy/... //client/...
      - name: Run local test
        run: ./scripts/local_test.sh --github


================================================
FILE: .gitignore
================================================
/.obj
/.bin
/bazel-*


================================================
FILE: .vscode/settings.json
================================================
{
    "C_Cpp.formatting": "clangFormat",
    "C_Cpp.clang_format_fallbackStyle": "{BasedOnStyle: Google, IncludeBlocks: Preserve, DerivePointerAlignment: false, PointerAlignment: Left}",
    "editor.formatOnSave": true,
    "files.associations": {
        "*.inc": "cpp",
        "cctype": "cpp",
        "clocale": "cpp",
        "cmath": "cpp",
        "cstdarg": "cpp",
        "cstddef": "cpp",
        "cstdio": "cpp",
        "cstdlib": "cpp",
        "cstring": "cpp",
        "ctime": "cpp",
        "cwchar": "cpp",
        "cwctype": "cpp",
        "array": "cpp",
        "atomic": "cpp",
        "bit": "cpp",
        "*.tcc": "cpp",
        "bitset": "cpp",
        "chrono": "cpp",
        "cinttypes": "cpp",
        "condition_variable": "cpp",
        "cstdint": "cpp",
        "deque": "cpp",
        "list": "cpp",
        "map": "cpp",
        "set": "cpp",
        "unordered_map": "cpp",
        "unordered_set": "cpp",
        "vector": "cpp",
        "exception": "cpp",
        "algorithm": "cpp",
        "functional": "cpp",
        "iterator": "cpp",
        "memory": "cpp",
        "memory_resource": "cpp",
        "numeric": "cpp",
        "optional": "cpp",
        "random": "cpp",
        "ratio": "cpp",
        "regex": "cpp",
        "string": "cpp",
        "string_view": "cpp",
        "system_error": "cpp",
        "tuple": "cpp",
        "type_traits": "cpp",
        "utility": "cpp",
        "fstream": "cpp",
        "initializer_list": "cpp",
        "iomanip": "cpp",
        "iosfwd": "cpp",
        "iostream": "cpp",
        "istream": "cpp",
        "limits": "cpp",
        "mutex": "cpp",
        "new": "cpp",
        "ostream": "cpp",
        "shared_mutex": "cpp",
        "sstream": "cpp",
        "stdexcept": "cpp",
        "streambuf": "cpp",
        "thread": "cpp",
        "typeinfo": "cpp",
        "csignal": "cpp",
        "any": "cpp",
        "cfenv": "cpp",
        "forward_list": "cpp",
        "future": "cpp",
        "scoped_allocator": "cpp",
        "typeindex": "cpp",
        "valarray": "cpp",
        "variant": "cpp",
        "hash_map": "cpp",
        "hash_set": "cpp",
        "*.ipp": "cpp",
        "csetjmp": "cpp",
        "strstream": "cpp",
        "charconv": "cpp",
        "codecvt": "cpp",
        "complex": "cpp",
        "source_location": "cpp",
        "rope": "cpp",
        "slist": "cpp"
    }
}

================================================
FILE: README.md
================================================
# Nezha

<img src="docs/nezha-img.jpeg" width="200">

----
Nezha (哪吒) is a legendary figure in Chinese mythology. Nezha has 3 heads and 6 arms, so he/she achieves much better fault tolerance than ordinary people :)

PS: We have created [[an FAQ page](https://github.com/Steamgjk/Nezha/wiki)]. Please take a look for a better understanding of Nezha.

## Paper and Presentation
Nezha: Deployable and High-Performance Consensus Using Synchronized Clocks (VLDB version) [[pdf](https://www.vldb.org/pvldb/vol16/p629-geng.pdf)]


Nezha: Deployable and High-Performance Consensus Using Synchronized Clocks (Technical Report) [[pdf](docs/Nezha-technical-report.pdf)]


An early presentation of Nezha was made at [Stanford Platform Lab Winter Review 2022](https://platformlab.stanford.edu/winter-review/platform-lab-winter-review-2022/) [[slides](https://platformlab.stanford.edu/wp-content/uploads/2022/03/Jinkun-Geng.pdf)]


If you find our work helpful to your research or project, we would greatly appreciate it if you could **add a star** to our repo and/or **cite our papers**. The bib entries for the papers are listed below.

```
@article{vldb23-nezha,
author = {Geng, Jinkun and Sivaraman, Anirudh and Prabhakar, Balaji and Rosenblum, Mendel}, 
title = {Nezha: Deployable and High-Performance Consensus Using Synchronized Clocks},
year = {2023},
journal = {Proceedings of the VLDB Endowment},
url = {https://www.vldb.org/pvldb/vol16/p629-geng.pdf},
publisher = {VLDB Endowment},
issn = {2150-8097},
volume = {16},
pages = {629-642},
numpages = {14}
}

@misc{nezha-tech,
  author = {Geng, Jinkun and Sivaraman, Anirudh and Prabhakar, Balaji and Rosenblum, Mendel},
  title = {Nezha: Deployable and High-Performance Consensus Using Synchronized Clocks},
  doi = {10.48550/ARXIV.2206.03285},
  url = {https://arxiv.org/abs/2206.03285},
  publisher = {arXiv},
  year = {2022},
}

```


## Clone Project

```
git clone --depth=1 https://github.com/Steamgjk/Nezha.git
```


## File Structure
The core part includes three modules (folders), i.e., 
- replica
- proxy
- client 

Each module is composed of three files: 
- a header file (e.g., replica.h), 
- a source implementation file (replica.cc), 
- a launching file (e.g., replica_run.cc). 

Each process reads an independent YAML file (e.g., nezha-replica-config-0.yaml) to get its full configuration. Sample configuration files are placed in the `configs` folder.



## Install Bazel

We use Bazel 5.2.0 for building Nezha.

```
# Install bazel 5.2.0
# Please follow the instructions at https://bazel.build/install/ubuntu#install-on-ubuntu, 
# or simply run the following commands

sudo apt install -y apt-transport-https curl gnupg
curl -fsSL https://bazel.build/bazel-release.pub.gpg | gpg --dearmor >bazel-archive-keyring.gpg
sudo mv bazel-archive-keyring.gpg /usr/share/keyrings
echo "deb [arch=amd64 signed-by=/usr/share/keyrings/bazel-archive-keyring.gpg] https://storage.googleapis.com/bazel-apt stable jdk1.8" | sudo tee /etc/apt/sources.list.d/bazel.list
sudo apt update
sudo apt install -y bazel-5.2.0
sudo mv /usr/bin/bazel-5.2.0 /usr/bin/bazel
bazel --version
```

## Build Nezha with Bazel

Since Bazel is becoming popular, we have migrated Nezha from a Makefile-based build system to the Bazel build system. The Bazel version in use is 5.2.0.

```
cd Nezha && bazel build //replica/... //client/... //proxy/...
```


After building the project successfully, the executable files will be generated in the folder named `bazel-bin`



## Single-Machine Tests

Please refer to [the single-machine instructions](docs/demo.md) to run Nezha under various scenarios (view change, request commit, recovery from failure of replica).

## Multi-Machine Tests

We use [scripts/launch.py](scripts/launch.py) to conduct distributed tests across multiple machines. After the tests have completed, [scripts/analysis.py](scripts/analysis.py) is used to analyze the results to generate performance numbers. The current scripts only support Google Cloud Platform (GCP). They require GCP credentials to create and delete VMs on GCP.


## Important Configuration Parameters
### Replica
- ```replica-ips``` must include 2f+1 ips
- ```replica-id``` starts from 0 to 2f
- ```index-transfer-batch```, ```request-key-transfer-batch```, ```request-transfer-batch```. The values of the three <em>batch parameters</em> should be carefully chosen in order not to overflow the [maximum size of UDP packets](https://stackoverflow.com/questions/1098897/what-is-the-largest-safe-udp-packet-size-on-the-internet). 

### Clients
- We support two types of clients, i.e., open-loop clients and closed-loop clients.
- Open-loop clients generate requests according to a Poisson process configured with a specific rate.
- Closed-loop clients use a sliding window protocol to keep a fixed number of requests in flight at any given time, releasing a new request when an old one is completed.
- ```is-openloop```:  When this flag is true, --poission-rate becomes meaningful.
- ```skew-factor``` and key-number decide the workload, which in turn affects the commutativity optimization.

### Proxy
- ```shard-num``` decides how many threads will be launched. Each shard includes 1 forwarding thread that forwards client requests to replicas and 1 replying thread that receives replies from replicas and performs the quorum check.
- ```max-owd``` is used in the clamping function to estimate the one-way delay; more details are described in Sec 4 [Adaptive latency bound] of the paper.

## Performance Benchmark
Refer to [our paper](https://arxiv.org/pdf/2206.03285.pdf) for the relevant performance stats. Compared with the experimental version, we have refactored the codebase with some higher-performance libraries (e.g. libev instead of libevent) and data structures (e.g., ConcurrentMap and ConcurrentQueue). Besides, we have also conducted further optimization with the pipeline. The performance will be somewhat better than the original version used in the paper. New benchmark data will be updated soon. 


## Authors and Acknowledgment
Nezha project is developed and maintained by [Jinkun Geng](https://steamgjk.github.io/) and his three supervisors, i.e., [Prof. Anirudh Sivaraman](https://cs.nyu.edu/~anirudh/), [Prof. Balaji Prabhakar](https://web.stanford.edu/~balaji/) and [Prof. Mendel Rosenblum](http://web.stanford.edu/~mendel/).

We are fortunate to get the help from many researchers during the development of Nezha. Below we list and acknowledge them according to the timeline.

[Dr. Shiyu Liu](https://web.stanford.edu/~shiyuliu/) and [Dr. Feiran Wang](https://www.linkedin.com/in/feiran-wang/) joined the discussion during the early design of Nezha. Feiran explained the details of CRaft and the related correctness properties.  Shiyu explained the principles of Huygens and the other clock sync solutions.

[Prof. Dan Ports](https://drkp.net/), [Prof. Jialin Li](https://www.comp.nus.edu.sg/~lijl/) and [Dr. Ellis Michael](https://ellismichael.com/) provided helpful discussion related to Speculative Paxos and NOPaxos. Dan also gave us the pointer to crash vector and diskless recovery. 

[Prof. Jinyang Li](http://www.news.cs.nyu.edu/~jinyang/) listened to our early presentation of Nezha, and gave some useful feedback.

[Prof. Seo Jin Park](https://seojinpark.net/) discussed with us about the definition of linearizability and other correctness properties. Seo Jin also provided some explanation about CURP.

[Prof. Zhaoguo Wang](https://ipads.se.sjtu.edu.cn/pub/members/zhaoguo_wang) shared with us his experience in testing Raft.

The [Derecho team](https://derecho-project.github.io/) (Prof. Ken Birman, Dr. Weijia Song, Dr. Sagar Jha, Dr. Lorenzo Rosa, etc) offered technical support and discussion during our measurement of Derecho.

The [ClockWork](https://www.clockwork.io/) Staff (Dr. Yilong Geng and Dr. Deepak Merugu) offered technical support in deploying Huygens. Dr. Deepak Merugu also gave suggestions on the coding-styles of Nezha codebase. Katie Gioioso provided feedback on Nezha design. Bhagirath Mehta participated in the single-machine test of Nezha.

[Prof. Eugene Wu](http://www.cs.columbia.edu/~ewu/) provided suggestions on the revision of Nezha paper.

[Prof. Aurojit Panda](https://cs.nyu.edu/~apanda/) discussed with us about Nezha's correctness during leader change. Aurojit reviewed our draft and offered some constructive suggestions on the revision.

The [Raft community](https://groups.google.com/u/1/g/raft-dev/c/SmnAvZMufB0) offered much insightful discussion for us. Many community members discussed with us and helped to justify our design decisions about Nezha.




## License
Please refer to [license.md](license.md)

## Future Plan

(1) Conduct more functionality and performance tests to make Nezha more robust and optimized

(2) Replace [the etcd backend for Kubernetes](https://learnk8s.io/etcd-kubernetes) to boost the performance of Kubernetes.



================================================
FILE: WORKSPACE
================================================
load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository", "new_git_repository")

# Protobuf build rules (rules_proto 4.0.0-3.20.0), pinned by sha256. The load
# below must come after the archive declaration because it reads from the
# @rules_proto repository defined here.
http_archive(
    name = "rules_proto",
    sha256 = "e017528fd1c91c5a33f15493e3a398181a9e821a804eb7ff5acdd1d2d6c2b18d",
    strip_prefix = "rules_proto-4.0.0-3.20.0",
    urls = [
        "https://github.com/bazelbuild/rules_proto/archive/refs/tags/4.0.0-3.20.0.tar.gz",
    ],
)
load("@rules_proto//proto:repositories.bzl", "rules_proto_dependencies", "rules_proto_toolchains")
rules_proto_dependencies()
rules_proto_toolchains()


# gRPC v1.42.0 source archive, followed by the two dependency macros that the
# gRPC repository itself provides (grpc_deps, then grpc_extra_deps).
http_archive(
    name = "com_github_grpc_grpc",
    sha256 = "9f387689b7fdf6c003fd90ef55853107f89a2121792146770df5486f0199f400",
    urls = [
        "https://github.com/grpc/grpc/archive/refs/tags/v1.42.0.zip",
    ],
    strip_prefix = "grpc-1.42.0",
)
load("@com_github_grpc_grpc//bazel:grpc_deps.bzl", "grpc_deps")
grpc_deps()
load("@com_github_grpc_grpc//bazel:grpc_extra_deps.bzl", "grpc_extra_deps")
grpc_extra_deps()


# googleapis common protos 1.3.1; built with the overlay BUILD file kept in
# //external.
http_archive(
    name = "googleapi",
    sha256 = "3ff2365822fb573cb1779ada5c2ac7899269cacd0836aef95ffe9d95779031f2",
    url = "https://github.com/googleapis/googleapis/archive/refs/tags/common-protos-1_3_1.zip", 
    strip_prefix = "googleapis-common-protos-1_3_1/",
    build_file="@//external:googleapi.BUILD",
)


# etcd v3.5.0 sources.
# NOTE(review): build_file refers to //external:etcd.BUILD, but the repository
# listing shows only gogoprotobuf.BUILD and googleapi.BUILD under external/.
# Confirm the file exists, or that no target depends on @etcd.
http_archive(
    name = "etcd",
    sha256 = "580ce584dc7628efebb57f8c8240674918d334ad21e33186bbc5f6348f465bc1",
    url = "https://github.com/etcd-io/etcd/archive/refs/tags/v3.5.0.zip", 
    strip_prefix = "etcd-3.5.0/",
    build_file="@//external:etcd.BUILD",
)



# gogo/protobuf v1.3.2; built with the overlay BUILD file kept in //external.
http_archive(
    name = "gogoprotobuf",
    sha256 = "f89f8241af909ce3226562d135c25b28e656ae173337b3e58ede917aa26e1e3c",
    url = "https://github.com/gogo/protobuf/archive/refs/tags/v1.3.2.zip", 
    strip_prefix = "protobuf-1.3.2/",
    build_file="@//external:gogoprotobuf.BUILD",
)

# yaml-cpp, pinned to a commit; uses the BUILD files shipped in that repo.
git_repository(
    name = "com_github_jbeder_yaml_cpp",
    commit = "fcbb8193b94921e058be7b563aea053531e5b2d9",  # 19-Aug-2023
    remote = "https://github.com/jbeder/yaml-cpp.git",
    shallow_since = "1692473776 -0400",
)

# The repositories below are fetched at a pinned commit and built with BUILD
# files maintained in this repository under //third_party.

# cameron314/concurrentqueue.
new_git_repository(
    name = "com_github_cameron314_concurrentqueue",
    build_file = "//third_party/concurrentqueue:BUILD.bazel",
    commit = "6dd38b8a1dbaa7863aa907045f32308a56a6ff5d",
    shallow_since = "1686439287 -0400",
    remote = "https://github.com/cameron314/concurrentqueue.git",
)

# preshing/junction; junction.patch is applied with -p1 after checkout.
new_git_repository(
    name = "com_github_preshing_junction",
    commit = "5ad3be7ce1d3f16b9f7ed6065bbfeacd2d629a08",
    shallow_since = "1518982100 -0500",
    patches = ["//third_party/junction:junction.patch"],
    patch_args = ["-p1"],
    build_file = "//third_party/junction:BUILD.bazel",
    remote = "https://github.com/preshing/junction",
)

# preshing/turf.
new_git_repository(
    name = "com_github_preshing_turf",
    commit = "9ae0d4b984fa95ed5f823274b39c87ee742f6650", 
    shallow_since = "1484317994 -0500" ,
    build_file = "//third_party/turf:BUILD.bazel",
    remote = "https://github.com/preshing/turf",
)

# libev (enki/libev GitHub repository).
new_git_repository(
    name = "com_github_enki_libev",
    commit = "93823e6ca699df195a6c7b8bfa6006ec40ee0003",
    shallow_since = "1463172876 -0700",
    build_file = "//third_party/libev:BUILD.bazel",
    remote = "https://github.com/enki/libev.git",
)

# Google gflags, pinned to a commit; uses the BUILD files shipped in that repo.
git_repository(
    name = "com_github_gflags_gflags",
    commit = "e171aa2d15ed9eb17054558e0b3a6a413bb01067",  # 11-Nov-2018
    remote = "https://github.com/gflags/gflags.git",
    shallow_since = "1541971260 +0000",
)

# Google glog, built with the overlay //third_party/glog:BUILD.glog.
new_git_repository(
    name = "com_github_google_glog",
    build_file = "//third_party/glog:BUILD.glog",
    commit = "ba8a9f6952d04d1403b97df24e6836227751454e",  # 7-May-2019
    remote = "https://github.com/google/glog.git",
    # Shallow since doesn't work here for some weird reason. See
    # https://github.com/bazelbuild/bazel/issues/10292
    # shallow_since = "1557212520 +0000",
)

# Google protobuf, pinned to the v3.20.1 commit.
git_repository(
    name = "com_google_protobuf",
    commit = "21027a27c4c2ec1000859ccbcfff46d83b16e1ed",  # 21-Apr-2022, v3.20.1
    remote = "https://github.com/protocolbuffers/protobuf",
    shallow_since = "1650589240 +0000",
)

# rules_foreign_cc, pinned to a commit archive. Its dependency macro is loaded
# and invoked right after the archive is declared.
http_archive(
    name = "rules_foreign_cc",
    sha256 = "2a8000ce03dd9bb324bc9bb7f1f5d01debac406611f4d9fedd385192718804f0",
    strip_prefix = "rules_foreign_cc-60813d57a0e99be1a009c1a0e9627cdbe81fcd19",
    url = "https://github.com/bazelbuild/rules_foreign_cc/archive/60813d57a0e99be1a009c1a0e9627cdbe81fcd19.tar.gz",
)

load("@rules_foreign_cc//foreign_cc:repositories.bzl", "rules_foreign_cc_dependencies")

rules_foreign_cc_dependencies()

# OpenSSL 1.1.1d sources, built with the overlay //third_party/openssl:BUILD.bazel.
http_archive(
    name = "openssl",
    build_file = "//third_party/openssl:BUILD.bazel",
    sha256 = "23011a5cc78e53d0dc98dfa608c51e72bcd350aa57df74c5d5574ba4ffb62e74",
    strip_prefix = "openssl-OpenSSL_1_1_1d",
    urls = ["https://github.com/openssl/openssl/archive/OpenSSL_1_1_1d.tar.gz"],
)

# nelhage/rules_boost, pinned to a commit archive; boost_deps() declares the
# Boost repositories that rule set needs.
http_archive(
    name = "com_github_nelhage_rules_boost",
    url = "https://github.com/nelhage/rules_boost/archive/96e9b631f104b43a53c21c87b01ac538ad6f3b48.tar.gz",
    strip_prefix = "rules_boost-96e9b631f104b43a53c21c87b01ac538ad6f3b48",
    sha256 = "5ea00abc70cdf396a23fb53201db19ebce2837d28887a08544429d27783309ed",
)
load("@com_github_nelhage_rules_boost//:boost/boost.bzl", "boost_deps")
boost_deps()


================================================
FILE: client/BUILD
================================================
load("@rules_proto//proto:defs.bzl", "proto_library")

# Header-only target for the client configuration (client_config.h); depends
# on yaml-cpp.
cc_library(
    name = "client_config",
    hdrs = ["client_config.h"],
    deps = [
        "@com_github_jbeder_yaml_cpp//:yaml-cpp",
    ],
)

# The Client class itself (client.h / client.cc). It depends on the generated
# Nezha protobuf C++ library, the zipfian and utils libraries from //lib, and
# the client configuration target above.
cc_library(
    name = "client_class",
    srcs = ["client.cc"],
    hdrs = ["client.h"],
    deps = [
        "//proto:nezha_cc_proto",
        "//lib:zipfian",
        "//lib:utils",
        ":client_config",
    ],
)


# Executable built from the launcher client_run.cc, linked against the Client
# class library.
cc_binary(
    name = "nezha_client",
    srcs = ["client_run.cc"],
    deps = [
        ":client_class",
    ],
)


================================================
FILE: client/client.cc
================================================
#include "client/client.h"

namespace nezha {
/**
 * Builds a client from the configuration file at `configFile`.
 *
 * In order, the constructor:
 *  1. parses the configuration (logs the error and exits the process with
 *     status 1 if parsing reports a non-empty error string);
 *  2. creates the request endpoint, the reply message handler and the monitor
 *     timer;
 *  3. builds the two-dimensional proxy address table;
 *  4. pre-generates the Poisson trace (open-loop clients only) and the
 *     zipfian key sequence, both seeded with the client id;
 *  5. copies the workload parameters and resets the request counters.
 *
 * @param configFile path handed to clientConfig_.parseConfig().
 */
Client::Client(const std::string& configFile) {
  hop3s.reserve(500000);
  hop4s.reserve(500000);
  totals.reserve(500000);

  LOG(INFO) << "Loading config information from " << configFile;
  std::string error = clientConfig_.parseConfig(configFile);
  if (error != "") {
    LOG(ERROR) << "Error loading client config: " << error << " Exiting.";
    exit(1);
  }
  clientId_ = clientConfig_.clientId;
  LOG(INFO) << "clientId=" << clientId_;
  std::string clientIP = clientConfig_.clientIp;
  LOG(INFO) << "clientIP=" << clientIP;
  int requestPort = clientConfig_.requestPort;
  LOG(INFO) << "requestPort=" << requestPort;
  LOG(INFO) << "endPointType=" << clientConfig_.endpointType;
  requestEP_ =
      CreateEndpoint(clientConfig_.endpointType, clientIP, requestPort, true);
  /** Every received message is forwarded to ReceiveReply() of this client */
  replyHandler_ = CreateMsgHandler(
      clientConfig_.endpointType,
      [](MessageHeader* msgHdr, char* msgBuffer, Address* sender, void* ctx) {
        ((Client*)ctx)->ReceiveReply(msgHdr, msgBuffer, sender);
      },
      this);

  /** The monitor timer breaks the endpoint's event loop once running_ has
   * been cleared, so that the reply-processing thread can exit */
  monitorTimer_ = new Timer(
      [](void* ctx, void* receiverEP) {
        // LOG(INFO) << "Monitor running " << ((Client*)ctx)->running_;
        if (((Client*)ctx)->running_ == false) {
          ((Endpoint*)receiverEP)->LoopBreak();
        }
      },
      10 /*Checks the status every 10ms*/, this);

  /** Fetch the addresses of all proxies and organize them as a
   * two-dimensional vector: proxyAddrs_[i][j] is shard j of proxy i, which
   * listens on port (proxyRequestPortBase + j) */
  proxyAddrs_.resize(clientConfig_.proxyIps.size());
  for (uint32_t i = 0; i < proxyAddrs_.size(); i++) {
    proxyAddrs_[i].resize(clientConfig_.proxyShardNum);
    for (uint32_t j = 0; j < proxyAddrs_[i].size(); j++) {
      proxyAddrs_[i][j] = new Address(clientConfig_.proxyIps[i],
                                      clientConfig_.proxyRequestPortBase + j);
    }
  }

  /** If the client is an open-loop client, generate the Poisson trace for
   * the client: 1000 samples, each one the number of requests to submit in
   * one time slot */
  if (clientConfig_.isOpenLoop) {
    poissonRate_ = clientConfig_.poissonRate;
    LOG(INFO) << "OpenLoop Client rate=" << poissonRate_;
    poissonTrace_.resize(1000, 0);
    std::default_random_engine generator(clientId_);  // clientId as the seed
    std::poisson_distribution<int> distribution(poissonRate_);
    for (int i = 0; i < 1000; i++) {
      int reqNum = distribution(generator);
      if (reqNum < 0) {
        poissonTrace_[i] = 0;
      } else {
        poissonTrace_[i] = reqNum;
      }
    }
  }
  /** Generate zipfian workload */
  LOG(INFO) << "keyNum=" << clientConfig_.keyNum
            << "\tskewFactor=" << clientConfig_.skewFactor
            << "\twriteRatio=" << clientConfig_.writeRatio;
  zipfianKeys_.resize(1000000, 0);
  retryTimeoutUs_ = clientConfig_.requestRetryTimeUs;
  /** The submission threads decide read-vs-write from writeRatio_, so it has
   * to carry the configured value logged above.
   * NOTE(review): assumes writeRatio_ is not initialized anywhere else (e.g.
   * in client.h); the assignment is harmless if it is. */
  writeRatio_ = clientConfig_.writeRatio;
  /** With at most one key, every entry keeps the default key 0 */
  if (clientConfig_.keyNum > 1) {
    std::default_random_engine generator(clientId_);  // clientId as the seed
    zipfian_int_distribution<uint32_t> zipfianDistribution(
        0, clientConfig_.keyNum - 1, clientConfig_.skewFactor);
    for (uint32_t i = 0; i < zipfianKeys_.size(); i++) {
      zipfianKeys_[i] = zipfianDistribution(generator);
    }
  }

  /** Initialize */
  committedReqId_ = 0;
  reclaimedReqId_ = 0;
  nextReqId_ = 1;
  retryNumber_ = 0;
  committedNum_ = 0;
  fastCommitNum_ = 0;
  fastWriteNum_ = 0;
}

/**
 * Marks the client as running, starts all worker threads and blocks until
 * every thread registered in threadPool_ has been joined.
 */
void Client::Run() {
  running_ = true;
  LaunchThreads();
  for (auto it = threadPool_.begin(); it != threadPool_.end(); ++it) {
    // it->first is the thread's name, it->second the thread object.
    LOG(INFO) << "Join " << it->first;
    it->second->join();
    LOG(INFO) << "Join Complete " << it->first;
  }
  LOG(INFO) << "Run Terminated ";
}

/**
 * Starts the worker threads and records them in threadPool_ by name: the
 * logging thread, the reply-processing thread, and exactly one submission
 * thread chosen by clientConfig_.isOpenLoop.
 */
void Client::LaunchThreads() {
  threadPool_["LogTd"] = new std::thread(&Client::LogTd, this);
  threadPool_["ProcessReplyTd"] =
      new std::thread(&Client::ProcessReplyTd, this);

  if (!clientConfig_.isOpenLoop) {
    LOG(INFO) << "ClosedLoop Client";
    threadPool_["CloseLoopSubmissionTd"] =
        new std::thread(&Client::CloseLoopSubmissionTd, this);
    return;
  }

  LOG(INFO) << "OpenLoop Client";
  threadPool_["OpenLoopSubmissionTd"] =
      new std::thread(&Client::OpenLoopSubmissionTd, this);
}

/**
 * Body of the reply-processing thread. It attaches the reply handler and the
 * monitor timer to the request endpoint and then runs the endpoint's loop, so
 * incoming messages are dispatched to the registered handler. The function
 * returns when LoopRun() returns.
 */
void Client::ProcessReplyTd() {
  auto& endpoint = requestEP_;
  endpoint->RegisterMsgHandler(replyHandler_);
  endpoint->RegisterTimer(monitorTimer_);

  LOG(INFO) << "Loop Run ";
  endpoint->LoopRun();
  LOG(INFO) << "Loop Run Exit ";
}

/**
 * Handles one message received on the request endpoint.
 *
 * Only messages of type COMMIT_REPLY whose payload parses as a Reply are
 * considered. For those, the commit counters are updated, committedReqId_ is
 * raised when the reply carries a larger request id, and, if a send time is
 * still recorded for the request, a LogInfo record is queued on logQu_ and
 * the send-time entry is erased.
 *
 * @param msgHdr    header of the received message (type and length).
 * @param msgBuffer serialized payload, msgHdr->msgLen bytes long.
 * @param sender    address of the sender (not used here).
 */
void Client::ReceiveReply(MessageHeader* msgHdr, char* msgBuffer,
                          Address* sender) {
  if (msgHdr->msgLen < 0) {
    return;
  }

  Reply reply;
  const bool isCommitReply =
      msgHdr->msgType == MessageType::COMMIT_REPLY &&
      reply.ParseFromArray(msgBuffer, msgHdr->msgLen);
  if (!isCommitReply) {
    return;
  }

  // Statistics: every parsed commit reply counts, fast replies (and fast
  // replies to writes) are counted separately.
  committedNum_++;
  if (reply.replytype() == MessageType::FAST_REPLY) {
    fastCommitNum_++;
    if (reply.iswrite()) {
      fastWriteNum_++;
    }
  }

  const auto replyReqId = reply.reqid();
  if (committedReqId_ < replyReqId) {
    committedReqId_ = replyReqId;
  }

  // A zero send time means no entry is recorded for this request any more,
  // so there is nothing left to log or erase.
  const uint64_t sendTime = outstandingRequestSendTime_.get(replyReqId);
  if (sendTime == 0) {
    return;
  }

  // The request is still outstanding: mark it as committed by erasing its
  // send-time entry, and hand the timing record to LogTd through logQu_.
  const uint64_t recvTime = GetMicrosecondTimestamp();
  LogInfo* log = new LogInfo();
  lastCommittedReqId_ = replyReqId;
  *log = {replyReqId, sendTime, recvTime, reply.replytype()};
  outstandingRequestSendTime_.erase(replyReqId);
  logQu_.enqueue(log);
}

void Client::OpenLoopSubmissionTd() {
  int roundRobinIdx = 0;
  uint64_t startTime = GetMicrosecondTimestamp();
  uint64_t endTime = startTime + clientConfig_.durationSec * 1000000;

  srandom(clientId_);
  endTime += 10 * 1000ul * 1000ul;
  LOG(INFO) << "Expected to end at " << endTime;
  // Poisson rate is ``10ms as one unit''
  for (uint32_t i = 0; i < clientConfig_.durationSec * 100; i++) {
    if (!running_) {
      return;
    }
    if (GetMicrosecondTimestamp() >= endTime) {
      // Client has executed long enough, should terminate
      LOG(INFO) << "Terminating soon...";
      running_ = false;
      return;
    }
    uint32_t reqNum = poissonTrace_[i % poissonTrace_.size()];
    if (reqNum <= 0) {
      usleep(10000);
      continue;
    }
    uint32_t intval = 10000 / reqNum;
    uint64_t startTime = GetMicrosecondTimestamp();
    for (uint32_t j = 0; j < reqNum; j++) {
      while (GetMicrosecondTimestamp() < startTime + j * intval) {
      }
      // Send the request
      uint32_t mapIdx =
          roundRobinIdx % (proxyAddrs_.size() * proxyAddrs_[0].size());
      Request* request = NULL;
      if (retryQu_.try_dequeue(request)) {
        // Retry this request
        Address* roundRobinAddr = proxyAddrs_[mapIdx % proxyAddrs_.size()]
                                             [mapIdx / proxyAddrs_.size()];
        // LOG(INFO) << "Resend " << request->reqid() << "to "
        //           << mapIdx % proxyAddrs_.size() << "\t"
        //           << mapIdx / proxyAddrs_.size();
        requestEP_->SendMsgTo(*roundRobinAddr, *request,
                              MessageType::CLIENT_REQUEST);
        outstandingRequestSendTime_.assign(request->reqid(),
                                           GetMicrosecondTimestamp());
        roundRobinIdx++;
      } else {
        // submit new requests
        request = new Request();
        request->set_clientid(clientId_);
        request->set_reqid(nextReqId_);
        if (random() % 100 < 100 * writeRatio_) {
          request->set_iswrite(true);
        } else {
          request->set_iswrite(false);
        }

        request->set_key(zipfianKeys_[nextReqId_ % zipfianKeys_.size()]);
        // // if (nextReqId_ % 10 == 1 && clientId_ <= 10) {
        // if (clientId_ <= 12) {
        //   if (nextReqId_ % 2 == 1)
        //     request->set_iswrite(true);
        //   else
        //     request->set_iswrite(false);

        //   // request->set_iswrite(true);
        //   // LOG(INFO) << "One Write " << request->key()
        //   //           << " reqId=" << request->reqid();
        // } else {
        //   exit(0);
        // }

        // request->set_key(nextReqId_ % 100000 + 100000 * (clientId_ - 1));
        Address* roundRobinAddr = proxyAddrs_[mapIdx % proxyAddrs_.size()]
                                             [mapIdx / proxyAddrs_.size()];
        // LOG(INFO) << "Sed " << request->reqid() << "to "
        //           << mapIdx % proxyAddrs_.size() << "\t"
        //           << mapIdx / proxyAddrs_.size();
        requestEP_->SendMsgTo(*roundRobinAddr, *request,
                              MessageType::CLIENT_REQUEST);
        outstandingRequests_.assign(request->reqid(), request);
        outstandingRequestSendTime_.assign(request->reqid(),
                                           GetMicrosecondTimestamp());
        nextReqId_++;
        roundRobinIdx++;
      }
    }
  }

  LOG(INFO) << "Terminating soon... after "
            << (endTime - GetMicrosecondTimestamp()) * 1e-6 << " seconds";
  while (GetMicrosecondTimestamp() < endTime) {
    // Client has executed long enough, should terminate
    usleep(1000);
  }
  running_ = false;
}

/**
 * Closed-loop submission thread.
 *
 * A new request is only created when nextReqId_ == committedReqId_ + 1, i.e.
 * when every previously submitted request has been marked as committed.
 * Otherwise the thread polls retryQu_ and re-sends whatever request it finds
 * there. Each send goes to the next proxy shard in round-robin order.
 *
 * The thread stops (and clears running_) once the configured duration plus a
 * 10-second grace period has elapsed, or earlier if running_ is cleared by
 * someone else; in the latter case it still waits until the deadline before
 * returning.
 */
void Client::CloseLoopSubmissionTd() {
  int roundRobinIdx = 0;
  uint64_t startTime = GetMicrosecondTimestamp();
  // Widen before multiplying: durationSec is a uint32_t, so the product would
  // otherwise be computed in 32 bits and wrap for durations above ~4294 sec.
  uint64_t endTime =
      startTime + static_cast<uint64_t>(clientConfig_.durationSec) * 1000000ul;
  // Grace period of 10 seconds on top of the configured duration
  endTime += 10 * 1000ul * 1000ul;
  LOG(INFO) << "Expected to end at " << endTime;
  // The read/write decision below draws from random(); its seeding function
  // is srandom() (srand() is the seeding function of rand()).
  srandom(clientId_);
  while (running_) {
    if (GetMicrosecondTimestamp() >= endTime) {
      // Client has executed long enough, should terminate
      LOG(INFO) << "Terminating soon...";
      running_ = false;
      return;
    }
    Request* request = NULL;
    // Flatten (proxy, shard) into one index; it is unfolded again below as
    // proxyAddrs_[mapIdx % #proxies][mapIdx / #proxies]
    uint32_t mapIdx =
        roundRobinIdx % (proxyAddrs_.size() * proxyAddrs_[0].size());
    if (nextReqId_ == committedReqId_ + 1) {
      // submit new request
      request = new Request();
      request->set_clientid(clientId_);
      request->set_reqid(nextReqId_);
      if (random() % 100 < 100 * writeRatio_) {
        request->set_iswrite(true);
      } else {
        request->set_iswrite(false);
      }
      request->set_key(zipfianKeys_[nextReqId_ % zipfianKeys_.size()]);
      Address* roundRobinAddr =
          proxyAddrs_[mapIdx % proxyAddrs_.size()][mapIdx / proxyAddrs_.size()];
      requestEP_->SendMsgTo(*roundRobinAddr, *request,
                            MessageType::CLIENT_REQUEST);
      // Remember the request and its send time until it is committed
      outstandingRequests_.assign(request->reqid(), request);
      outstandingRequestSendTime_.assign(request->reqid(),
                                         GetMicrosecondTimestamp());
      nextReqId_++;
      roundRobinIdx++;
    } else if (retryQu_.try_dequeue(request)) {
      // have some requests to retry
      Address* roundRobinAddr =
          proxyAddrs_[mapIdx % proxyAddrs_.size()][mapIdx / proxyAddrs_.size()];
      requestEP_->SendMsgTo(*roundRobinAddr, *request,
                            MessageType::CLIENT_REQUEST);
      // The request object is already tracked; only refresh its send time
      outstandingRequestSendTime_.assign(request->reqid(),
                                         GetMicrosecondTimestamp());
      roundRobinIdx++;
    }
  }
  // running_ was cleared by someone else. Clamp the remaining time at zero so
  // that the unsigned subtraction cannot wrap if the deadline already passed.
  uint64_t now = GetMicrosecondTimestamp();
  uint64_t remainingUs = (now < endTime) ? (endTime - now) : 0;
  LOG(INFO) << "Terminating soon... after " << remainingUs * 1e-6
            << " seconds";
  while (GetMicrosecondTimestamp() < endTime) {
    // Client has executed long enough, should terminate
    usleep(1000);
  }
  running_ = false;
}

void Client::LogTd() {
  LogInfo* log = NULL;
  uint64_t startTime, endTime;
  uint32_t lastSubmitteddReqId = 0;
  uint32_t lastCountCommitedReq = 0;
  uint32_t latencySample = 0;

  std::ofstream ofs("Client-Stats-" + std::to_string(clientId_));
  ofs << "ReqId,SendTime,CommitTime,CommitType" << std::endl;

  startTime = GetMicrosecondTimestamp();
  while (running_) {
    endTime = GetMicrosecondTimestamp();
    if (endTime - startTime >= 5000000) {
      float duration = (endTime - startTime) * 1e-6;
      uint32_t submittedReqNum = nextReqId_ - 1 - lastSubmitteddReqId;
      uint32_t committedReqNum = committedNum_ - lastCountCommitedReq;
      float submissionRate = submittedReqNum / duration;
      float commitRate = committedReqNum / duration;
      lastSubmitteddReqId = nextReqId_ - 1;
      lastCountCommitedReq = committedNum_;
      startTime = endTime;
      LOG(INFO) << "endTime=" << endTime << "\t"
                << "committedNum_ = " << committedNum_ << "\t"
                << "logQuLen =" << logQu_.size_approx() << "\t"
                << "committedReqId_=" << committedReqId_ << "\t"
                << "nextReqId_=" << nextReqId_ << "\t"
                << "lastCommittedReqId_=" << lastCommittedReqId_ << "\t"
                << "submissionRate=" << submissionRate << " req/sec\t"
                << "commitRate=" << commitRate << " req/sec"
                << "\t"
                << "FastCommitRatio=" << fastCommitNum_ * 100.0 / committedNum_
                << "\t"
                << "latency(Sample)=" << latencySample << " us"
                << "\t"
                << "retryNum=" << retryNumber_;

      ofs.flush();
    }
    if (logQu_.try_dequeue(log)) {
      // LOG(INFO) << "committedReqId_=" << committedReqId_ << "\t" << "reqId="
      // << log->reqId;
      while (committedReqId_ + 1 <= log->reqId) {
        if (outstandingRequestSendTime_.get(committedReqId_ + 1) == 0) {
          // this reqId has also been committed (i.e. cannot find its footprint)
          // advance committedReqId;
          committedReqId_++;
        } else {
          break;
        }
      }

      latencySample = log->commitTime - log->sendTime;

      // log stats
      ofs << log->toString() << std::endl;
      delete log;
    }

    // // Check whether any requests need retry
    // for (uint32_t reqId = committedReqId_ + 1; reqId < nextReqId_; reqId++) {
    //   uint64_t sendTime = outstandingRequestSendTime_.get(reqId);
    //   if (sendTime > 0) {
    //     // Find it
    //     if (GetMicrosecondTimestamp() - sendTime > retryTimeoutus_) {
    //       // timeout, should retry
    //       Request* request = outstandingRequests_.get(reqId);
    //       LOG(INFO) << "Timeout Retry " << request->reqid();
    //       outstandingRequestSendTime_.erase(reqId);
    //       retryQu_.enqueue(request);
    //       retryNumber_++;
    //     }
    //   }
    // }

    while (reclaimedReqId_ + 1000 < committedReqId_) {
      // do not reclaim request too aggressive
      // If we reclaim too aggressive, there can be some edge case of dangling
      // request pointer
      Request* request = outstandingRequests_.get(reclaimedReqId_);
      if (request) {
        outstandingRequests_.erase(request->reqid());
        delete request;
      }
      reclaimedReqId_++;
    }
  }
  LOG(INFO) << "The runtime have been terminated, we still need to dump "
            << logQu_.size_approx() << " Logs before exit";

  uint32_t cnt = 0;
  while (logQu_.try_dequeue(log)) {
    // log stats
    ofs << log->toString() << std::endl;
    delete log;
    cnt++;
    if (cnt % 10000 == 0) {
      LOG(INFO) << "Remaining Log Number " << logQu_.size_approx();
      ofs.flush();
    }
  }
  ofs.flush();
  LOG(INFO) << "Dump Finished";
}

/** Request shutdown: log the event and clear the flag polled by the worker
 * threads' loops */
void Client::Terminate() {
  LOG(INFO) << "Terminating...";
  running_.store(false);
}

/** Release the thread objects stored in threadPool_ and every Request that is
 * still tracked in outstandingRequests_ (ids reclaimedReqId_ .. nextReqId_) */
Client::~Client() {
  for (auto it = threadPool_.begin(); it != threadPool_.end(); ++it) {
    delete it->second;
  }
  for (; reclaimedReqId_ <= nextReqId_; reclaimedReqId_++) {
    Request* pending = outstandingRequests_.get(reclaimedReqId_);
    if (pending == NULL) {
      // Nothing stored under this id
      continue;
    }
    outstandingRequests_.erase(pending->reqid());
    delete pending;
  }
}
}  // namespace nezha

================================================
FILE: client/client.h
================================================
#include <yaml-cpp/yaml.h>
#include <fstream>
#include <iostream>
#include "client_config.h"
#include "lib/utils.h"
#include "lib/zipfian.h"
#include "proto/nezha_proto.pb.h"

namespace nezha {
using namespace nezha::proto;
/** LogInfo is used to dump some performance stats, which can be extended to
 * include more metrics.
 *
 * toString() renders the record as one CSV row in the order
 * reqId,sendTime,commitTime,commitType (no trailing newline). It does not
 * modify the record, so it is declared const and can be called on const
 * objects as well. */
struct LogInfo {
  uint32_t reqId;
  uint64_t sendTime;
  uint64_t commitTime;
  uint32_t commitType;
  std::string toString() const {
    std::string ret =
        (std::to_string(reqId) + "," + std::to_string(sendTime) + "," +
         std::to_string(commitTime) + "," + std::to_string(commitType));
    return ret;
  }
};

/**
 * Refer to client_run.cc, the runnable program only needs to instantiate a
 * client object with a configuration file. Then it calls Run() method to run
 * and calls Terminate() method to stop
 */
class Client {
 private:
  /** All the configuration parameters for client are included in
   * clientConfig_*/
  ClientConfig clientConfig_;
  /** Each thread is given a unique name (key) and stored in the pool */
  std::map<std::string, std::thread*> threadPool_;
  /** The endpoint used to submit requests to proxies */
  Endpoint* requestEP_;

  /** The message handler used to handle replies (from proxies) */
  struct MessageHandler* replyHandler_;
  /** The timer periodically monitors the status of the client, and breaks the
   * blocking endpoint when the client is about to terminate */
  struct Timer* monitorTimer_;

  /** Flag to Run/Terminate threads */
  std::atomic<bool> running_;

  /** Each client is assigned a unique id */
  int clientId_;

  /** Open-Loop submission related: the client's submission rate follows a
   * poisson distribution. We use 10ms as the basic interval and generate random
   * numbers with reference to poissonRate_, stored in poissonTrace_. Then the
   * open-loop clients submit poissonTrace_[i] requests in the ith interval.
   *
   * Regarding the definition of open-loop and closed-loop submission, refer to
   * ``evaluation method`` para of Sec 7.1 in our paper
   * */
  int poissonRate_;

  /** The next requestId to be submitted */
  std::atomic<uint32_t> nextReqId_;

  /** Requests whose requestId is less than or equal to committedReqId_ have
   * been committed */
  std::atomic<uint32_t> committedReqId_;

  /** Requests whose requestId is less than or equal to reclaimedReqId_ have
   * been reclaimed (memory freed) */
  std::atomic<uint32_t> reclaimedReqId_;
  /** Number of requests to submit in each 10ms interval (see poissonRate_) */
  std::vector<uint32_t> poissonTrace_;

  /** To communicate between OpenLoopSubmissionTd/CloseLoopSubmissionTd and
   * LogTd. The LogTd monitors the outstanding requests (i.e. which have been
   * submitted but have not been committed). If some request has not been
   * committed after a certain time, the LogTd will enqueue the request to
   * retryQu, so that the OpenLoopSubmissionTd/CloseLoopSubmissionTd will
   * retry them.
   * NOTE(review): the timeout scan that enqueues into retryQu_ is currently
   * commented out in LogTd (client.cc) */
  ConcurrentQueue<Request*> retryQu_;

  /** The addresses of proxies. Since we can have multiple proxies, and each
   * proxy can have multiple shards, we use a two-dimensional vector to store
   * the addresses, i.e., proxyAddrs[i][j] indicates the address of the jth
   * shard of the ith proxy */
  std::vector<std::vector<Address*>> proxyAddrs_;

  /** To test commutativity, we generate different zipfian workloads and write
   * ratios, i.e., we generate random numbers following the zipfian
   * distribution. These random numbers are stored in zipfianKeys_ and serve as
   * the keys that will be written/read by requests */
  std::vector<uint32_t> zipfianKeys_;

  /** Fraction of requests issued as writes: the submission threads mark a
   * request as a write when random() % 100 < 100 * writeRatio_ */
  float writeRatio_;

  /** Those requests which have been submitted but not yet committed (key is the
   * requestId)*/
  ConcurrentMap<uint32_t, Request*> outstandingRequests_;

  /** Record the send time of the requests, together with retryTimeoutUs_, to
   * decide whether the request needs to be retried*/
  ConcurrentMap<uint32_t, uint64_t> outstandingRequestSendTime_;

  /** Used by LogTd to monitor outstanding requests. If they cannot be committed
   * within retryTimeoutUs_ (measured in microseconds), they should be retried
   * **/
  uint32_t retryTimeoutUs_;

  /** To communicate between ProcessReplyTd and LogTd */
  ConcurrentQueue<LogInfo*> logQu_;

  /** Performance counters, to show how many requests are retried/committed */
  uint32_t retryNumber_;
  uint32_t committedNum_;
  uint32_t fastCommitNum_;
  uint32_t fastWriteNum_;

  /** Stats
   * NOTE(review): the meaning of hop3s/hop4s/totals is not visible from this
   * header -- confirm against client.cc */
  std::vector<uint32_t> hop3s;
  std::vector<uint32_t> hop4s;
  std::vector<uint32_t> totals;

  /** Launch all the threads, only called once during the lifetime of the
   * client*/
  void LaunchThreads();

  /** Functions whose names end with ``Td`` will be used to instantiate
   * threads.
   *
   * For the client, there are mainly three worker threads running:
   *
   * (1) OpenLoopSubmissionTd/CloseLoopSubmissionTd submits requests. A client
   * can be either open-loop client or closed-loop client, but cannot be both.
   *
   * (2) ProcessReplyTd receives and processes the reply messages, and hands
   * the log information to LogTd
   *
   * (3) LogTd dumps logs and also monitors the outstanding requests. If the
   * requests have not been committed after a certain time (retryTimeoutUs_),
   * then LogTd will ask OpenLoopSubmissionTd/CloseLoopSubmissionTd to resubmit
   * this request to proxies
   * */
  void ProcessReplyTd();
  void OpenLoopSubmissionTd();
  void CloseLoopSubmissionTd();
  void LogTd();

  /** The message handler to handle messages from proxies. The function is used
   * to instantiate a replyHandler_ and registered to requestEP_ */
  void ReceiveReply(MessageHeader* msgHdr, char* msgBuffer, Address* sender);

 public:
  /** Client accepts a config file, which contains all the necessary information
   * to instantiate the object, then it can call Run method
   *  */
  Client(const std::string& configFile = "../configs/nezha-client-config.yaml");
  void Run();
  /** Clears running_ so that the worker threads leave their loops */
  void Terminate();
  ~Client();

  /** For debug */
  uint64_t lastCommittedReqId_;
  std::vector<uint32_t> ls;
};

}  // namespace nezha

================================================
FILE: client/client_config.h
================================================
#include <glog/logging.h>
#include <stdint.h>
#include <yaml-cpp/yaml.h>
#include <string>
#include <vector>

struct ClientConfig {
  int clientId;
  std::string clientIp;
  int endpointType;
  int requestPort;
  uint32_t proxyMaxOwd;
  int proxyReplyPortBase;
  bool isOpenLoop;
  int poissonRate;
  uint32_t durationSec;
  int keyNum;
  double skewFactor;
  double writeRatio;
  int requestRetryTimeUs;

  int proxyRequestPortBase;
  std::vector<std::string> proxyIps;
  int proxyShardNum;

  // Parses yaml file configFilename and fills in fields of ProxyConfig
  // accordingly. Returns an error message or "" if there are no errors.
  std::string parseConfig(std::string configFilename) {
    YAML::Node config;
    try {
      config = YAML::LoadFile(configFilename);
    } catch (const YAML::BadFile& e) {
      return "Error loading config file:" + e.msg + ".";
    }
    LOG(INFO) << "Using config:\n " << config;

    std::string key;  // Keep track of current key for better error messages
    try {
      key = "client-id";
      clientId = config[key].as<int>();
      key = "client-ip";
      clientIp = config[key].as<std::string>();
      key = "endpoint-type";
      endpointType = config[key].as<int>();
      key = "request-port";
      requestPort = config[key].as<int>();
      key = "is-openloop";
      isOpenLoop = config[key].as<bool>();
      key = "poisson-rate";
      poissonRate = config[key].as<int>();
      key = "duration-sec";
      durationSec = config[key].as<uint32_t>();
      key = "key-num";
      keyNum = config[key].as<int>();
      key = "skew-factor";
      skewFactor = config[key].as<double>();
      key = "write-ratio";
      writeRatio = config[key].as<double>();
      key = "request-retry-time-us";
      requestRetryTimeUs = config[key].as<int>();

      key = "proxy-ips";
      for (uint32_t i = 0; i < config[key].size(); i++) {
        proxyIps.push_back(config[key][i].as<std::string>());
      }
      key = "proxy-shards";
      proxyShardNum = config[key].as<int>();
      key = "proxy-request-port-base";
      proxyRequestPortBase = config[key].as<int>();

      return "";
    } catch (const YAML::BadConversion& e) {
      if (config[key]) {
        return "Error parsing config field " + key + ": " + e.msg + ".";
      } else {
        return "Error parsing config field " + key + ": key not found.";
      }
    } catch (const std::exception& e) {
      return "Error parsing config field " + key + ": " + e.what() + ".";
    }
  }
};

================================================
FILE: client/client_run.cc
================================================
#include "client/client.h"
// --config: path of the YAML file handed to the nezha::Client constructor in
// main()
DEFINE_string(config, "nezhav2/config/nezha-client-config-0.yaml", "The config file for the client");
// Kept global so that the SIGINT handler Terminate() can reach the client
// created in main()
nezha::Client* client = NULL;
// SIGINT handler: asks the running client to stop.
// para is the signal number passed by the runtime; it is not used.
void Terminate(int para) {
    (void)para;
    // The handler is installed before main() constructs the client, so an
    // early SIGINT can arrive while the pointer is still NULL
    if (client != NULL) {
        client->Terminate();
    }
}
// Entry point: parse flags, set up logging to stderr, install the SIGINT
// handler, then build the client from --config, run it, and free it.
int main(int argc, char* argv[]) {
    gflags::ParseCommandLineFlags(&argc, &argv, true);
    google::InitGoogleLogging(argv[0]);
    FLAGS_logtostderr = true;
    signal(SIGINT, Terminate);

    client = new nezha::Client(FLAGS_config);
    client->Run();
    delete client;
    return 0;
}

================================================
FILE: configs/dist/nezha-client-config.yaml
================================================
---
print-config: true
proxy-info:
  proxy-ips:
    - "10.128.2.13"
  proxy-shards: 1
  request-port-base: 32000
client-info:
  client-id: 1
  client-ip: "10.128.2.14"
  request-port: 32912
  is-openloop: true
  poisson-rate: 10 # it means the client sends x reqs/10ms on average
  duration-sec: 60 # it means the duration of the client runs (second)
  key-num: 1000000 # when key-num is 1, it means there is no commutativity optimization
  skew-factor: 0.5 # 0-0.99 The higher the zipfian factor is, the more skewed the workload is
  request-retry-time-us: 10000 # After the request is submitted, if we cannot get the response after such long time, then we will retry


================================================
FILE: configs/dist/nezha-proxy-config.yaml
================================================
---
print-config: true
# Replica Info
replica-info:
  replica-ips:
    - "10.128.2.10"
    - "10.128.2.11"
    - "10.128.2.12"
  receiver-shards: 1 # The number of threads used to receive requests
  receiver-port: 33333 # The port is exposed to client/proxy and is used to receive client/proxy requests. When receiver-shards>1, the corresponding ports are receiver-ports + shard-index
  initial-owd: 80 # The initial one-way delay (us) between replicas and proxies
# Proxy Info
proxy-info:
  proxy-id: 1
  proxy-ip: "10.128.2.13"
  shard-num: 1
  request-port-base: 32000
  reply-port-base: 33000


================================================
FILE: configs/dist/nezha-replica-config-0.yaml
================================================
---
print-config: true
replica-ips:
  - "10.128.2.10"
  - "10.128.2.11"
  - "10.128.2.12"
replica-id: 0
receiver-shards: 1 # The number of threads used to receive requests
process-shards: 1 # The number of threads to process requests. For now process-shards is fixed to 1, because the early-buffer enque/deque is hard to parallelize. Maybe later we can find a high-performant **concurrent priority queue** for early-buffer, then process-shards may be parallelized for higher performance
reply-shards: 1 # The number of threads to send replies (both fast/slow replies)
index-sync-shards: 1 # The number of threads used by the leader to broadcast index synchronization messages to followers. For followers, they only need one such thread to receive and handle the index sync msgs
receiver-port: 33333 # The port is exposed to client/proxy and is used to receive client/proxy requests. When receiver-shards>1, the corresponding ports are receiver-ports + shard-index
index-sync-port: 35333 # The port is used for replicas to communicate index synchronization messages. Typically, the leader sends index sync msgs and the followers receive them
request-ask-port: 36333 # When one follower is missing requests, they ask from the leader or other followers. The leader/followers send the missed requests to this port of this follower
index-ask-port: 37333 # When one follower is missing some index sync msgs, it asks the leader (or other followers) for the missing ones, and the leader (or other followers) send to the index-sync-port of the problematic follower
master-port: 34333 # This port is mainly used to send/receive other messages
monitor-period-ms: 50 # Each blocking thread has a monitor timer, which periodically checks the replica status and unblock the thread (and exit) when replica status is not Normal
main-loop-period-ms: 20
heartbeat-threshold-ms: 500
index-ask-period-ms: 10 # When missing indices, we launch a timer to periodically ask indices
request-ask-period-ms: 10 # When missing logs, we launch a timer to periodically ask logs(requests)
view-change-period-ms: 10
state-transfer-period-ms: 10
state-transfer-timeout-ms: 100000 # If the state-transfer has not been completed after such long time, then rollback to viewchange
index-transfer-max-batch: 30
request-transfer-max-batch: 5
crash-vector-request-period-ms: 50 # make it longer, because it will clear all previous replies
recovery-request-period-ms: 10
sync-report-period-ms: 10
key-num: 1000000 # when commutativity is enabled for test, we need the specified key number to detect commutativity
owd-estimation-window: 1000 # we use a sliding window to get moving median of one-way delays, here is the length of the window


================================================
FILE: configs/dist/nezha-replica-config-1.yaml
================================================
---
print-config: true
replica-ips:
  - "10.128.2.10"
  - "10.128.2.11"
  - "10.128.2.12"
replica-id: 1
receiver-shards: 1 # The number of threads used to receive requests
process-shards: 1 # The number of threads to process requests. For now process-shards is fixed to 1, because the early-buffer enque/deque is hard to parallelize. Maybe later we can find a high-performant **concurrent priority queue** for early-buffer, then process-shards may be parallelized for higher performance
reply-shards: 1 # The number of threads to send replies (both fast/slow replies)
index-sync-shards: 1 # The number of threads used by the leader to broadcast index synchronization messages to followers. For followers, they only need one such thread to receive and handle the index sync msgs
receiver-port: 33333 # The port is exposed to client/proxy and is used to receive client/proxy requests. When receiver-shards>1, the corresponding ports are receiver-ports + shard-index
index-sync-port: 35333 # The port is used for replicas to communicate index synchronization messages. Typically, the leader sends index sync msgs and the followers receive them
request-ask-port: 36333 # When one follower is missing requests, they ask from the leader or other followers. The leader/followers send the missed requests to this port of this follower
index-ask-port: 37333 # When one follower is missing some index sync msgs, it asks the leader (or other followers) for the missing ones, and the leader (or other followers) send to the index-sync-port of the problematic follower
master-port: 34333 # This port is mainly used to send/receive other messages
monitor-period-ms: 50 # Each blocking thread has a monitor timer, which periodically checks the replica status and unblock the thread (and exit) when replica status is not Normal
main-loop-period-ms: 20
heartbeat-threshold-ms: 500
index-ask-period-ms: 10 # When missing indices, we launch a timer to periodically ask indices
request-ask-period-ms: 10 # When missing logs, we launch a timer to periodically ask logs(requests)
view-change-period-ms: 10
state-transfer-period-ms: 10
state-transfer-timeout-ms: 100000 # If the state-transfer has not been completed after such long time, then rollback to viewchange
index-transfer-max-batch: 30
request-transfer-max-batch: 5
crash-vector-request-period-ms: 50 # make it longer, because it will clear all previous replies
recovery-request-period-ms: 10
sync-report-period-ms: 10
key-num: 1000000 # when commutativity is enabled for test, we need the specified key number to detect commutativity
owd-estimation-window: 1000 # we use a sliding window to get moving median of one-way delays, here is the length of the window


================================================
FILE: configs/dist/nezha-replica-config-2.yaml
================================================
---
print-config: true
replica-ips:
  - "10.128.2.10"
  - "10.128.2.11"
  - "10.128.2.12"
replica-id: 2
receiver-shards: 1 # The number of threads used to receive requests
process-shards: 1 # The number of threads to process requests. For now process-shards is fixed to 1, because the early-buffer enque/deque is hard to parallelize. Maybe later we can find a high-performant **concurrent priority queue** for early-buffer, then process-shards may be parallelized for higher performance
reply-shards: 1 # The number of threads to send replies (both fast/slow replies)
index-sync-shards: 1 # The number of threads used by the leader to broadcast index synchronization messages to followers. For followers, they only need one such thread to receive and handle the index sync msgs
receiver-port: 33333 # The port is exposed to client/proxy and is used to receive client/proxy requests. When receiver-shards>1, the corresponding ports are receiver-ports + shard-index
index-sync-port: 35333 # The port is used for replicas to communicate index synchronization messages. Typically, the leader sends index sync msgs and the followers receive them
request-ask-port: 36333 # When one follower is missing requests, they ask from the leader or other followers. The leader/followers send the missed requests to this port of this follower
index-ask-port: 37333 # When one follower is missing some index sync msgs, it asks the leader (or other followers) for the missing ones, and the leader (or other followers) send to the index-sync-port of the problematic follower
master-port: 34333 # This port is mainly used to send/receive other messages
monitor-period-ms: 50 # Each blocking thread has a monitor timer, which periodically checks the replica status and unblock the thread (and exit) when replica status is not Normal
main-loop-period-ms: 20
heartbeat-threshold-ms: 500
index-ask-period-ms: 10 # When missing indices, we launch a timer to periodically ask indices
request-ask-period-ms: 10 # When missing logs, we launch a timer to periodically ask logs(requests)
view-change-period-ms: 10
state-transfer-period-ms: 10
state-transfer-timeout-ms: 100000 # If the state-transfer has not been completed after such long time, then rollback to viewchange
index-transfer-max-batch: 30
request-transfer-max-batch: 5
crash-vector-request-period-ms: 50 # make it longer, because it will clear all previous replies
recovery-request-period-ms: 10
sync-report-period-ms: 10
key-num: 1000000 # when commutativity is enabled for test, we need the specified key number to detect commutativity
owd-estimation-window: 1000 # we use a sliding window to get moving median of one-way delays, here is the length of the window


================================================
FILE: configs/dist/nezha-replica-config.yaml
================================================
---
print-config: true
replica-ips:
  - "10.128.2.10"
  - "10.128.2.11"
  - "10.128.2.12"
replica-id: 0
receiver-shards: 1 # The number of threads used to receive requests
process-shards: 1 # The number of threads to process requests. For now process-shards is fixed to 1, because the early-buffer enque/deque is hard to parallelize. Maybe later we can find a high-performant **concurrent priority queue** for early-buffer, then process-shards may be parallelized for higher performance
reply-shards: 1 # The number of threads to send replies (both fast/slow replies)
index-sync-shards: 1 # The number of threads used by the leader to broadcast index synchronization messages to followers. For followers, they only need one such thread to receive and handle the index sync msgs
receiver-port: 33333 # The port is exposed to client/proxy and is used to receive client/proxy requests. When receiver-shards>1, the corresponding ports are receiver-ports + shard-index
index-sync-port: 35333 # The port is used for replicas to communicate index synchronization messages. Typically, the leader sends index sync msgs and the followers receive them
request-ask-port: 36333 # When one follower is missing requests, they ask from the leader or other followers. The leader/followers send the missed requests to this port of this follower
index-ask-port: 37333 # When one follower is missing some index sync msgs, it asks the leader (or other followers) for the missing ones, and the leader (or other followers) send to the index-sync-port of the problematic follower
master-port: 34333 # This port is mainly used to send/receive other messages
monitor-period-ms: 50 # Each blocking thread has a monitor timer, which periodically checks the replica status and unblock the thread (and exit) when replica status is not Normal
main-loop-period-ms: 20
heartbeat-threshold-ms: 500
index-ask-period-ms: 10 # When missing indices, we launch a timer to periodically ask indices
request-ask-period-ms: 10 # When missing logs, we launch a timer to periodically ask logs(requests)
view-change-period-ms: 10
state-transfer-period-ms: 10
state-transfer-timeout-ms: 100000 # If the state-transfer has not been completed after such long time, then rollback to viewchange
index-transfer-max-batch: 30
request-transfer-max-batch: 5
crash-vector-request-period-ms: 50 # make it longer, because it will clear all previous replies
recovery-request-period-ms: 10
sync-report-period-ms: 10
key-num: 1000000 # when commutativity is enabled for test, we need the specified key number to detect commutativity
owd-estimation-window: 1000 # we use a sliding window to get moving median of one-way delays, here is the length of the window


================================================
FILE: configs/local/nezha-client-config.yaml
================================================
---
client-id: 1
client-ip: "127.0.0.5"
endpoint-type: 1 # 1 for UDP Endpoint, 2 for GRPC Endpoint [to be supported soon]
request-port: 32912
is-openloop: true
poisson-rate: 1 # it means the client sends x reqs/10ms on average (should be larger than 10, otherwise, the submission rate is not accurate)
duration-sec: 60 # it means the duration of the client runs (second)
key-num: 1000000 # when key-num is 1, it means there is no commutativity optimization
skew-factor: 0.5 # 0-0.99 The higher the zipfian factor is, the more skewed the workload is
write-ratio: 0.5 # 0-1, the ratio of write requests
request-retry-time-us: 100000 # After the request is submitted, if we cannot get the response after such long time, then we will retry

# proxy info
proxy-ips:
  - "127.0.0.4"
proxy-shards: 1
proxy-request-port-base: 32000


================================================
FILE: configs/local/nezha-proxy-config.yaml
================================================
---
# Proxy Info
proxy-endpoint-type: 1 # 1 for UDP Endpoint, 2 for GRPC Endpoint [to be supported soon]
proxy-id: 1
proxy-ip: "127.0.0.4"
proxy-shard-num: 1
proxy-max-owd: 200
proxy-request-port-base: 32000
proxy-reply-port-base: 33000

# Replica Info
replica-ips:
  - "127.0.0.1"
  - "127.0.0.2"
  - "127.0.0.3"
replica-receiver-shards: 1 # The number of threads used to receive requests
replica-receiver-port: 33333 # The port is exposed to client/proxy and is used to receive client/proxy requests. When receiver-shards>1, the corresponding ports are receiver-ports + shard-index
replica-initial-owd: 80 # The initial one-way delay (us) between replicas and proxies



================================================
FILE: configs/local/nezha-replica-config-0.yaml
================================================
---
print-config: true
replica-ips:
  - "127.0.0.1"
  - "127.0.0.2"
  - "127.0.0.3"
endpoint-type: 1 # 1 for UDP Endpoint, 2 for GRPC Endpoint [to be supported soon]
replica-id: 0
receiver-shards: 1 # The number of threads used to receive requests
record-shards: 1 # The number of threads to record requests in the global concurrent map
track-shards: 1 # The number of threads to record synced log entries
reply-shards: 2 # The number of threads to send replies (both fast/slow replies)
index-sync-shards: 1 # The number of threads used by the leader to broadcast index synchronization messages to followers. For followers, they only need one such thread to receive and handle the index sync msgs
receiver-port: 33333 # The port is exposed to client/proxy and is used to receive client/proxy requests. When receiver-shards>1, the corresponding ports are receiver-ports + shard-index
index-sync-port: 35333 # The port is used for replicas to communicate index synchronization messages. Typically, the leader sends index sync msgs and the followers receive them
index-sync-period-us: 50 # For every period, the leader replica will send a index sync message to followers, serves for slow path and heartbeat
request-ask-port: 36333 # When one follower is missing requests, they ask from the leader or other followers. The leader/followers send the missed requests to this port of this follower
index-ask-port: 37333 # When one follower is missing some index sync msgs, it asks the leader (or other followers) for the missing ones, and the leader (or other followers) send to the index-sync-port of the problematic follower
master-port: 34333 # This port is mainly used to send/receive other messages
monitor-period-ms: 50 # Each blocking thread has a monitor timer, which periodically checks the replica status and unblock the thread (and exit) when replica status is not Normal
main-loop-period-ms: 20
heartbeat-threshold-ms: 500
index-ask-period-ms: 10 # When missing indices, we launch a timer to periodically ask indices
request-ask-period-ms: 10 # When missing logs, we launch a timer to periodically ask logs(requests)
view-change-period-ms: 10
state-transfer-period-ms: 10
state-transfer-timeout-ms: 100000 # If the state-transfer has not been completed after such long time, then rollback to viewchange
index-transfer-batch: 30
request-key-transfer-batch: 60
request-transfer-batch: 5
crash-vector-request-period-ms: 50 # make it longer, becasue it will clear all previous replies
recovery-request-period-ms: 10
sync-report-period-ms: 10
key-num: 1000000 # When commutativity is enabled for test, we need the specified key number to detect commutativity
moving-percentile: 0.50 # the percentile used to estimate owd
owd-estimation-window: 1000 # We use a sliding window to get moving median of one-way delays, here is the length of the window
reclaim-timeout-ms: 5000 # To save memory, those requests who enter the late buffer will not stay there forever: if they have stayed for so long, they will be reclaimed. Similary, for unsynced log entries, if (1) they have been kept for more than reclaim-timeout-ms (2) they are not used by the worker threads; then they will be reclaimed


================================================
FILE: configs/local/nezha-replica-config-1.yaml
================================================
---
print-config: true
replica-ips:
  - "127.0.0.1"
  - "127.0.0.2"
  - "127.0.0.3"
endpoint-type: 1 # 1 for UDP Endpoint, 2 for GRPC Endpoint [to be supported soon]
replica-id: 1
receiver-shards: 1 # The number of threads to receive requests
record-shards: 1 # The number of threads to record requests in the global concurrent map
track-shards: 1 # The number of threads to record synced log entries
reply-shards: 2 # The number of threads to send replies (both fast/slow replies)
index-sync-shards: 1 # The number of threads used by the leader to broadcast index synchronization messages to followers. Followers only need one such thread to receive and handle the index sync msgs
receiver-port: 33333 # The port is exposed to client/proxy and is used to receive client/proxy requests. When receiver-shards>1, the corresponding ports are receiver-port + shard-index
index-sync-port: 35333 # The port is used for replicas to communicate index synchronization messages. Typically, the leader sends index sync msgs and the followers receive them
index-sync-period-us: 50 # Every period, the leader replica sends an index sync message to followers; it serves the slow path and also acts as a heartbeat
request-ask-port: 36333 # When a follower is missing requests, it asks the leader or other followers for them. The leader/followers send the missed requests to this port of this follower
index-ask-port: 37333 # When one follower is missing some index sync msgs, it asks the leader (or other followers) for the missing ones, and the leader (or other followers) send to the index-sync-port of the problematic follower
master-port: 34333 # This port is mainly used to send/receive other messages
monitor-period-ms: 50 # Each blocking thread has a monitor timer, which periodically checks the replica status and unblocks the thread (and exits) when replica status is not Normal
main-loop-period-ms: 20
heartbeat-threshold-ms: 500
index-ask-period-ms: 10 # When missing indices, we launch a timer to periodically ask for indices
request-ask-period-ms: 10 # When missing logs, we launch a timer to periodically ask for logs (requests)
view-change-period-ms: 10
state-transfer-period-ms: 10
state-transfer-timeout-ms: 100000 # If the state transfer has not completed after this long, roll back to view change
index-transfer-batch: 30
request-key-transfer-batch: 60
request-transfer-batch: 5
crash-vector-request-period-ms: 50 # make it longer, because it will clear all previous replies
recovery-request-period-ms: 10
sync-report-period-ms: 10
key-num: 1000000 # When commutativity is enabled for test, we need the specified key number to detect commutativity
moving-percentile: 0.50 # the percentile used to estimate owd
owd-estimation-window: 1000 # We use a sliding window to get moving median of one-way delays; this is the length of the window
reclaim-timeout-ms: 5000 # To save memory, requests that enter the late buffer will not stay there forever: once they have stayed this long, they are reclaimed. Similarly, unsynced log entries are reclaimed if (1) they have been kept for more than reclaim-timeout-ms and (2) they are not used by the worker threads


================================================
FILE: configs/local/nezha-replica-config-2.yaml
================================================
---
print-config: true
replica-ips:
  - "127.0.0.1"
  - "127.0.0.2"
  - "127.0.0.3"
endpoint-type: 1 # 1 for UDP Endpoint, 2 for GRPC Endpoint [to be supported soon]
replica-id: 2
receiver-shards: 1 # The number of threads to receive requests
record-shards: 1 # The number of threads to record requests in the global concurrent map
track-shards: 1 # The number of threads to record synced log entries
reply-shards: 2 # The number of threads to send replies (both fast/slow replies)
index-sync-shards: 1 # The number of threads used by the leader to broadcast index synchronization messages to followers. Followers only need one such thread to receive and handle the index sync msgs
receiver-port: 33333 # The port is exposed to client/proxy and is used to receive client/proxy requests. When receiver-shards>1, the corresponding ports are receiver-port + shard-index
index-sync-port: 35333 # The port is used for replicas to communicate index synchronization messages. Typically, the leader sends index sync msgs and the followers receive them
index-sync-period-us: 50 # Every period, the leader replica sends an index sync message to followers; it serves the slow path and also acts as a heartbeat
request-ask-port: 36333 # When a follower is missing requests, it asks the leader or other followers for them. The leader/followers send the missed requests to this port of this follower
index-ask-port: 37333 # When one follower is missing some index sync msgs, it asks the leader (or other followers) for the missing ones, and the leader (or other followers) send to the index-sync-port of the problematic follower
master-port: 34333 # This port is mainly used to send/receive other messages
monitor-period-ms: 50 # Each blocking thread has a monitor timer, which periodically checks the replica status and unblocks the thread (and exits) when replica status is not Normal
main-loop-period-ms: 20
heartbeat-threshold-ms: 500
index-ask-period-ms: 10 # When missing indices, we launch a timer to periodically ask for indices
request-ask-period-ms: 10 # When missing logs, we launch a timer to periodically ask for logs (requests)
view-change-period-ms: 10
state-transfer-period-ms: 10
state-transfer-timeout-ms: 100000 # If the state transfer has not completed after this long, roll back to view change
index-transfer-batch: 30
request-key-transfer-batch: 60
request-transfer-batch: 5
crash-vector-request-period-ms: 50 # make it longer, because it will clear all previous replies
recovery-request-period-ms: 10
sync-report-period-ms: 10
key-num: 1000000 # When commutativity is enabled for test, we need the specified key number to detect commutativity
moving-percentile: 0.50 # the percentile used to estimate owd
owd-estimation-window: 1000 # We use a sliding window to get moving median of one-way delays; this is the length of the window
reclaim-timeout-ms: 5000 # To save memory, requests that enter the late buffer will not stay there forever: once they have stayed this long, they are reclaimed. Similarly, unsynced log entries are reclaimed if (1) they have been kept for more than reclaim-timeout-ms and (2) they are not used by the worker threads


================================================
FILE: configs/nezha-client-config-template.yaml
================================================
---
print-config: true
proxy-info:
  proxy-ips:
    - "127.0.0.4"
  proxy-shards: 12
  request-port-base: 32000
client-info:
  endpoint-type: 1 # 1 for UDP Endpoint, 2 for GRPC Endpoint [to be supported soon]
  client-id: 1
  client-ip: "127.0.0.5"
  request-port: 32912
  is-openloop: true
  poisson-rate: 60 # The client sends x reqs/10ms on average (should be larger than 10; otherwise the submission rate is not accurate)
  duration-sec: 60 # How long the client runs (in seconds)
  key-num: 1000000 # when key-num is 1, it means there is no commutativity optimization
  skew-factor: 0.5 # 0-0.99. The higher the zipfian factor is, the more skewed the workload is
  write-ratio: 0.5 # 0-1, the ratio of write requests
  request-retry-time-us: 100000 # If no response arrives within this time after a request is submitted, the request is retried


================================================
FILE: configs/nezha-proxy-config-template.yaml
================================================
---
print-config: true
# Replica Info
replica-info:
  replica-ips:
    - "127.0.0.1"
    - "127.0.0.2"
    - "127.0.0.3"
  receiver-shards: 2 # The number of threads to receive requests
  receiver-port: 33333 # The port is exposed to client/proxy and is used to receive client/proxy requests. When receiver-shards>1, the corresponding ports are receiver-port + shard-index
  initial-owd: 80 # The initial one-way delay (us) between replicas and proxies
# Proxy Info
proxy-info:
  endpoint-type: 1 # 1 for UDP Endpoint, 2 for GRPC Endpoint [to be supported soon]
  proxy-id: 1
  proxy-ip: "127.0.0.4"
  shard-num: 12
  max-owd: 200
  request-port-base: 32000
  reply-port-base: 33000


================================================
FILE: configs/nezha-replica-config-template.yaml
================================================
---
print-config: true
replica-ips:
  - "127.0.0.1"
  - "127.0.0.2"
  - "127.0.0.3"
endpoint-type: 1 # 1 for UDP Endpoint, 2 for GRPC Endpoint [to be supported soon]
replica-id: 0
receiver-shards: 2 # The number of threads to receive requests
record-shards: 1 # The number of threads to record requests in the global concurrent map
track-shards: 1 # The number of threads to record synced log entries
process-shards: 1 # The number of threads to process requests. For now process-shards is fixed to 1, because the early-buffer enqueue/dequeue is hard to parallelize. Maybe later we can find a high-performance **concurrent priority queue** for early-buffer, then process-shards may be parallelized for higher performance
reply-shards: 3 # The number of threads to send replies (both fast/slow replies)
index-sync-shards: 1 # The number of threads used by the leader to broadcast index synchronization messages to followers. Followers only need one such thread to receive and handle the index sync msgs
receiver-port: 33333 # The port is exposed to client/proxy and is used to receive client/proxy requests. When receiver-shards>1, the corresponding ports are receiver-port + shard-index
index-sync-port: 35333 # The port is used for replicas to communicate index synchronization messages. Typically, the leader sends index sync msgs and the followers receive them
index-sync-period-us: 50 # Every period, the leader replica sends an index sync message to followers; it serves the slow path and also acts as a heartbeat
request-ask-port: 36333 # When a follower is missing requests, it asks the leader or other followers for them. The leader/followers send the missed requests to this port of this follower
index-ask-port: 37333 # When one follower is missing some index sync msgs, it asks the leader (or other followers) for the missing ones, and the leader (or other followers) send to the index-sync-port of the problematic follower
master-port: 34333 # This port is mainly used to send/receive other messages
monitor-period-ms: 50 # Each blocking thread has a monitor timer, which periodically checks the replica status and unblocks the thread (and exits) when replica status is not Normal
main-loop-period-ms: 20
heartbeat-threshold-ms: 500
index-ask-period-ms: 10 # When missing indices, we launch a timer to periodically ask for indices
request-ask-period-ms: 10 # When missing logs, we launch a timer to periodically ask for logs (requests)
view-change-period-ms: 10
state-transfer-period-ms: 10
state-transfer-timeout-ms: 100000 # If the state transfer has not completed after this long, roll back to view change
index-transfer-batch: 30
request-key-transfer-batch: 60
request-transfer-batch: 5
crash-vector-request-period-ms: 50 # make it longer, because it will clear all previous replies
recovery-request-period-ms: 10
sync-report-period-ms: 10
key-num: 1000000 # When commutativity is enabled for test, we need the specified key number to detect commutativity
moving-percentile: 0.90 # the percentile used to estimate owd (key spelled with a hyphen, like every other key in this file)
owd-estimation-window: 1000 # We use a sliding window to get moving median of one-way delays; this is the length of the window
reclaim-timeout-ms: 5000 # To save memory, requests that enter the late buffer will not stay there forever: once they have stayed this long, they are reclaimed. Similarly, unsynced log entries are reclaimed if (1) they have been kept for more than reclaim-timeout-ms and (2) they are not used by the worker threads


================================================
FILE: docs/Nezha.tla
================================================

`^\textbf{\large N  TLA+ Specification}\\^' 
 
------------------------------ MODULE Nezha ----------------------------------

EXTENDS Naturals, TLC, FiniteSets, Sequences

--------------------------------------------------------------------------------
(* `^\textbf{\large Bounds for Model Check [Configurable]}^' *)

\* Time Range [Configurable]
\* A client stops submitting requests once its clock reaches MaxTime (see ClientSendRequest)
MaxTime == 3

\* Each client is only allowed to submit MaxReqNum requests [Configurable]
\* In the specification, we only consider two roles: clients and replicas
\* (i.e. it can be considered as co-locating one proxy with one client).
\* For the proxy-based design, we just need to replace client with proxy, 
\* and then the specification describes the interaction between proxy and replicas.
MaxReqNum == 1 

\* The leader is only allowed to crash when the view < MaxViews [Configurable]
MaxViews == 3

\* These variables are used to implement at-most-once primitives,
\* i.e. the variables record the messages processed by Replicas/Clients, so 
\* that the Replicas/Clients will not process the same message twice
VARIABLE  vReplicaProcessed, \* Messages that have been processed by replicas
          vClientProcessed \* Messages that have been processed by clients

\* Set to <<"Init", "">> by Init.
\* NOTE(review): presumably records the most recent action for debugging -- confirm against the actions that update it
VARIABLE DebugAction

(* `^\textbf{\large Constants}^' *)

\* Replicas:      the set of replicas (assumed finite below)
\* ReplicaOrder:  a sequence over Replicas; Leader(viewID) indexes into it
\* Clients:       the set of clients
\* LatencyBounds: indexed by client; LatencyBounds[c] is the latency bound `l'
\*                attached to every request sent by c (see ClientSendRequest)
CONSTANTS Replicas, ReplicaOrder, Clients, LatencyBounds
ASSUME IsFiniteSet(Replicas) 
ASSUME ReplicaOrder \in Seq(Replicas)


\* F = floor((|Replicas| - 1) / 2)
F                  == (Cardinality(Replicas) - 1) \div 2
\* ceil(F / 2) and floor(F / 2); for a natural number F, ceil(F / 2) = (F + 1) \div 2
ceilHalfF          == (F + 1) \div 2
floorHalfF         == F \div 2
QuorumSize         == 1 + F
FastQuorumSize     == 1 + F + ceilHalfF
RecoveryQuorumSize == 1 + ceilHalfF
\* Every subset of Replicas with at least FastQuorumSize members
FastQuorums == { Q \in SUBSET Replicas : FastQuorumSize <= Cardinality(Q) }
\* Every subset of Replicas that is a strict majority
Quorums     == { Q \in SUBSET Replicas : Cardinality(Replicas) < 2 * Cardinality(Q) }

\* Replica Statuses (the values stored in vReplicaStatus)
StNormal == 1
StViewChange == 2
StRecovering == 3

\* Message Types (the values carried in the `mtype' field of messages and log records)
MClientRequest == 1 \* Sent by client to replicas
MFastReply == 2 \* Fast Reply Message
MSlowReply == 3 \* Slow Reply Message
MLogIndex == 4  \* LogIndex
MLogEntry == 5  \* Log entry; unlike an index it includes the command field, which can be large in practice
MIndexSync == 6 \* Sync message during the index sync process
MMissEntryRequest == 7 \* Sent by a follower once it fails to find the entry locally
MMissEntryReply == 8  \* Response to MMissEntryRequest, providing the missing entries

MViewChangeReq == 9       \* Sent when leader/sequencer failure detected
MViewChange == 10        \* Sent to ACK view change
MStartView == 11           \* Sent by new leader to start view

\* The following messages are mainly used for periodic sync
\* Just as described in NOPaxos, it is an optional optimization to enable fast recovery after failure
MSyncPrepare == 12         \* Sent by the leader to ensure log durability
MSyncRep == 13             \* Sent by followers as ACK
MSyncCommit == 14           \* Sent by leaders to indicate stable log

\* The following messages are mainly used for replica recovery
MCrashVectorReq == 15
MCrashVectorRep == 16
MRecoveryReq == 17
MRecoveryRep == 18
MStateTransferReq == 19
MStateTransferRep == 20
      

(*
  `^\textbf{Message Schemas}^'

  ViewIDs == [ leaderNum |-> n \in (1..) ]

  \* <clientID, requestID> uniquely identifies one request on one replica
  \* But across replicas, the same <clientID, requestID> may have different deadlines
  \* (the leader may modify the deadline to make the request eligible to enter the early-buffer)
  \* so <deadline, clientID, reqID> uniquely identifies one request across replicas 

  ClientRequest
      [ mtype       |-> MClientRequest,
        sender      |-> c \in Clients,
        dest        |-> r \in Replicas,
        requestID   |-> i \in (1..), 
        command     |-> "", 
        s           |-> t \in (1..MaxTime), 
        l           |-> l \in (1..MaxBound)
      ]
  
  \* logSlotNum is not necessary and it is not described in the paper
  \* Here we include logSlotNum in FastReply and SlowReply messages
  \* to facilitate the check of Linearizability invariant
  FastReply
      [ mtype      |-> MFastReply,
        sender     |-> r \in Replicas,
        dest       |-> c \in Clients,
        viewID     |-> v \in ViewIDs,
        requestID  |-> i \in (1..vClientReqNum),
        hash       |-> [
                        log |-> vLogs[1..n], 
                        cv |-> crashVector
                       ],
        deadline   |-> i \in (1..MaxTime+MaxBound),
        logSlotNum |-> n \in (1..)
      ]

  SlowReply
      [ mtype      |-> MSlowReply,
        sender     |-> r \in Replicas,
        dest       |-> c \in Clients,
        viewID     |-> v \in ViewIDs,
        requestID  |-> i \in (1..vClientReqNum),
        logSlotNum |-> n \in (1..)
      ]
      
  LogIndex
      [ mtype      |-> MLogIndex,
        clientID   |-> c \in Clients,
        requestID  |-> i \in (1..vClientReqNum),
        deadline   |-> i \in (1..MaxTime+MaxBound)
      ]
      
  LogEntry
      [ mtype      |-> MLogEntry,
        clientID   |-> c \in Clients,
        requestID  |-> i \in (1..vClientReqNum),
        deadline   |-> i \in (1..MaxTime+MaxBound),
        command    |-> ""
      ]
      
  IndexSync
      [ mtype      |-> MIndexSync,
        sender     |-> r \in Replicas,
        dest       |-> c \in Clients,
        viewID     |-> v \in ViewIDs,
        logindcies |-> index \in vLogs[leaderIdx]
      ]

   MMissEntryRequest
      [ mtype      |-> MMissEntryRequest,
        sender     |-> r \in Replicas,
        dest       |-> d \in Replicas,
        viewID     |-> v \in ViewIDs,
        miss       |-> {log indices}
      ]

   MMissEntryReply
      [ mtype      |-> MMissEntryReply,
        sender     |-> r \in Replicas,
        dest       |-> d \in Replicas,
        viewID     |-> v \in ViewIDs,
        entries    |-> {log entries}
      ]
      
  ViewChangeReq
      [ mtype  |-> MViewChangeReq,
        sender |-> r \in Replicas,
        dest   |-> r \in Replicas,
        viewID |-> v \in ViewIDs,
        cv     |-> crash vector 
      ]

  ViewChange
      [ mtype      |-> MViewChange,
        sender     |-> r \in Replicas,
        dest       |-> r \in Replicas,
        viewID     |-> v \in ViewIDs,
        lastNormal |-> v \in ViewIDs,
        log        |-> l \in vLogs[1..n],
        cv         |-> crash vector  
      ]

  StartView
      [ mtype      |-> MStartView,
        dest       |-> r \in Replicas,
        viewID     |-> v \in ViewIDs,
        log        |-> l \in vLogs[1..n],
        cv         |-> crash vector 
      ]


  SyncPrepare
      [ mtype      |-> MSyncPrepare,
        dest       |-> r \in Replicas,
        sender     |-> r \in Replicas,
        viewID     |-> v \in ViewIDs,
        log        |-> l \in vLogs[1..n] ]

  SyncRep
      [ mtype         |-> MSyncRep,
        dest          |-> r \in Replicas,
        sender        |-> r \in Replicas,
        viewID        |-> v \in ViewIDs,
        logSlotNumber |-> n \in (1..) ]

  SyncCommit
      [ mtype         |-> MSyncCommit,
        dest          |-> r \in Replicas,
        sender        |-> r \in Replicas,
        viewID        |-> v \in ViewIDs,
        log           |-> l \in vLogs[1..n] ]
        
  CrashVectorReq
      [ mtype         |-> MCrashVectorReq,
        sender        |-> r \in Replicas,
        dest          |-> r \in Replicas,
        nonce         |-> nonce
      ] 
  CrashVectorRep
      [ mtype         |-> MCrashVectorRep,
        sender        |-> r \in Replicas,
        dest          |-> r \in Replicas,
        nonce         |-> nonce,
        cv            |-> vector of counters
      ] 
      
  RecoveryReq
      [ mtype         |-> MRecoveryReq,
        sender        |-> r \in Replicas,
        dest          |-> r \in Replicas,
        cv            |-> vector of counters
      ]  
      
  RecoveryRep
      [ mtype         |-> MRecoveryRep,
        sender        |-> r \in Replicas,
        dest          |-> r \in Replicas,
        viewID        |-> v \in ViewIDs,
        cv            |-> vector of counters
      ]           

  StateTransferReq
      [ mtype         |-> MStateTransferReq,
        sender        |-> r \in Replicas,
        dest          |-> r \in Replicas,
        cv            |-> vector of counters
      ]  
  StateTransferRep
      [ mtype         |-> MStateTransferRep,
        sender        |-> r \in Replicas,
        dest          |-> r \in Replicas,
        viewID        |-> v \in ViewIDs,
        log           |-> l \in vLogs[1..n],
        cv            |-> vector of counters
      ]  
*)

--------------------------------------------------------------------------------
(* `^\textbf{\large Variables}^' *)

\* `^\textbf{Network State}^'
VARIABLE messages \* Set of all messages sent

\* Tuple of the network variables (used in `vars')
networkVars      == << messages >>
\* Initially no message has been sent
InitNetworkState == messages = {}

\* Placeholder log entry: all three ordering fields are 0.
\* LastLog returns it for an empty log.
NULLLog == [ requestID |-> 0, clientID |-> 0, deadline |-> 0 ]


\* `^\textbf{Replica State}^'
\* Each of these variables is a function over Replicas (see InitReplicaState)
VARIABLES vLog,            \* Log of values
          vEarlyBuffer,    \* The early buffer, which holds a request
                           \* and releases it after the clock passes its deadline (s+l)
          vReplicaStatus,  \* One of StNormal, StViewChange, StRecovering
          vViewID,         \* Current viewID replicas recognize
          vReplicaClock,   \* Current Time of the replica
          vLastNormView,   \* Last views in which replicas had status StNormal
          vViewChanges,    \* Used for logging view change votes
          vSyncPoint,      \* Latest synchronization point, 
                           \* up to which the replica state (vLog) is consistent with the leader.
          vLateBuffer,     \* The late buffer, used to store the requests 
                           \* which are not eligible to enter vEarlyBuffer 
          
          vTentativeSync,  \* Used by leader to mark current syncPrepare point (during periodic sync process)
                           \* (Actually, vSyncPoint and vTentativeSync can be merged into one Var
                           \* However, we decouple them to make the spec easy to understand)
          vSyncReps,       \* Used for logging sync reps at leader 
          vCommitPoint,    \* Different from vSyncPoint, 
                           \* vCommitPoint indicates that the logs before this point have been replicated to a majority
                           \* So followers can safely execute requests (log entries) up to vCommitPoint
                           \* Refer to the ``Acceleration of Recovery" para in Sec 6
                           
          vUUIDCounter,    \* Locally unique string (for CrashVectorReq)
          vCrashVector,    \* CrashVector, initialized as all-zero vector
          vCrashVectorReps,\* CrashVectorRep Set
          vRecoveryReps    \* RecoveryRep Set
          
\* Tuple of all replica-side variables, e.g. for UNCHANGED << replicaVars >>
replicaVars ==
    << vLog,
       vEarlyBuffer,
       vViewID,
       vReplicaClock,
       vLastNormView,
       vViewChanges,
       vReplicaStatus,
       vSyncPoint,
       vLateBuffer,
       vTentativeSync,
       vSyncReps,
       vCommitPoint,
       vUUIDCounter,
       vCrashVector,
       vCrashVectorReps,
       vRecoveryReps >>

\* Initial replica state: every replica variable is a function over Replicas
\* that maps each replica to the same starting value.
InitReplicaState ==
  LET
    \* The function assigning `init' to every replica
    Each(init) == [ r \in Replicas |-> init ]
  IN
    /\ vLog             = Each(<< >>)
    /\ vEarlyBuffer     = Each({})
    /\ vViewID          = Each(1)  \* 0 should also be okay
    /\ vReplicaClock    = Each(1)
    /\ vLastNormView    = Each(1)
    /\ vViewChanges     = Each({})
    /\ vReplicaStatus   = Each(StNormal)
    /\ vSyncPoint       = Each(0)
    /\ vLateBuffer      = Each({})
    /\ vTentativeSync   = Each(0)
    /\ vSyncReps        = Each({})
    /\ vCommitPoint     = Each(0)
    /\ vCrashVector     = Each(Each(0))  \* an all-zero vector per replica
    /\ vCrashVectorReps = Each({})
    /\ vRecoveryReps    = Each({})
    /\ vUUIDCounter     = Each(0)

\* `^\textbf{Client State}^'
\* Both variables are functions over Clients
VARIABLES   vClientClock,   \* Current Clock Time of the client
            vClientReqNum   \* The number of requests that have been sent by this client

\* Every client starts with its clock at 1 and with no request sent
InitClientState  ==
  /\ vClientClock    = [ c \in Clients  |-> 1 ]
  /\ vClientReqNum   = [ c \in Clients  |-> 0 ]

\* Tuple of the client variables (used in `vars')
clientVars          == << vClientClock, vClientReqNum >>

\* `^\textbf{Set of all vars}^'
\* Bundles the network, replica and client variables.
\* Note that vReplicaProcessed, vClientProcessed and DebugAction are not part of this tuple.
vars == << networkVars, replicaVars, clientVars >>

\* `^\textbf{Initial state}^'
\* Empty network, initial replica/client state, nothing processed yet,
\* and DebugAction set to the <<"Init", "">> marker.
Init == /\ InitNetworkState
        /\ InitReplicaState
        /\ InitClientState
        /\ vReplicaProcessed = [ r \in Replicas |-> {} ]
        /\ vClientProcessed = [c \in Clients |-> {}]
        /\ DebugAction = <<"Init", "">>

--------------------------------------------------------------------------------
(* `^\textbf{\large Helpers}^' *)

\* The number of replicas whose current status (vReplicaStatus) equals `status'
NumofReplicas(status) == Cardinality({ r \in Replicas: vReplicaStatus[r] = status }) 

\* TRUE iff ReplySet already contains a message from the sender of m
DuplicateRep(ReplySet,m) == \E prev \in ReplySet : prev.sender = m.sender

\* Some element of S: an arbitrary one, but always the same one for the same S.
\* The result is unspecified when S is empty, so callers must check for emptiness first.
Pick(S) == CHOOSE s \in S : TRUE
                              
\* Convert a finite set into a sequence that contains each element exactly once.
\* The order of the elements is whatever CHOOSE happens to pick.
RECURSIVE Set2Seq(_)
Set2Seq(S) ==
    IF S = {} THEN << >>
    ELSE LET head == CHOOSE e \in S : TRUE
         IN  << head >> \o Set2Seq(S \ {head})

\* The set of elements that occur in the sequence `seq' (duplicates collapse)
Seq2Set(seq) == { seq[pos] : pos \in 1..Len(seq) }
       
\* The largest element of a non-empty set of numbers
Max(S) == CHOOSE top \in S : \A other \in S : other <= top

\* The smallest element of a non-empty set of numbers
Min(S) == CHOOSE bottom \in S : \A other \in S : other >= bottom

\* `^\textbf{View ID Helpers}^'
\* Index into ReplicaOrder of the leader of view `viewID'. With n = Len(ReplicaOrder):
\*   viewID <  n  gives  viewID % n
\*   viewID >= n  gives  (viewID % n) + 1
\* Note that LeaderID(0) = 0, which is not a valid index into ReplicaOrder.
LeaderID(viewID) ==
    LET n      == Len(ReplicaOrder)
        offset == IF viewID < n THEN 0 ELSE 1
    IN  (viewID % n) + offset

\* The replica that leads view `viewID' (remember <<>> are 1-indexed)
Leader(viewID) == ReplicaOrder[LeaderID(viewID)]


\* `^\textbf{Log Manipulation Helpers}^'

\* The order of 2 log entries is decided by the tuple <deadline, clientID, requestID>,
\* compared lexicographically:
\*   - usually, deadline alone makes the two entries comparable;
\*   - when 2 different entries have the same deadline, the tie is broken by clientID;
\*   - further, the tie is broken by requestID
\*     (unnecessary if we only allow a client to submit one request at one tick).
\* A componentwise comparison (deadline <= /\ clientID <= /\ requestID <=) would only be
\* a partial order: an entry with a smaller deadline but a larger clientID would be
\* incomparable with the other one, and sorting logs needs a total order.
EntryLeq(l1, l2)      ==
    \/ l1.deadline < l2.deadline
    \/ /\ l1.deadline = l2.deadline
       /\ \/ l1.clientID < l2.clientID
          \/ /\ l1.clientID = l2.clientID
             /\ l1.requestID <= l2.requestID
                         
\* Two entries are equal iff they agree on deadline, clientID and requestID
\* (other fields, such as command, are ignored)
EntryEq(l1, l2)       ==
    LET Key(e) == << e.deadline, e.clientID, e.requestID >>
    IN  Key(l1) = Key(l2)

\* Strict version of EntryLeq: l1 is ordered before l2 and the two are not equal
EntryLessThan(l1, l2) == EntryLeq(l1, l2) /\ ~EntryEq(l1, l2)
                            
\* Find entry in one replica's log (<clientID, reqID> can uniquely identify the log entry)
\* We do not check deadline, because the leader may have modified the request's deadline
\* Returns the index of a matching entry, or 0 when there is none
\* (remember Sequence is 1-indexed in TLA+, so 0 can serve as a dummy value)
FindEntry(clientID, reqID, log) == 
    LET 
        \* Log entries store the request ID in the field `requestID'
        \* (see Req2Log and NULLLog); they have no `reqID' field.
        entryIndexSet == { i \in 1..Len(log): /\ log[i].clientID = clientID
                                              /\ log[i].requestID = reqID }
    IN
        IF entryIndexSet = {} THEN 
            0
        ELSE
            CHOOSE i \in entryIndexSet : TRUE
                                

\* Sort a sequence of log entries with EntryLessThan as the ordering
SortLogSeq(seq) == SortSeq(seq, EntryLessThan)

\* Turn a set of log entries into a sequence sorted by SortLogSeq
GetSortLogSeq(S) == SortLogSeq(Set2Seq(S))
                        
                            
(* Merge logs, first put all log items together, deduplicated (i.e. UNION them into a set).
   Then, do filtering and only keep those that have appeared in at least
   `^\left \lceil{f/2}\right \rceil +1^' replicas. *)

\* The number of sets in `logll' (a set of sets of log entries) that contain x
CountVotes(logll, x) == Cardinality({ voter \in logll : x \in voter })

\* From the union of the given un-synced log sets, keep the entries that
\*   (1) are ordered strictly after lastSyncedLog, and
\*   (2) appear in at least RecoveryQuorumSize of the sets,
\* and return them as a sorted sequence.
MergeUnSyncLogs(unSyncedLogs, lastSyncedLog) == 
        LET 
            Eligible(e) == /\ EntryLessThan(lastSyncedLog, e)
                           /\ CountVotes(unSyncedLogs, e) >= RecoveryQuorumSize
        IN
            GetSortLogSeq({ e \in UNION unSyncedLogs : Eligible(e) })
            
\* `^\textbf{Network Helpers}^'
\* Add messages to the network.
\* `ms' is a SET of messages (not a single message); it is unioned into `messages',
\* and this conjunct fully determines messages'.
Send(ms) == messages' = messages \cup ms

\* Build the log entry for a client request: the deadline is the submission
\* time plus the latency bound (s + l), and the sender becomes the clientID.
Req2Log(req) ==
    LET due == req.s + req.l
    IN  [ mtype     |-> MLogEntry,
          clientID  |-> req.sender,
          requestID |-> req.requestID,
          deadline  |-> due,
          command   |-> req.command ]
\* The index of a log entry: same identifying fields, but without the command
\* field, which is the body of the request/log and can be very large
GetLogIndex(entry) ==
    [ mtype     |-> MLogIndex,
      clientID  |-> entry.clientID,
      requestID |-> entry.requestID,
      deadline  |-> entry.deadline ]

\* The log index that a reply refers to; the reply's destination is the client,
\* so `dest' supplies the clientID
GetLogIndexFromReply(reply) ==
    [ mtype     |-> MLogIndex,
      clientID  |-> reply.dest,
      requestID |-> reply.requestID,
      deadline  |-> reply.deadline ]


\* TRUE iff `index' and `msg' agree on deadline, clientID and requestID
IndexEq(index, msg) ==
    LET Key(rec) == << rec.deadline, rec.clientID, rec.requestID >>
    IN  Key(index) = Key(msg)

\* Add replica r's local time to the message as field `tl' (for easy debug).
\* With TLC's @@ the left operand wins on overlapping fields, so a `tl' field
\* already present in msg would be kept.
Msg2RLog(msg, r) == msg @@ [tl |-> vReplicaClock[r]]


       
\* The last entry of logList, or NULLLog when logList is empty
LastLog(logList) ==
    LET size == Len(logList)
    IN  IF size = 0 THEN NULLLog ELSE logList[size]

\* Entry-wise maximum of two crash vectors (functions over Replicas)
MergeCrashVector(cv1, cv2)==
    [ r \in Replicas |-> IF cv1[r] >= cv2[r] THEN cv1[r] ELSE cv2[r] ]

\* Replica r validates message m against its crash vector.
\* If m's entry for its own sender is older than what r has recorded, m is a
\* potential stray message and the formula is FALSE; otherwise r's crash
\* vector becomes the merge of m.cv and its current vector.
CheckCrashVector(m, r) == 
    /\ m.cv[m.sender] >= vCrashVector[r][m.sender]
    /\ vCrashVector' = [ vCrashVector EXCEPT ![r] = MergeCrashVector(m.cv, vCrashVector[r]) ]
                               
\* Keep the messages of MSet whose own-sender crash-vector entry is not older than cv's
FilterStrayMessage(MSet, cv) == { msg \in MSet : ~(msg.cv[msg.sender] < cv[msg.sender]) }
    
    
--------------------------------------------------------------------------------
(* `^\textbf{\large Message Handlers and Actions }^' *)

\* `^\textbf{Client action}^'
\* Client c sends a request to every replica.
\* A client sends at most one request per tick of its clock: sending advances
\* the clock by one. Once the clock reaches MaxTime, or MaxReqNum requests
\* have been sent, the client cannot send any more.
ClientSendRequest(c) ==
    LET
        nextReqID == vClientReqNum[c] + 1
        requests  == { [ mtype        |-> MClientRequest,
                         sender       |-> c,                \* clientID
                         requestID    |-> nextReqID,        \* requestID
                         command      |-> "",
                         s            |-> vClientClock[c],  \* submission time
                         l            |-> LatencyBounds[c], \* latency bound
                         dest         |-> r ] : r \in Replicas }
    IN
    /\ vClientClock[c] < MaxTime 
    /\ vClientReqNum[c] < MaxReqNum
    /\ Send(requests)
    /\ vClientClock' = [ vClientClock EXCEPT ![c] = @ + 1 ]
    /\ vClientReqNum' = [ vClientReqNum EXCEPT ![c] = nextReqID ]
    /\ UNCHANGED  << replicaVars >>  
                                                    


\* TRUE iff logSet already holds an entry with the same clientID and requestID as entry
Duplicate(entry, logSet) == 
    \E x \in logSet : /\ x.clientID = entry.clientID 
                      /\ x.requestID = entry.requestID
    
\* Replica r receives MClientRequest, m
HandleClientRequest(r, m) ==
  LET
    mlog     == Req2Log(m)
    lastDone == LastLog(vLog[r])
    \* mlog with its deadline moved just past the last released entry
    bumped   == [ mlog EXCEPT !.deadline = lastDone.deadline + 1 ]
  IN
  \* A duplicate request is not appended to the log again;
  \* replicas simply reply with the previous execution result of this request 
  \* (we do not model execution in this spec)
  /\ ~Duplicate(mlog, Seq2Set(vLog[r]) \union vEarlyBuffer[r] )
  /\ vReplicaStatus[r] = StNormal
     \* Case 1: the request is ordered after the last released entry,
     \*         so it can enter the early buffer as is
  /\ \/ /\ EntryLessThan(lastDone, mlog)
        /\ vEarlyBuffer' = [ vEarlyBuffer EXCEPT ![r] = @ \cup { mlog } ]
        /\ UNCHANGED   << networkVars, clientVars, 
                          vLog, vViewID, vReplicaClock,
                          vLastNormView, vViewChanges, vReplicaStatus,
                          vSyncPoint, vLateBuffer, 
                          vTentativeSync, vSyncReps, vCommitPoint,
                          vUUIDCounter, vCrashVector, 
                          vCrashVectorReps, vRecoveryReps >> 
     \* Case 2: the request is ordered before the last released entry and r is
     \*         the leader of its current view. The leader rewrites the deadline
     \*         to be larger than the last released entry, which makes the
     \*         request eligible for the early buffer
     \/ /\ EntryLessThan(mlog, lastDone)
        /\ r = Leader(vViewID[r])
        /\ vEarlyBuffer' = [ vEarlyBuffer EXCEPT ![r] = @ \cup { bumped } ]
        /\ UNCHANGED   << networkVars, clientVars, 
                          vLog, vViewID, vReplicaClock,
                          vLastNormView, vViewChanges, vReplicaStatus,
                          vSyncPoint, vLateBuffer, 
                          vTentativeSync, vSyncReps, vCommitPoint, 
                          vUUIDCounter, vCrashVector, 
                          vCrashVectorReps, vRecoveryReps >> 
     \* Case 3: same ordering as case 2 but r is a follower. Followers' early
     \*         buffers do not accept such a request, so it goes directly
     \*         into the late buffer
     \/ /\ EntryLessThan(mlog, lastDone)
        /\ r /= Leader(vViewID[r])
        /\ vLateBuffer' = [ vLateBuffer EXCEPT ![r] = @ \cup { mlog } ]
        /\ UNCHANGED   << networkVars, clientVars, 
                          vLog, vEarlyBuffer, vViewID, vReplicaClock,
                          vLastNormView, vViewChanges, vReplicaStatus,
                          vSyncPoint, vTentativeSync, 
                          vSyncReps, vCommitPoint,  
                          vUUIDCounter, vCrashVector, 
                          vCrashVectorReps, vRecoveryReps >> 

                                              
\* Release the eligible requests from vEarlyBuffer[r], append them to vLog[r],
\* and send one fast reply per newly appended entry.
\* An entry is released when its deadline is strictly below r's clock and it is
\* ordered after the current last log entry. Every entry whose deadline is
\* below the clock leaves the buffer, whether or not it was appended.
\* A leader additionally advances its sync point to the end of its log.
FlushEarlyBuffer(r) ==
    LET 
       now         == vReplicaClock[r]
       released    == GetSortLogSeq({ e \in vEarlyBuffer[r] : 
                                        /\ e.deadline < now \* < rather than <= 
                                        /\ EntryLessThan(LastLog(vLog[r]), e) })
       newLog      == vLog[r] \o released
       fastReplies == { [ mtype       |-> MFastReply,
                          sender      |-> r,
                          dest        |-> newLog[i].clientID,
                          viewID      |-> vViewID[r],
                          requestID   |-> newLog[i].requestID,
                          hash        |-> [ log |-> SubSeq(newLog, 1, i),
                                            cv  |-> vCrashVector ],
                          deadline    |-> newLog[i].deadline,
                          logSlotNum  |-> i
                        ] : i \in (Len(vLog[r]) + 1)..Len(newLog) }
    IN
    /\  vLog' = [ vLog EXCEPT ![r] = newLog ]
    /\  vEarlyBuffer' = [ vEarlyBuffer EXCEPT ![r] 
                        = { e \in vEarlyBuffer[r] : e.deadline >= now } ] \* >= rather than >
    /\  Send(fastReplies)
    /\  vSyncPoint' = IF r = Leader(vViewID[r]) 
                      THEN [ vSyncPoint EXCEPT ![r] = Len(newLog) ]
                      ELSE vSyncPoint
    /\  UNCHANGED << clientVars, vViewID, vLastNormView, vViewChanges,
                     vReplicaStatus, vReplicaClock, vLateBuffer,
                     vTentativeSync, vSyncReps, vCommitPoint,
                     vUUIDCounter, vCrashVector, 
                     vCrashVectorReps, vRecoveryReps >> 

\* The clock can jump to a random value (RandomElement(1..MaxTime)),
\* because a clock sync algorithm can give a negative offset, or even fail.
\* Nezha depends on clocks for performance but not for correctness.
\* Once the replica clock has reached MaxTime it no longer moves.
\* Moving the clock lets replicas release requests and append them to their logs.
ReplicaClockMove(r) ==
    /\ vReplicaClock' = IF vReplicaClock[r] < MaxTime 
                        THEN [ vReplicaClock EXCEPT ![r] = RandomElement(1..MaxTime) ]
                        ELSE vReplicaClock
    /\ UNCHANGED << networkVars, clientVars, 
                    vLog, vEarlyBuffer, vViewID, 
                    vLastNormView, vViewChanges, vReplicaStatus,
                    vSyncPoint, vLateBuffer, vTentativeSync,
                    vSyncReps, vCommitPoint,
                    vUUIDCounter, vCrashVector, 
                    vCrashVectorReps, vRecoveryReps >>
\* Client c's clock jumps to a random value in 1..MaxTime, unless it has already
\* reached MaxTime. Nothing else changes.
ClientClockMove(c) == 
    /\ vClientClock' = IF vClientClock[c] < MaxTime 
                       THEN [ vClientClock EXCEPT ![c] = RandomElement(1..MaxTime) ]
                       ELSE vClientClock
    /\ UNCHANGED << networkVars, replicaVars, vClientReqNum >>

                      
 
--------------------------------------------------------------------------------
\* `^\textbf{\large Index Synchronization to Fix Set Inequality}^'

\* Leader replica r starts index synchronization: it sends the set of indices
\* of all its log entries to every replica. Enabled only when r is the leader
\* of its view, is in normal status, and has at least one log entry.
StartIndexSync(r) ==
  LET 
    leaderIndices == { GetLogIndex(vLog[r][i]) : i \in DOMAIN vLog[r] }
    syncMsgs      == { [ mtype      |-> MIndexSync,
                         sender     |-> r,
                         dest       |-> d,
                         viewID     |-> vViewID[r],
                         logindcies |-> leaderIndices ] : d \in Replicas }
  IN
  /\ r = Leader(vViewID[r])
  /\ vReplicaStatus[r]  = StNormal
  /\ leaderIndices /= {}  \* leader has log entries to sync
  /\ Send(syncMsgs)
  /\ UNCHANGED << clientVars, replicaVars >>

                       
\* Sorted sequence of the entries of logSeq that match (EntryEq) some index in indices
GetSyncLogs(logSeq, indices) == 
    GetSortLogSeq({ e \in Seq2Set(logSeq) : \E idx \in indices : EntryEq(idx, e) })

\* Sorted sequence of the entries of logSeq that are ordered after lastSyncedLog
GetUnSyncLogs(logSeq, lastSyncedLog) == 
    GetSortLogSeq({ e \in Seq2Set(logSeq) : EntryLessThan(lastSyncedLog, e) })
        
\* Follower replica r receives IndexSync message, m, from the leader of its view.
\* m.logindcies is the SET of indices of the leader's log entries (built in
\* StartIndexSync), so its size is taken with Cardinality, not Len.
\*  - If r lacks some of the leader's entries, it asks the other replicas for
\*    them (MMissEntryRequest) and leaves its log and sync point untouched.
\*  - Otherwise r rebuilds its log as the entries matching the LEADER's indices
\*    (the synced part) followed by its own entries ordered after the last
\*    synced one, moves its sync point to the end of the synced part, and
\*    sends a slow reply for every synced slot.
HandleIndexSync(r, m) ==
  /\ r /= Leader(vViewID[r])
  /\ vReplicaStatus[r] = StNormal
  /\ m.viewID = vViewID[r]
  /\ m.sender = Leader(vViewID[r])
  /\ vSyncPoint[r] < Cardinality(m.logindcies)
  /\ LET 
        indices == { GetLogIndex(vLog[r][i]) : i \in 1..Len(vLog[r]) }
        missedEntries == m.logindcies \ indices
     IN
        \* Missing some log entries -> Send MMissEntryRequest
        IF Cardinality(missedEntries) > 0 THEN
            /\ Send({[  mtype      |-> MMissEntryRequest,
                        sender     |-> r,
                        dest       |-> d,
                        viewID     |-> vViewID[r],
                        miss       |-> missedEntries ] : d \in (Replicas \ {r} ) })
            /\ UNCHANGED << vLog, vSyncPoint >>
        \* No missing entries, update vLog and vSyncPoint, and send relevant slow replies
        ELSE
            LET 
                \* Only entries the leader also holds count as synced; selecting
                \* by r's own indices would mark the entire local log as synced
                syncLogs ==  GetSyncLogs(vLog[r], m.logindcies)
                unsyncLogs ==  GetUnSyncLogs(vLog[r], LastLog(syncLogs))
            IN
            /\ vLog' = [ vLog EXCEPT ![r] = syncLogs \o unsyncLogs ]
            /\ vSyncPoint' = [ vSyncPoint EXCEPT ![r] = Len(syncLogs) ]
            /\ Send({[   mtype      |-> MSlowReply,
                         sender     |-> r,
                         dest       |-> vLog'[r][i].clientID,
                         viewID     |-> vViewID[r],
                         requestID  |-> vLog'[r][i].requestID,
                         logSlotNum |-> i ] : i \in (1..Len(syncLogs))})
            
  /\ UNCHANGED << clientVars, vEarlyBuffer, vViewID, vReplicaClock, 
                 vLastNormView, vViewChanges,  vReplicaStatus, 
                 vLateBuffer, vTentativeSync, vSyncReps, vCommitPoint,
                 vUUIDCounter, vCrashVector, 
                 vCrashVectorReps, vRecoveryReps>>


\* Set of entries of log that match (IndexEq) at least one index in indices
FindEntries(log, indices) == 
    LET
        Wanted(e) == \E idx \in indices : IndexEq(e, idx)
    IN
        { e \in Seq2Set(log) : Wanted(e) }

\* Replica r receives a request from another replica asking for missing log entries.
\* r answers only if the request is for its current view and it holds at least
\* one of the requested entries; the reply carries every requested entry r has.
HandleMissEntryRequest(r, m) == 
  LET 
    found == FindEntries(vLog[r], m.miss)
    reply == [ mtype      |-> MMissEntryReply,
               sender     |-> r,
               dest       |-> m.sender,
               viewID     |-> vViewID[r],
               entries    |-> found ]
  IN
  /\ m.viewID = vViewID[r]
  /\ found /= {}
  /\ Send({ reply })
  /\ UNCHANGED << clientVars, replicaVars >>  
       

   
\* Replica r receives a reply from another replica providing missing entries.
\* For a reply in r's current view, the entries are merged into r's log and
\* the log is re-sorted. No message is sent.
HandleMissEntryReply(r, m) == 
    LET
        merged == GetSortLogSeq(Seq2Set(vLog[r]) \union m.entries)
    IN
    /\ m.viewID = vViewID[r]
    /\ vLog' = [ vLog EXCEPT ![r] = merged ]
    /\ UNCHANGED << networkVars, clientVars, 
                    vEarlyBuffer, vViewID, vReplicaClock, 
                    vLastNormView, vViewChanges, vReplicaStatus,
                    vSyncPoint, vLateBuffer, 
                    vTentativeSync, vSyncReps, vCommitPoint,
                    vUUIDCounter, vCrashVector, 
                    vCrashVectorReps, vRecoveryReps >>

                       
--------------------------------------------------------------------------------
\* `^\textbf{\large Replica Rejoin}^'
\* Replica r fails and loses all of its state: every per-replica variable of r
\* is reset, except its clock and its UUID counter, which are left unchanged.
StartReplicaFail(r) == 
    LET
        \* f with the slot of replica r overwritten by v
        Wipe(f, v) == [ f EXCEPT ![r] = v ]
    IN
    /\ NumofReplicas(StRecovering) < F \* We assume at most F replicas can fail at the same time
    /\ vReplicaStatus'   = Wipe(vReplicaStatus, StRecovering)
    \* Log and buffers
    /\ vLog'             = Wipe(vLog, <<>>)
    /\ vEarlyBuffer'     = Wipe(vEarlyBuffer, {})
    /\ vLateBuffer'      = Wipe(vLateBuffer, {})
    \* View state
    /\ vViewID'          = Wipe(vViewID, 1)
    /\ vLastNormView'    = Wipe(vLastNormView, 1)
    /\ vViewChanges'     = Wipe(vViewChanges, {})
    \* Synchronization state
    /\ vSyncPoint'       = Wipe(vSyncPoint, 0)
    /\ vTentativeSync'   = Wipe(vTentativeSync, 0)
    /\ vSyncReps'        = Wipe(vSyncReps, {})
    /\ vCommitPoint'     = Wipe(vCommitPoint, 0)
    \* Recovery state
    /\ vCrashVector'     = Wipe(vCrashVector, [ rr \in Replicas |-> 0 ])
    /\ vCrashVectorReps' = Wipe(vCrashVectorReps, {})
    /\ vRecoveryReps'    = Wipe(vRecoveryReps, {})
    /\ UNCHANGED << vReplicaClock, vUUIDCounter, clientVars, networkVars >>




                       
\* Recovering replica r starts recovery: it increments its UUID counter and
\* sends an MCrashVectorReq carrying the new counter value as nonce to every replica.
StartReplicaRecovery(r) ==
    LET
        freshNonce == vUUIDCounter[r] + 1
        requests   == { [ mtype  |-> MCrashVectorReq,
                          sender |-> r,
                          dest   |-> d,
                          nonce  |-> freshNonce ] : d \in Replicas }
    IN
    /\ vReplicaStatus[r] = StRecovering
    /\ vUUIDCounter' = [ vUUIDCounter EXCEPT ![r] = freshNonce ]
    /\ Send(requests)
    /\ UNCHANGED << vLog, vEarlyBuffer, vViewID, vReplicaClock,
                    vLastNormView, vViewChanges, vReplicaStatus,
                    vSyncPoint, vLateBuffer,
                    vTentativeSync, vSyncReps, vCommitPoint, 
                    vCrashVector, vCrashVectorReps, vRecoveryReps,
                    clientVars  >>
                       
                       
\* Normal replica r receives MCrashVectorReq, m, and replies to the sender
\* with its current crash vector, echoing the nonce of the request.
HandleCrashVectorReq(r, m) ==
    LET
        reply == [ mtype  |-> MCrashVectorRep,
                   sender |-> r,
                   dest   |-> m.sender,
                   nonce  |-> m.nonce,
                   cv     |-> vCrashVector[r] ]
    IN
    /\ vReplicaStatus[r] = StNormal
    /\ Send({ reply })
    /\ UNCHANGED << replicaVars,  clientVars >>
    


\* Recovering replica r receives MCrashVectorRep, m.
\* r accepts replies that echo its current nonce until it holds F + 1 of them,
\* merging each reply's crash vector into its own. On the (F + 1)-th reply it
\* sends MRecoveryReq with the merged crash vector to every replica.
HandleCrashVectorRep(r, m) ==
    /\ vReplicaStatus[r] = StRecovering
    /\ vUUIDCounter[r] = m.nonce
    /\ Cardinality(vCrashVectorReps[r]) <= F
    /\ ~DuplicateRep(vCrashVectorReps[r],m)
    /\ vCrashVectorReps' = [ vCrashVectorReps EXCEPT ![r] = vCrashVectorReps[r] \cup {m} ]
    /\ vCrashVector' = [ vCrashVector EXCEPT ![r] = MergeCrashVector(vCrashVector[r], m.cv) ] 
    \* Count r's own reply set: vCrashVectorReps' is a function over replicas,
    \* so the cardinality must be taken of vCrashVectorReps'[r]
    /\ IF Cardinality(vCrashVectorReps'[r]) = F + 1 THEN  \* got enough replies and can settle down cv
        Send({[ mtype  |-> MRecoveryReq,
                sender |-> r,
                dest   |-> d,
                nonce  |-> m.nonce,
                cv     |-> vCrashVector'[r] ]: d \in Replicas })
       ELSE
        UNCHANGED << networkVars >>

    /\ UNCHANGED <<vLog, vEarlyBuffer, vViewID, vReplicaClock,
                    vLastNormView, vViewChanges,vReplicaStatus,
                    vSyncPoint, vLateBuffer,
                    vTentativeSync, vSyncReps, vCommitPoint, 
                    vUUIDCounter, vRecoveryReps,
                    clientVars >>



\* Normal replica r receives MRecoveryReq, m.
\* r merges the crash vector of m into its own and replies to the sender with
\* its view ID and the merged crash vector.
HandleRecoveryReq(r, m) == 
    /\ vReplicaStatus[r] = StNormal
    /\ vCrashVector' = [ vCrashVector EXCEPT ![r] = MergeCrashVector(vCrashVector[r], m.cv) ] 
    \* A single reply addressed to the requester: the message does not depend
    \* on any destination variable, so no set comprehension is needed
    /\ Send({[  mtype  |-> MRecoveryRep,
                sender |-> r,
                dest   |-> m.sender,
                viewID |-> vViewID[r],
                cv     |-> vCrashVector'[r] ]})

    /\ UNCHANGED  << vLog, vEarlyBuffer, vViewID, vReplicaClock,
                    vLastNormView, vViewChanges,vReplicaStatus,
                    vSyncPoint, vLateBuffer,
                    vTentativeSync, vSyncReps, vCommitPoint, 
                    vUUIDCounter, vCrashVectorReps, vRecoveryReps,
                    clientVars   >>



\* Recovering replica r receives MRecoveryRep, m.
\* r checks m against its crash vector, stores the reply (dropping replies that
\* the updated crash vector exposes as stray), and once it holds F + 1 replies
\* asks the leader of the highest reported view for a state transfer.
HandleRecoveryRep(r, m) ==
    /\ vReplicaStatus[r] = StRecovering
    /\ Cardinality(vRecoveryReps[r]) <= F
    \* NOTE(review): DuplicateRep is given m.sender here, while HandleCrashVectorRep
    \* passes the whole message -- confirm against the definition of DuplicateRep
    /\ ~DuplicateRep(vRecoveryReps[r], m.sender)
    /\ CheckCrashVector(m, r)
(* `~
    /\ vRecoveryReps' = [ vRecoveryReps EXCEPT 
                          ![r] = vRecoveryReps[r] \cup {m}  ]
~'
*)
\* Note: After the crash vector is updated, previously accepted messages may also become stray messages.
\* Those messages should also be filtered out.
    /\ vRecoveryReps' = [ vRecoveryReps EXCEPT 
                          ![r] = FilterStrayMessage(vRecoveryReps[r] \cup {m}, vCrashVector'[r] )  ]
           
    \* Count r's own reply set: vRecoveryReps' is a function over replicas
    /\ IF Cardinality(vRecoveryReps'[r]) = F + 1 THEN  \* got enough replies
        LET 
            newView == Max({ mm.viewID : mm \in vRecoveryReps'[r] })
        IN 
            \* Address the request to the leader replica itself via Leader(),
            \* as the other handlers do, rather than to a raw number
            Send({[ mtype  |-> MStateTransferReq,
                    sender |-> r,
                    dest   |-> Leader(newView),
                    cv     |-> vCrashVector'[r] ]})
       ELSE
        UNCHANGED << networkVars >>

    /\ UNCHANGED <<vLog, vEarlyBuffer, vViewID, vReplicaClock,
                    vLastNormView, vViewChanges,vReplicaStatus,
                    vSyncPoint, vLateBuffer,
                    vTentativeSync, vSyncReps, vCommitPoint, 
                    vUUIDCounter, vCrashVectorReps,
                    clientVars >>



\* Normal replica r receives MStateTransferReq, m.
\* After checking m against its crash vector, r replies with its log, sync
\* point, commit point, view ID and (updated) crash vector.
HandleStateTransferReq(r, m) == 
    /\ vReplicaStatus[r] = StNormal
    /\ CheckCrashVector(m, r)
    /\ Send({[  mtype  |-> MStateTransferRep,
                sender |-> r,
                dest   |-> m.sender,
                \* HandleStateTransferRep installs m.viewID as the new view of the
                \* recovering replica, so the reply must carry the view ID
                viewID |-> vViewID[r],
                log    |-> vLog[r],
                sp     |-> vSyncPoint[r],
                cp     |-> vCommitPoint[r],
                cv     |-> vCrashVector'[r] ]})
    /\ UNCHANGED  << vLog, vEarlyBuffer, vViewID, vReplicaClock,
                    vLastNormView, vViewChanges,vReplicaStatus,
                    vSyncPoint, vLateBuffer,
                    vTentativeSync, vSyncReps, vCommitPoint, 
                    vUUIDCounter, vCrashVectorReps, vRecoveryReps,
                    clientVars >>


\* Recovering replica r receives MStateTransferRep, m.
\* After checking m against its crash vector, r installs the transferred log,
\* sync point, commit point and view ID, clears its buffers and collected
\* replies, and returns to normal status. No message is sent.
HandleStateTransferRep(r, m) == 
    /\ vReplicaStatus[r] = StRecovering
    /\ CheckCrashVector(m, r)
    /\ vLog' = [ vLog EXCEPT ![r] = m.log ]
    /\ vSyncPoint' = [ vSyncPoint EXCEPT ![r] = m.sp ]
    /\ vCommitPoint' = [ vCommitPoint EXCEPT ![r] = m.cp ]
    /\ vViewID' = [ vViewID EXCEPT  ![r] = m.viewID ]
    /\ vEarlyBuffer' = [ vEarlyBuffer EXCEPT ![r] = {} ] 
    /\ vLastNormView' = [ vLastNormView EXCEPT ![r] = m.viewID ]
    /\ vViewChanges' = [vViewChanges EXCEPT ![r] = {} ]
    /\ vReplicaStatus' = [ vReplicaStatus EXCEPT ![r] = StNormal ]
    /\ vLateBuffer' = [ vLateBuffer EXCEPT ![r] = {} ]
    /\ vTentativeSync' = [ vTentativeSync EXCEPT ![r] = m.sp ]
    /\ vSyncReps' = [ vSyncReps EXCEPT ![r] = {} ]
    /\ vCrashVectorReps' = [ vCrashVectorReps EXCEPT ![r] = {} ]
    /\ vRecoveryReps' = [ vRecoveryReps EXCEPT ![r]= {} ]
    \* The network must be listed explicitly: this step sends nothing, and
    \* leaving networkVars out would let messages' take an arbitrary value
    /\ UNCHANGED  << networkVars, vReplicaClock, vUUIDCounter, clientVars >>


 
--------------------------------------------------------------------------------
\* `^\textbf{\large Leader Change}^'

\* Replica r starts a leader change: it sends an MViewChangeReq for the view
\* after its current one to every replica. r's own state does not change here.
StartLeaderChange(r) ==
  LET
    nextView == vViewID[r] + 1
    requests == { [ mtype  |-> MViewChangeReq,
                    sender |-> r,
                    dest   |-> d,
                    viewID |-> nextView,
                    cv     |-> vCrashVector[r] ] : d \in Replicas }
  IN
  /\ Send(requests)
  /\ UNCHANGED << replicaVars, clientVars >>
  
  
\* `^\textbf{View Change Handlers}^'
\* Replica r gets MViewChangeReq, m.
\* If m proposes a view higher than r's current one, r enters view-change
\* status for that view, sends its MViewChange (last normal view, synced and
\* unsynced parts of its log) to the leader of the new view, and re-broadcasts
\* the MViewChangeReq.
HandleViewChangeReq(r, m) ==
  LET
    newViewID      == Max({vViewID[r], m.viewID})
    viewChangeMsg  == [ mtype      |-> MViewChange,
                        dest       |-> Leader(newViewID),
                        sender     |-> r,
                        viewID     |-> newViewID,
                        lastNormal |-> vLastNormView[r],
                        syncedLog  |-> SubSeq(vLog[r], 1, vSyncPoint[r]),
                        unsyncedLog|-> SubSeq(vLog[r], vSyncPoint[r]+1, Len(vLog[r])),
                        cv         |-> vCrashVector[r] ]
    \* Forwarded in case this is an entirely new view for the other replicas
    viewChangeReqs == { [ mtype  |-> MViewChangeReq,
                          sender |-> r,
                          dest   |-> d,
                          viewID |-> newViewID,
                          cv     |-> vCrashVector[r] ] : d \in Replicas }
  IN
  \* Recovering replica does not participate in view change
  /\ vReplicaStatus[r] /= StRecovering
  /\ vViewID[r] /= newViewID
  /\ CheckCrashVector(m, r)
  /\ vReplicaStatus' = [ vReplicaStatus EXCEPT ![r] = StViewChange ]
  /\ vViewID'        = [ vViewID EXCEPT ![r] = newViewID ]
  /\ vViewChanges'   = [ vViewChanges EXCEPT ![r] = {} ]
  /\ Send({ viewChangeMsg } \cup viewChangeReqs)
  /\ UNCHANGED << clientVars, vLog, vEarlyBuffer, vReplicaClock,
                  vLastNormView, vSyncPoint, vLateBuffer, 
                  vTentativeSync, vSyncReps, vCommitPoint,
                  vUUIDCounter, vCrashVectorReps, vRecoveryReps >>

                       
\* Replica r receives MViewChange, m.
\* Only the leader of the view named in m, while in view-change status,
\* collects these messages. Once messages from a quorum that includes r itself
\* are present, r builds the log of the new view and sends MStartView to every
\* replica.
HandleViewChange(r, m) ==
  \* Recovering replica does not participate in view change
  /\ vReplicaStatus[r] /= StRecovering
  \* The message must be for r's current view, and r must be changing view
  /\ vViewID[r]         = m.viewID
  /\ vReplicaStatus[r]  = StViewChange
  \* This replica is the leader
  /\ Leader(vViewID[r]) = r
  /\ CheckCrashVector(m, r)
(* `~
  /\ vViewChanges' = [ vViewChanges EXCEPT ![r] = vViewChanges[r] \cup {m}]
~'
*)
  \* Note: Similar to vRecoveryReps, (potential) stray messages should be filtered out.
  /\ vViewChanges' = [ vViewChanges EXCEPT 
                       ![r] = FilterStrayMessage(vViewChanges[r] \cup {m}, vCrashVector'[r]) ]
  \* If there are enough replies, start the new view
  /\ LET
       isViewPromise(M) == /\ { n.sender : n \in M } \in Quorums
                           /\ \E n \in M : n.sender = r
       vCMs             == { n \in vViewChanges'[r] :
                               /\ n.mtype  = MViewChange
                               /\ n.viewID = vViewID[r] }
       \* Create the state for the new view
       normalViews  == { n.lastNormal : n \in vCMs }
       \* Choose the largest normal view (i.e. the newest)
       lastNormal     == (CHOOSE v \in normalViews : \A v2 \in normalViews : v2 <= v)
       \* For logs before vSyncPoint (i.e. syncedLog), we directly copy from the bestCandidate
       \* For unsyncedLog, we do a quorum check to decide which entries go into the recovered log
       goodCandidates ==  { o \in vCMs : o.lastNormal = lastNormal }
       \* bestCandidate can only be picked from goodCandidates, 
       \* because previous views may include invalid logs
       bestCandidate  == CHOOSE n \in goodCandidates: 
                            \A y \in goodCandidates: Len(n.syncedLog) >= Len(y.syncedLog)
       unSyncedLogs   == { Seq2Set(n.unsyncedLog) : n \in goodCandidates }

     IN
       IF isViewPromise(vCMs) THEN
         \* sender and cv are included because HandleStartView runs
         \* CheckCrashVector on this message, which reads m.cv[m.sender]
         Send({[ mtype      |-> MStartView,
                 sender     |-> r,
                 dest       |-> d,
                 viewID     |-> vViewID[r],
                 log        |-> bestCandidate.syncedLog 
                                \o MergeUnSyncLogs(unSyncedLogs, LastLog(bestCandidate.syncedLog)),
                 cv         |-> vCrashVector'[r]
               ] : d \in Replicas })
       ELSE
         UNCHANGED networkVars
  /\ UNCHANGED << clientVars,  vLog,  vEarlyBuffer, vViewID, vReplicaClock, 
                  vLastNormView, vReplicaStatus, vSyncPoint, vLateBuffer,
                  vTentativeSync, vSyncReps,vCommitPoint, 
                  vUUIDCounter, vCrashVectorReps, vRecoveryReps >>


                       
\* Replica r receives a MStartView, m.
\* r adopts the log and view of m if m is for a higher view, or for r's current
\* view while r is still in view-change status. r then returns to normal
\* status, clears both buffers, marks the whole log as synced, and replies for
\* every log slot: the leader of the new view with fast replies, followers
\* with slow replies.
HandleStartView(r, m) ==
  /\ vReplicaStatus[r] /= StRecovering
  /\ \/ vViewID[r]   < m.viewID
     \/ vViewID[r]   = m.viewID /\ vReplicaStatus[r] = StViewChange
  \* CheckCrashVector assigns vCrashVector', so vCrashVector must NOT appear
  \* in the UNCHANGED tuple at the end of this action
  /\ CheckCrashVector(m, r)
  /\ vLog'           = [ vLog EXCEPT ![r] = m.log ]
  /\ vReplicaStatus' = [ vReplicaStatus EXCEPT ![r] = StNormal ]
  /\ vViewID'        = [ vViewID EXCEPT ![r] = m.viewID ]
  /\ vLastNormView'  = [ vLastNormView EXCEPT ![r] = m.viewID ]
  /\ vEarlyBuffer' = [ vEarlyBuffer EXCEPT ![r] = {} ] \* clear Early Buffer for the new view
  /\ vLateBuffer' = [ vLateBuffer EXCEPT ![r] = {}] \* clear Late Buffer for the new view
  /\ vSyncPoint' = [ vSyncPoint EXCEPT![r] = Len(m.log) ]  
  /\ vTentativeSync' = [ vTentativeSync EXCEPT![r] = Len(m.log) ]
  \* Send replies (in the new view) for all log items
  /\ IF r = Leader(m.viewID) THEN   \* Leader only sends fast reply
        Send({[  mtype      |-> MFastReply,
                 sender     |-> r,
                 dest       |-> m.log[i].clientID,
                 viewID     |-> m.viewID,
                 requestID  |-> m.log[i].requestID,
                 hash       |-> [
                                    log |-> SubSeq(m.log, 1, i),
                                    cv  |-> vCrashVector
                                ],
                 deadline   |-> m.log[i].deadline,
                 logSlotNum |-> i ] : i \in (1..Len(m.log))})
     ELSE \* When starting a view, followers know the log is synced with the leader, so they send slow replies
        Send({[  mtype      |-> MSlowReply,
                 sender     |-> r,
                 dest       |-> m.log[i].clientID,
                 viewID     |-> m.viewID,
                 requestID  |-> m.log[i].requestID,
                 logSlotNum |-> i ] : i \in (1..Len(m.log))})
  /\ UNCHANGED << clientVars, vReplicaClock, vViewChanges, 
                  vSyncReps, vCommitPoint,
                  vUUIDCounter, vCrashVectorReps, vRecoveryReps >> 
                       
--------------------------------------------------------------------------------
\* `^\textbf{\large Periodic Synchronization}^'
\* Leader replica r conducts synchronization periodically.
\* This periodic sync process is different from the index sync process:
\* it ensures that all replicas' logs are stable up to their CommitPoint (for fast recovery).
\* Our CommitPoint is essentially the `^\emph{sync-point}^' defined in the NOPaxos paper. 
\* As mentioned in the NOPaxos paper, it is an optional optimization for fast recovery;
\* Nezha still works even without this part.
StartSync(r) ==
  LET
    logLen   == Len(vLog[r])
    prepares == { [ mtype      |-> MSyncPrepare,
                    sender     |-> r,
                    dest       |-> d,
                    viewID     |-> vViewID[r],
                    log        |-> vLog[r] ] : d \in Replicas }
  IN
  /\ Leader(vViewID[r]) = r
  /\ vReplicaStatus[r]  = StNormal
  /\ vTentativeSync[r] < logLen  \* If >= then no need to sync
  \* Start a fresh round: forget earlier replies and target the whole current log
  /\ vSyncReps'         = [ vSyncReps EXCEPT ![r] = {} ]
  /\ vTentativeSync'    = [ vTentativeSync EXCEPT ![r] = logLen ]
  /\ Send(prepares)
  /\ UNCHANGED << clientVars, vLog, vEarlyBuffer, vViewID, vReplicaClock, 
                  vLastNormView, vViewChanges, vReplicaStatus,
                  vSyncPoint, vLateBuffer, vCommitPoint,
                  vUUIDCounter, vCrashVector, 
                  vCrashVectorReps, vRecoveryReps >>

                       
            
\* Replica r receives MSyncPrepare, m, from the leader of its current view.
\* If m.log is longer than r's sync point, r adopts m.log followed by its own
\* entries ordered after the last entry of m.log, advances its sync point, and
\* sends a slow reply for every slot of m.log. In either case r acknowledges
\* with an MSyncRep.
\* All messages of one step go through a single Send: Send fixes messages', so
\* two separate Send conjuncts would contradict each other.
HandleSyncPrepare(r, m) ==
  LET
    newLog == m.log \o GetUnSyncLogs(vLog[r], LastLog(m.log) )
    slowReplies == {[   mtype      |-> MSlowReply,
                        sender     |-> r,
                        dest       |-> m.log[i].clientID,
                        viewID     |-> m.viewID,
                        requestID  |-> m.log[i].requestID,
                        logSlotNum |-> i ] : i \in (1..Len(m.log))}
    syncRep == [ mtype         |-> MSyncRep,
                 sender        |-> r,
                 dest          |-> m.sender,
                 viewID        |-> vViewID[r],
                 logSlotNumber |-> Len(m.log) ]
  IN
  /\ vReplicaStatus[r] = StNormal
  /\ m.viewID          = vViewID[r]
  /\ m.sender          = Leader(vViewID[r])
  /\ IF     vSyncPoint[r]  < Len(m.log) THEN
            /\ vSyncPoint' = [vSyncPoint EXCEPT ![r] = Len(m.log)]
            /\ vLog'       = [ vLog EXCEPT ![r] = newLog ]
            /\ Send(slowReplies \cup { syncRep })
     ELSE
            /\ UNCHANGED <<vLog, vSyncPoint >>
            /\ Send({ syncRep })
  /\ UNCHANGED <<clientVars, vEarlyBuffer, vViewID,  vReplicaClock,
                 vLastNormView, vViewChanges, vReplicaStatus, 
                 vLateBuffer, vTentativeSync, vSyncReps, vCommitPoint,
                 vUUIDCounter, vCrashVector, 
                 vCrashVectorReps, vRecoveryReps>>

                       
\* Replica r receives MSyncRep, m.
\* r records the reply. When the replies for the current view and the current
\* tentative sync point come from a quorum that includes r itself, r sends
\* MSyncCommit with the log prefix up to the tentative sync point to every
\* replica and advances its commit point to that position.
HandleSyncRep(r, m) ==
  LET
    target       == vTentativeSync[r]
    collected    == vSyncReps[r] \cup { m }
    matching     == { n \in collected :
                        /\ n.mtype         = MSyncRep
                        /\ n.viewID        = vViewID[r]
                        /\ n.logSlotNumber = target }
    quorumAcked  == /\ { n.sender : n \in matching } \in Quorums
                    /\ \E n \in matching : n.sender = r
    committedLog == IF target >= 1 THEN SubSeq(vLog[r], 1, target) ELSE << >>
    commits      == { [ mtype         |-> MSyncCommit,
                        sender        |-> r,
                        dest          |-> d,
                        viewID        |-> vViewID[r],
                        log           |-> committedLog ] : d \in Replicas }
  IN
  /\ m.viewID          = vViewID[r]
  /\ vReplicaStatus[r] = StNormal
  /\ vSyncReps'        = [ vSyncReps EXCEPT ![r] = collected ]
  /\ IF quorumAcked THEN
       /\ Send(commits)
       /\ vCommitPoint' = [ vCommitPoint EXCEPT ![r] = target ]
     ELSE
       UNCHANGED << networkVars, vCommitPoint >>
  /\ UNCHANGED  << clientVars, vLog, vEarlyBuffer, vViewID,
                   vReplicaClock, vLastNormView, vViewChanges,  
                   vReplicaStatus, vSyncPoint, vLateBuffer, 
                   vTentativeSync, vUUIDCounter, vCrashVector, 
                   vCrashVectorReps, vRecoveryReps >>


\* Replica r receives MSyncCommit, m, from the leader of its current view.
\* If m.log reaches beyond r's commit point, r adopts m.log followed by its own
\* entries ordered after the last entry of m.log, advances its commit point,
\* and sends a slow reply for every slot of m.log. Otherwise nothing changes.
HandleSyncCommit(r, m) ==
  LET
    newLog == m.log \o GetUnSyncLogs(vLog[r], LastLog(m.log) )
  IN
  /\ vReplicaStatus[r] = StNormal
  /\ m.viewID          = vViewID[r]
  /\ m.sender          = Leader(vViewID[r])
  /\ IF  Len(m.log) <=  vCommitPoint[r] THEN
        \* Nothing new to commit: the network is untouched only in this branch
        UNCHANGED << networkVars, vCommitPoint, vLog >>
     ELSE
        /\ vLog'        = [ vLog EXCEPT ![r] = newLog ]
        /\ vCommitPoint'  = [ vCommitPoint  EXCEPT ![r] = Len(m.log) ]
        /\ Send({[ mtype      |-> MSlowReply,
                   sender     |-> r,
                   dest       |-> m.log[i].clientID,
                   viewID     |-> m.viewID,
                   requestID  |-> m.log[i].requestID,
                   logSlotNum |-> i ] : i \in (1..Len(m.log))})
  \* networkVars is deliberately absent here: the ELSE branch above sends
  \* messages, and listing it would contradict that Send
  /\ UNCHANGED << clientVars,  vEarlyBuffer, 
                  vViewID,  vReplicaClock,  vLastNormView, vViewChanges, 
                  vReplicaStatus, vSyncPoint, vLateBuffer, 
                  vTentativeSync, vSyncReps, 
                  vUUIDCounter, vCrashVector, 
                  vCrashVectorReps, vRecoveryReps >>

--------------------------------------------------------------------------------
(* `^\textbf{\large Invariants and Helper Functions}^' *)
    
(*
  A request/log is committed in two possible cases:
  (1) A fast quorum has sent either slow-reply messages, or fast-reply messages with consistent hashes [Fast Path]
  (2) A simple quorum has sent slow-reply messages [Slow Path]
  Both quorums should include the leader
*)

\* Check whether log <clientID, requestID> is committed at position logSlotNum
\* TRUE iff some set M of reply messages in `messages', all addressed to
\* clientID for requestID at slot logSlotNum, satisfies either the fast-path
\* or the slow-path condition below.
Committed(clientID, requestID, logSlotNum) ==
    \* Fast path
    \* Candidate replies: any fast-reply or slow-reply for this request/slot
    \/ \E M \in SUBSET ({m \in messages : /\ \/ m.mtype = MFastReply
                                             \/ m.mtype = MSlowReply 
                                          /\ m.logSlotNum = logSlotNum
                                          /\ m.dest = clientID 
                                          /\ m.requestID = requestID }) :
        \* Sent from a fast quorum
        /\ { m.sender : m \in M } \in FastQuorums
        \* Matching view-id
        /\ \E m1 \in M : \A m2 \in M : m1.viewID = m2.viewID
        \* One from the leader
        /\ \E m \in M : m.sender = Leader(m.viewID)
        \* Hash values are consistent
        \* (every fast-reply in M must carry the hash of the leader's reply)
        /\  LET 
                leaderReply == CHOOSE m \in M : m.sender = Leader(m.viewID)
            IN
            \A m1 \in M : IF m1.mtype = MFastReply THEN 
                             m1.hash = leaderReply.hash 
                          ELSE 
                             TRUE  \* SlowReply has consistent hash for sure
    \* Slow path
    \* Candidate replies: slow-replies, plus fast-replies sent by the leader
    \/ \E M \in SUBSET ({m \in messages : /\ \/ m.mtype = MSlowReply
                                             \/ /\ m.mtype = MFastReply  \* Leader only sends fast-reply
                                                /\ m.sender =Leader(m.viewID)
                                          /\ m.logSlotNum = logSlotNum
                                          /\ m.dest = clientID
                                          /\ m.requestID = requestID }) : 
        \* Sent from a simple quorum
        /\ { m.sender : m \in M } \in Quorums
        \* Matching view-id
        /\ \E m1 \in M : \A m2 \in M : m1.viewID = m2.viewID
        \* One from the leader
        /\ \E m \in M : m.sender = Leader(m.viewID)
 
 
 \* Check whether log <clientID, requestID> is committed in view viewID
 \* Unlike Committed, the candidate replies are filtered by view-id and are
 \* not tied to a particular log slot.
 CommittedInView(clientID, requestID, viewID) ==
    \* Fast path
    \* Candidate replies: any fast-reply or slow-reply for this request in viewID
    \/ \E M \in SUBSET ({m \in messages : /\ \/ m.mtype = MFastReply
                                             \/ m.mtype = MSlowReply 
                                          /\ m.dest = clientID
                                          /\ m.requestID = requestID
                                          /\ m.viewID = viewID}) :
        \* Sent from a fast quorum
        /\ { m.sender : m \in M } \in FastQuorums
        \* One from the leader
        /\ \E m \in M : m.sender = Leader(m.viewID)
        \* Hash values are the same
        \* (every fast-reply in M must carry the hash of the leader's reply)
        /\  LET 
                leaderReply == CHOOSE m \in M : m.sender = Leader(m.viewID)
            IN
            \A m1 \in M : IF m1.mtype = MFastReply THEN 
                             m1.hash = leaderReply.hash 
                          ELSE 
                             TRUE  \* SlowReply has consistent hash for sure
    \* Slow path
    \* Candidate replies: slow-replies, plus fast-replies sent by the leader
    \/ \E M \in SUBSET ({m \in messages : /\ \/ m.mtype = MSlowReply
                                             \/ /\ m.mtype = MFastReply  \* Leader only sends fast-reply
                                                /\ m.sender = Leader(m.viewID)
                                          /\ m.dest = clientID
                                          /\ m.requestID = requestID
                                          /\ m.viewID = viewID}) : 
        \* Sent from a simple quorum
        /\ { m.sender : m \in M } \in Quorums
        \* Hash values are the same
        \* NOTE(review): this reads the hash field of every message in M,
        \* including slow-replies; the MSlowReply record built in
        \* HandleSyncCommit has no hash field -- confirm that every slow-reply
        \* reaching this check carries one, or guard on mtype as in the fast path.
        /\ \E m1 \in M : \A m2 \in M : m1.hash = m2.hash
        \* One from the leader
        /\ \E m \in M : m.sender = Leader(m.viewID)
              
                
\* The system counts as recovered in viewID when (1) at least QuorumSize
\* replicas are in StNormal with a last-normal view >= viewID, and (2) the
\* leader of viewID itself has a last-normal view >= viewID.
SystemRecovered(viewID) ==  /\ \E RM \in SUBSET(Replicas): 
                               /\ Cardinality(RM) >= QuorumSize
                               /\ \A r \in RM: vLastNormView[r] >= viewID
                               /\ \A r \in RM: vReplicaStatus[r] = StNormal \* These replicas must be normal
                            \* The leader of this view has also recovered or even goes beyond this view 
                            /\  vLastNormView[Leader(viewID)] >= viewID

(* `^\textbf{Invariants}^' *)
\* Durability: Committed requests always survive failure,
\* i.e. if a request is committed in one view, then it remains committed in the higher views.
\* Note that the check of "committed" only happens while the system is "normal".
\* While the system is under recovery (i.e. fewer than f+1 replicas are normal),
\* the check of committed does not make sense.
\* Views range over 1..MaxViews, clients over Clients, request ids over 1..MaxReqNum.
Durability == \A v1, v2 \in 1..MaxViews:
                \* If a request is committed in the lower view (v1),
                \* it is impossible for this request to be uncommitted in the higher view (v2)
                   ~(/\ v1 < v2 
                     \* To check Durability of request in higher views, 
                     \* the system should have entered the higher views
                     /\ SystemRecovered(v2)
                     \* A violation: some request committed in v1 but not in v2
                     /\ \E c \in Clients :
                        \E r \in 1..MaxReqNum:
                            /\ CommittedInView(c,r, v1)
                            /\ ~CommittedInView(c,r, v2))

\* Consistency: Committed requests have the same history even after view changes,
\* i.e. if a request is committed in a lower view (v1), then (based on the Durability property)
\* it remains committed in the higher view (v2).
\* Consistency requires the history of the request (i.e. all the requests before this request) to remain the same.
\* The history is compared through the hash carried by the leader's fast-reply in each view.
\* NOTE(review): for a deadline t with no matching leader fast-reply in v1 or v2,
\* the CHOOSE expressions below have no witness; confirm that the model
\* checker never evaluates them in that situation.
Consistency == 
     \A v1, v2 \in 1..MaxViews:   
              ~(/\ v1 < v2
                \* To check Consistency of request in higher views, 
                \* the system should have entered the higher views
                /\ SystemRecovered(v2) 
                /\ \E c \in Clients :
                   \E r \in 1..MaxReqNum:
                   \E t \in 1..MaxTime:
                     \* Durability has been checked in another invariant
                     /\ CommittedInView(c,r, v1)
                     /\ CommittedInView(c,r, v2)
                     /\ LET 
                            \* Fast-reply sent by the leader of v1 for <c, r> with deadline t
                            v1LeaderReply == CHOOSE m \in messages: 
                                                /\ m.mtype = MFastReply
                                                /\ m.deadline = t
                                                /\ m.dest = c 
                                                /\ m.requestID = r
                                                /\ m.viewID = v1
                                                /\ m.sender = Leader(v1) 
                            \* Fast-reply sent by the leader of v2 for <c, r> with deadline t
                            v2LeaderReply == CHOOSE m \in messages: 
                                                /\ m.mtype = MFastReply
                                                /\ m.deadline = t
                                                /\ m.dest = c 
                                                /\ m.requestID = r
                                                /\ m.viewID = v2
                                                /\ m.sender = Leader(v2)                                                                    
                        IN
                           \* A violation: the two leaders report different hashes
                           v1LeaderReply.hash /= v2LeaderReply.hash)  
                            
\* Linearizability: Only one request can be committed for a given position,
\* i.e. if one request has committed at position i, then no contrary observation can be made,
\* i.e. there cannot be a second request committed at the same position.
Linearizability ==
  LET
    \* Largest log slot mentioned by any fast-reply or slow-reply sent so far
    \* (at least 1, so the range below is never empty)
    maxLogPosition == Max({1} \cup
      { m.logSlotNum : m \in {m \in messages : 
                          \/ m.mtype = MFastReply
                          \/ m.mtype = MSlowReply } })
  IN ~(\E c1, c2 \in Clients :
       \E r1, r2 \in 1..MaxReqNum:
         \* Two distinct requests ...
         /\ << c1, r1 >> /= << c2, r2 >>
         \* ... both committed at the same slot i
         /\ \E i \in (1 .. maxLogPosition) :
            /\ Committed(c1, r1, i)
            /\ Committed(c2, r2, i)
      )

(* `~
SyncSafety == \A r \in Replicas :
              \A i \in 1..vSyncPoint[r] :
              IF SystemRecovered(vViewID[r]) THEN
                \* Committed can only be checked when the system is recovered 
                \* (i.e. when there are f+1 replicas alive)
                Committed(vLog[r][i].ta,vLog[r][i].clientID, vLog[r][i].reqID, i)
              ELSE
                TRUE
 ~'
 *)               
--------------------------------------------------------------------------------
(* `^\textbf{\large Main Transition Function}^' *)

\* Next-state relation: exactly one of the disjuncts below fires per step.
\* Message-handling disjuncts are enabled for a message m of the matching
\* type that is not in vReplicaProcessed[m.dest]; after the handler runs,
\* Msg2RLog(m, m.dest) is added to vReplicaProcessed[m.dest].
\* All other disjuncts leave vReplicaProcessed and vClientProcessed unchanged.
\* Every disjunct records the name of the action it took in DebugAction.
Next == \* Handle Messages
    \/ \E m \in messages :
                        /\ m.mtype = MClientRequest
                        /\ m \notin vReplicaProcessed[m.dest]
                        /\ HandleClientRequest(m.dest, m)
                        /\ vReplicaProcessed' =
                            [vReplicaProcessed EXCEPT ![m.dest] =
                            vReplicaProcessed[m.dest] \cup { Msg2RLog(m, m.dest) } ]
                        /\ UNCHANGED vClientProcessed
                        /\ DebugAction' = << "HandleClientRequest", m >>

    \/ \E m \in messages :
                        /\ m.mtype = MViewChangeReq
                        /\ m \notin vReplicaProcessed[m.dest]
                        /\ HandleViewChangeReq(m.dest, m)
                        /\ vReplicaProcessed' =
                            [vReplicaProcessed EXCEPT ![m.dest] =
                            vReplicaProcessed[m.dest] \cup { Msg2RLog(m, m.dest) } ]
                        /\ UNCHANGED vClientProcessed
                        /\ DebugAction' = << "HandleViewChangeReq", m >>

    \/ \E m \in messages :
                        /\ m.mtype = MViewChange
                        /\ m \notin vReplicaProcessed[m.dest]
                        /\ HandleViewChange(m.dest, m)
                        /\ vReplicaProcessed' =
                            [vReplicaProcessed EXCEPT ![m.dest] =
                            vReplicaProcessed[m.dest] \cup { Msg2RLog(m, m.dest) } ]
                        /\ UNCHANGED vClientProcessed
                        /\ DebugAction' = << "HandleViewChange", m >>

    \/ \E m \in messages :
                        /\ m.mtype = MStartView
                        /\ m \notin vReplicaProcessed[m.dest]
                        /\ HandleStartView(m.dest, m)
                        /\ vReplicaProcessed' =
                            [vReplicaProcessed EXCEPT ![m.dest] =
                            vReplicaProcessed[m.dest] \cup { Msg2RLog(m, m.dest) } ]
                        /\ UNCHANGED vClientProcessed
                        /\ DebugAction' = << "HandleStartView", m >>

    \/ \E m \in messages :
                        /\ m.mtype = MSyncPrepare
                        /\ m \notin vReplicaProcessed[m.dest]
                        /\ HandleSyncPrepare(m.dest, m)
                        /\ vReplicaProcessed' =
                            [vReplicaProcessed EXCEPT ![m.dest] =
                            vReplicaProcessed[m.dest] \cup { Msg2RLog(m, m.dest) } ]
                        /\ UNCHANGED vClientProcessed
                        /\ DebugAction' = << "HandleSyncPrepare", m >>

    \/ \E m \in messages :
                        /\ m.mtype = MSyncRep
                        /\ m \notin vReplicaProcessed[m.dest]
                        /\ HandleSyncRep(m.dest, m)
                        /\ vReplicaProcessed' =
                            [vReplicaProcessed EXCEPT ![m.dest] =
                            vReplicaProcessed[m.dest] \cup { Msg2RLog(m, m.dest) } ]
                        /\ UNCHANGED vClientProcessed
                        /\ DebugAction' = << "HandleSyncRep", m >>

    \/ \E m \in messages :
                        /\ m.mtype = MSyncCommit
                        /\ m \notin vReplicaProcessed[m.dest]
                        /\ HandleSyncCommit(m.dest, m)
                        /\ vReplicaProcessed' =
                            [vReplicaProcessed EXCEPT ![m.dest] =
                            vReplicaProcessed[m.dest] \cup { Msg2RLog(m, m.dest) } ]
                        /\ UNCHANGED vClientProcessed
                        /\ DebugAction' = << "HandleSyncCommit", m >>

    \/ \E m \in messages :
                        /\ m.mtype = MMissEntryRequest
                        /\ m \notin vReplicaProcessed[m.dest]
                        /\ HandleMissEntryRequest(m.dest, m)
                        /\ vReplicaProcessed' =
                            [vReplicaProcessed EXCEPT ![m.dest] =
                            vReplicaProcessed[m.dest] \cup { Msg2RLog(m, m.dest) } ]
                        /\ UNCHANGED vClientProcessed
                        /\ DebugAction' = << "HandleMissEntryRequest", m >>

    \/ \E m \in messages :
                        /\ m.mtype = MMissEntryReply
                        /\ m \notin vReplicaProcessed[m.dest]
                        /\ HandleMissEntryReply(m.dest, m)
                        /\ vReplicaProcessed' =
                            [vReplicaProcessed EXCEPT ![m.dest] =
                            vReplicaProcessed[m.dest] \cup { Msg2RLog(m, m.dest) } ]
                        /\ UNCHANGED vClientProcessed
                        /\ DebugAction' = << "HandleMissEntryReply", m >>

    \* Client Actions
    \* (a client may only issue a request while below MaxReqNum)
    \/ \E c \in Clients :
                        /\ vClientReqNum[c] < MaxReqNum
                        /\ ClientSendRequest(c)
                        /\ UNCHANGED << vReplicaProcessed, vClientProcessed >>
                        /\ DebugAction' = << "ClientSendRequest", "" >>

    \* Start Synchronization
    \/ \E r \in Replicas :
                        /\ StartSync(r)
                        /\ UNCHANGED << vReplicaProcessed, vClientProcessed >>
                        /\ DebugAction' = << "StartSync", "" >>

    \* Replica Fail
    \* (only a replica in StNormal can fail)
    \/ \E r \in Replicas :
                        /\ vReplicaStatus[r] = StNormal
                        /\ StartReplicaFail(r)
                        /\ UNCHANGED << vReplicaProcessed, vClientProcessed >>
                        /\ DebugAction' = << "StartReplicaFail", "" >>

    \* Leader Change
    \* (bounded by MaxViews)
    \/ \E r \in Replicas :
                        /\ vViewID[r] < MaxViews
                        /\ StartLeaderChange(r)
                        /\ UNCHANGED << vReplicaProcessed, vClientProcessed >>
                        /\ DebugAction' = << "StartLeaderChange", "" >>

    \* Replica Rejoin
    \* (only a replica in StRecovering can start recovery)
    \/ \E r \in Replicas :
                        /\ vReplicaStatus[r] = StRecovering
                        /\ StartReplicaRecovery(r)
                        /\ UNCHANGED << vReplicaProcessed, vClientProcessed >>
                        /\ DebugAction' = << "StartReplicaRecovery", "" >>

    \* Replica Actions:
    \/ \E r \in Replicas :
                        /\ StartIndexSync(r)
                        /\ UNCHANGED << vReplicaProcessed, vClientProcessed >>
                        /\ DebugAction' = << "StartIndexSync", "" >>

    \* The debug label matches the action name, like every other disjunct
    \/ \E r \in Replicas :
                        /\ FlushEarlyBuffer(r)
                        /\ UNCHANGED << vReplicaProcessed, vClientProcessed >>
                        /\ DebugAction' = << "FlushEarlyBuffer", "" >>

    \* Clock Move
    \/ \E r \in Replicas :
                        /\ ReplicaClockMove(r)
                        /\ UNCHANGED << vReplicaProcessed, vClientProcessed >>
                        /\ DebugAction' = << "ReplicaClockMove", "" >>

    \/ \E c \in Clients :
                        /\ ClientClockMove(c)
                        /\ UNCHANGED << vReplicaProcessed, vClientProcessed >>
                        /\ DebugAction' = << "ClientClockMove", "" >>

                    
================================================================================


================================================
FILE: docs/demo.md
================================================

## One-Box Demo
We have prepared configuration files in the ```configs``` folder; they are used to launch 3 replicas, 1 proxy and 1 client. Under the ```configs``` folder there is a ```local``` folder (for the single-machine test), containing:

- nezha-replica-config-0.yaml 
- nezha-replica-config-1.yaml
- nezha-replica-config-2.yaml
- nezha-proxy-config.yaml
- nezha-client-config.yaml

When running distributed tests, the user can refer to the template files (e.g., ```configs/nezha-replica-config-template.yaml```) to generate their customized config files (such as configuring the IP addresses in the config files). 

Before running the experiment, we assume the user has generated and copied their configuration files into the ```$HOME/Nezha/configs``` folder.

### View Change Test
**Step 1**: Launch 3 replicas (i.e. replica-0, replica-1, replica-2). Open 3 terminals and launch one replica in each terminal.

```
# In the first terminal (replica-0)
$HOME/Nezha/bazel-bin/replica/nezha_replica --config $HOME/Nezha/configs/nezha-replica-config-0.yaml

# In the second terminal (replica-1)
$HOME/Nezha/bazel-bin/replica/nezha_replica --config $HOME/Nezha/configs/nezha-replica-config-1.yaml

# In the third terminal (replica-2)
$HOME/Nezha/bazel-bin/replica/nezha_replica --config $HOME/Nezha/configs/nezha-replica-config-2.yaml

```

![Step-1](figs/nezha-vr-test-figs/step-1.png)

**Step 2**: After the three replicas are launched, we can see the important information displayed in the console logs, e.g., the current view, the id of this replica, the number of replicas, and the number of keys maintained by each replica's state machine (for commutativity optimization).
![Step-2](figs/nezha-vr-test-figs/step-2.png)

**Step 3**: In view 0, the leader replica is ```viewId%replicaNum=0```, i.e., replica-0. Killing replica-0 therefore triggers a view change, so we use Ctrl+C to kill replica-0.
![Step-3](figs/nezha-vr-test-figs/step-3.png)


**Step 4**: After leader is killed, the remaining 2 replicas start view change to enter a new view, i.e., view 1. In this new view, the leader becomes ```viewId%replicaNum=1```, i.e., replica-1. Since there are still a majority of replicas (i.e., 2 replicas) alive, the system can resume service.
![Step-4](figs/nezha-vr-test-figs/step-4.png)

**Step 5**: We want the failed replica to rejoin the system. Therefore, we launch replica-0. This time, we set the flag ```isRecovering``` as true, so that it goes through the recovery procedure and retrieves the state from the other healthy replicas.
```
# In the first terminal 
$HOME/Nezha/bazel-bin/replica/nezha_replica --config $HOME/Nezha/configs/nezha-replica-config-0.yaml --isRecovering true
```

![Step-5](figs/nezha-vr-test-figs/step-5.png)

**Step 6**: We can see that replica-0 rejoins the system as a follower, and the current view is 1.
![Step-6](figs/nezha-vr-test-figs/step-6.png)


The test process can be repeated. As long as a majority of replicas (f+1) remain alive, the system is able to serve clients, and failed replicas can rejoin.

### Test with Client

**Step 0**: Kill all the processes launched in the previous section.

**Step 1**: Similar to the previous section, we launch 3 replicas. More than that, this time we also launch 1 proxy and 1 client. In the client configuration file (i.e. [nezha-client-config.yaml](configs/nezha-client-config.yaml) ), we have specified the client as an open-loop client, and it will submit at about 1000 requests/second. This time we need to open 5 terminals in total.
```
is-openloop: true
poisson-rate: 10
```

```
# In the first terminal (replica-0)
$HOME/Nezha/bazel-bin/replica/nezha_replica --config $HOME/Nezha/configs/nezha-replica-config-0.yaml

# In the second terminal (replica-1)
$HOME/Nezha/bazel-bin/replica/nezha_replica --config $HOME/Nezha/configs/nezha-replica-config-1.yaml

# In the third terminal (replica-2)
$HOME/Nezha/bazel-bin/replica/nezha_replica --config $HOME/Nezha/configs/nezha-replica-config-2.yaml

# In the fourth terminal (proxy)
$HOME/Nezha/bazel-bin/proxy/nezha_proxy --config $HOME/Nezha/configs/local/nezha-proxy-config.yaml

# In the fifth terminal (client-1)
$HOME/Nezha/bazel-bin/client/nezha_client  --config $HOME/Nezha/configs/nezha-client-config.yaml

```


![Step-1](figs/nezha-test-with-client/step-1.png)


**Step 2**: After the client is launched, we can see that it keeps submitting requests and the proxy keeps forwarding them on the client's behalf. Every 5 seconds, the client terminal prints a log line showing the stats.
![Step-2](figs/nezha-test-with-client/step-2.png)

**Step 3**: While the client is submitting requests, we kill the leader (i.e., replica-0). We can see that the remaining 2 replicas rapidly complete the view change and elect the new leader, which takes about ```1657418951138477-1657418950947251=191226us=191ms```. The view change completes this quickly because of the periodic synchronization optimization (explained in our paper): the new leader does not need to do state transfer from scratch; it only needs to do state transfer and log merge from the last commit point.
![Step-3](figs/nezha-test-with-client/step-3.png)



**Step 4**: We want the crashed replica (i.e. replica-0) to rejoin the system. So we set ```isRecovering``` flag as true. 

```
# In the first terminal
$HOME/Nezha/bazel-bin/replica/nezha_replica --config $HOME/Nezha/configs/nezha-replica-config-0.yaml --isRecovering true
```

![Step-4](figs/nezha-test-with-client/step-4.png)


**Step 5**: The crashed replica starts from an empty state, so it needs to retrieve all the log entries in order to recover. Since we are using UDP and by default only fetch 5 entries during each round, the state transfer can take some time if clients have submitted many entries. As shown in the terminal of replica-0, we also print the progress of the recovery. But note that the follower's recovery does not block the other healthy replicas from serving the client. An optional optimization in consideration is to generate snapshot periodically and dump to stable storage. In this way, when a crashed replica wants to recover, it first fetches the state from local storage, and then does state transfer. In this way, it can save the recovery time. 
![Step-5](figs/nezha-test-with-client/step-5.png)


**Step 6**: After replica-0 retrieves all the state, we can see that it recovers successfully and works as a follower.
![Step-6](figs/nezha-test-with-client/step-6.png)


================================================
FILE: docs/tla-intro.md
================================================
# Nezha TLA+

This repository includes a model-checked TLA+ specification (both the source file and the PDF version) of the Nezha protocol. In addition, we include a document that explains Nezha's recovery in pseudo-code.


================================================
FILE: external/gogoprotobuf.BUILD
================================================
# Every target in this package is visible to all other packages.
package(default_visibility = ["//visibility:public"])

# Proto target for gogoproto/gogo.proto; it depends on protobuf's
# descriptor proto.
proto_library(
    name = "gogo_proto",
    srcs = ["gogoproto/gogo.proto"],
    deps = ["@com_google_protobuf//:descriptor_proto"],
)

================================================
FILE: external/googleapi.BUILD
================================================
# Every target in this package is visible to all other packages.
package(default_visibility = ["//visibility:public"])

# Proto target for google/api/annotations.proto; it depends on the http
# proto below and on protobuf's descriptor proto.
proto_library(
    name = "annotations_proto",
    srcs = ["google/api/annotations.proto"],
    deps = [
        ":http_proto",
        "@com_google_protobuf//:descriptor_proto",
    ],
)

# Proto target for google/api/http.proto.
proto_library(
    name = "http_proto",
    srcs = ["google/api/http.proto"],
)

================================================
FILE: lib/BUILD
================================================
load("@rules_proto//proto:defs.bzl", "proto_library")

# Header-only target exposing zipfian.h to every package.
cc_library(
    name = "zipfian",
    srcs = ["zipfian.h"],
    hdrs = ["zipfian.h"],
    visibility = ["//visibility:public"],
)


# Header-only target exposing common_type.h to every package.
cc_library(
    name = "common_type",
    srcs = ["common_type.h"],
    hdrs = ["common_type.h"],
    visibility = ["//visibility:public"],
)


# Header-only target exposing common_struct.h to every package.
# common_struct.h includes "lib/common_type.h" and <openssl/sha.h>, so both
# are declared as direct dependencies instead of relying on a dependent
# target to pull OpenSSL in.
cc_library(
    name = "common_struct",
    srcs = ["common_struct.h"],
    hdrs = ["common_struct.h"],
    visibility = ["//visibility:public"],
    deps = [
        ":common_type",
        "@openssl//:openssl",
    ],
)


# Address class (address.h / address.cc); no dependencies on other targets.
cc_library(
    name = "address",
    srcs = ["address.cc"],
    hdrs = ["address.h"],
    visibility = ["//visibility:public"],
)

# Header-only target exposing message_handler.h; depends on the address and
# common_type targets of this package.
cc_library(
    name = "message_handler",
    srcs = ["message_handler.h"],
    hdrs = ["message_handler.h"],
    visibility = ["//visibility:public"],
    deps = [
        ":address",
        ":common_type",
    ],
)

# Header-only target exposing timer.h; depends on the address and
# common_type targets of this package.
cc_library(
    name = "timer",
    srcs = ["timer.h"],
    hdrs = ["timer.h"],
    visibility = ["//visibility:public"],
    deps = [
        ":address",
        ":common_type",
    ],
)

# Endpoint library (endpoint.h / endpoint.cc). Besides the local address,
# common_struct, message_handler and timer targets, it links against libev,
# glog, protobuf and OpenSSL.
cc_library(
    name = "endpoint",
    srcs = ["endpoint.cc"],
    hdrs = ["endpoint.h"],
    visibility = ["//visibility:public"],
    deps = [
        ":address",
        ":common_struct",
        ":message_handler",
        ":timer",
        "@com_github_enki_libev//:libev",
        "@com_github_google_glog//:glog",
        "@com_google_protobuf//:protobuf",
        "@openssl//:openssl",
    ],
)




# UDP socket endpoint library (udp_socket_endpoint.h / .cc), built on top of
# the endpoint target; also links against libev, protobuf and OpenSSL.
cc_library(
    name = "udp_socket_endpoint",
    srcs = ["udp_socket_endpoint.cc"],
    hdrs = ["udp_socket_endpoint.h"],
    visibility = ["//visibility:public"],
    deps = [
        ":address",
        ":endpoint",
        "@com_github_enki_libev//:libev",
        "@com_google_protobuf//:protobuf",
        "@openssl//:openssl",
    ],
)


# Utility library (utils.h / utils.cc). Depends on the udp_socket_endpoint
# target and on concurrentqueue, junction, gflags, glog and OpenSSL.
cc_library(
    name = "utils",
    srcs = ["utils.cc"],
    hdrs = ["utils.h"],
    deps = [
        ":udp_socket_endpoint",
        "@com_github_cameron314_concurrentqueue//:concurrentqueue",
        "@com_github_preshing_junction//:libjunction",
        "@com_github_gflags_gflags//:gflags",
        "@com_github_google_glog//:glog",
        "@openssl//:openssl",
    ],
    visibility = ["//visibility:public"],
)



================================================
FILE: lib/Rules.mk
================================================
# Directory of this makefile fragment, derived from the last entry of
# MAKEFILE_LIST.
d := $(dir $(lastword $(MAKEFILE_LIST)))

# Append this directory's sources (prefixed with $(d)) to the shared SRCS list.
# NOTE(review): endpoint.cc is not listed here although lib/BUILD makes
# udp_socket_endpoint depend on the endpoint target -- confirm whether the
# make-based build is still expected to work.
SRCS += $(addprefix $(d), \
	address.cc utils.cc udp_socket_endpoint.cc)

# Object groups for the libraries in this directory. $(o) is not defined in
# this fragment; presumably it is the object-output prefix set by the
# including Makefile -- confirm.
LIB-address :=  $(o)address.o

LIB-utils := $(o)utils.o

# The UDP socket library bundles its own object with the address and utils objects.
LIB-udp-socket := $(o)udp_socket_endpoint.o $(LIB-address) $(LIB-utils)


# Print the resolved object list whenever this fragment is read.
$(info LIB-udp-socket is $(LIB-udp-socket)) 

# include $(d)tests/Rules.mk

================================================
FILE: lib/address.cc
================================================
#include "lib/address.h"

// Builds an empty address: empty IP and MAC strings, port -1, and an
// all-zero sockaddr_in.
Address::Address() : ip_(""), port_(-1), mac_("") {
  // memset is declared by <cstring>, which lib/address.h includes; the legacy
  // bzero lives in <strings.h>, which is not included anywhere in this file.
  memset(&addr_, 0, sizeof(addr_));
}
// Builds an IPv4 address from a dotted-quad string and a host-order port.
//   ip   - textual IPv4 address, parsed with inet_addr
//   port - port number in host byte order
//   mac  - MAC address text (stored only)
Address::Address(const std::string& ip, const int port, const std::string& mac)
    : ip_(ip), port_(port), mac_(mac) {
  // memset is declared by <cstring>, which lib/address.h includes; the legacy
  // bzero lives in <strings.h>, which is not included anywhere in this file.
  memset(&addr_, 0, sizeof(addr_));
  addr_.sin_family = AF_INET;
  addr_.sin_port = htons(port);  // sockaddr_in stores the port in network order
  // NOTE(review): inet_addr reports a parse failure as INADDR_NONE, which is
  // stored here unchecked -- confirm callers only pass valid dotted quads.
  addr_.sin_addr.s_addr = inet_addr(ip.c_str());
}
// Nothing to release: all members clean up after themselves.
Address::~Address() = default;

// Refreshes ip_ from the binary address in addr_ and returns it in
// dotted-quad form.
std::string Address::GetIPAsString() {
  // inet_ntop writes into a caller-owned buffer, whereas inet_ntoa returns a
  // pointer to a static buffer that the next call overwrites.
  char text[INET_ADDRSTRLEN] = {0};
  if (inet_ntop(AF_INET, &addr_.sin_addr, text, sizeof(text)) != nullptr) {
    ip_ = text;
  }
  return ip_;
}

// Refreshes port_ from addr_ and returns it in host byte order.
int Address::GetPortAsInt() {
  // sin_port is stored in network byte order, so the conversion back is
  // ntohs (network-to-host), not htons.
  port_ = ntohs(addr_.sin_port);
  return port_;
}

================================================
FILE: lib/address.h
================================================
#ifndef NEZHA_ADDRESS
#define NEZHA_ADDRESS
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <unistd.h>
#include <cstring>
#include <string>

#define UDP_BUFFER_SIZE (512)

/**
 * Address encapsulates the location of an endpoint.
 * At present it carries socket-based information (IP and port); the MAC field
 * is reserved so that other communication primitives, such as DPDK, can be
 * supported in the future.
 */

class Address {
 public:
  std::string ip_;           // IP address as text
  int port_;                 // port number
  std::string mac_;          // For future extension (DPDK)
  struct sockaddr_in addr_;  // socket-level form of the address

  /** Constructors are defined in address.cc. */
  Address();
  Address(const std::string& ip, const int port, const std::string& mac = "");

  /** Copies every field of addr, including the raw sockaddr_in. */
  Address(const Address& addr)
      : ip_(addr.ip_),
        port_(addr.port_),
        mac_(addr.mac_),
        addr_(addr.addr_) {}

  ~Address();

  /** Accessors; not const-qualified (defined in address.cc). */
  std::string GetIPAsString();
  int GetPortAsInt();
};

#endif

================================================
FILE: lib/common_struct.h
================================================


#ifndef NEZHA_COMMON_STRUCT_H
#define NEZHA_COMMON_STRUCT_H
#include <openssl/sha.h>
#include <stdio.h>
#include <stdlib.h>
#include <cstring>
#include <string>
#include <vector>
#include "lib/common_type.h"

/**
 * Nezha endpoints exchange proto messages.
 * Once a proto message has been serialized and is about to be sent, a
 * MessageHeader is placed in front of it (refer to SendMsgTo in
 * udp_socket_endpoint.h). The header describes the type of the proto message
 * and its length, so the receiving endpoint knows how to deserialize the
 * bytes that follow.
 */
struct MessageHeader {
  char msgType;     // type of the proto message
  uint32_t msgLen;  // length of the proto message

  MessageHeader(const char t, const uint32_t l) : msgType{t}, msgLen{l} {}
};

/**
 * SHA_HASH is included in the FastReply message to represent the replica
 * state. More details at Sec 5.2 of our paper
 * https://arxiv.org/pdf/2206.03285.pdf
 *
 * The digest can be accessed either as raw bytes (hash) or as five 32-bit
 * words (item).
 */
union SHA_HASH {
  uint32_t item[5];
  unsigned char hash[SHA_DIGEST_LENGTH];

  /** All-zero digest. */
  SHA_HASH() { memset(item, 0, sizeof(item)); }

  /**
   * Builds a digest from the first min(len, SHA_DIGEST_LENGTH) bytes of str.
   * Bytes not covered by the input are zero, so a short input never leaves
   * part of the digest uninitialized.
   */
  SHA_HASH(const char* str, const uint32_t len) {
    memset(hash, 0, sizeof(hash));
    const uint32_t limit = static_cast<uint32_t>(SHA_DIGEST_LENGTH);
    const uint32_t copyLen = (len < limit) ? len : limit;
    if (str != nullptr && copyLen > 0) {
      memcpy(hash, str, copyLen);
    }
  }

  SHA_HASH(const SHA_HASH& h) { memcpy(item, h.item, sizeof(item)); }

  SHA_HASH& operator=(const SHA_HASH& sh) {
    memcpy(item, sh.item, sizeof(item));
    return *this;
  }

  /** XORs h into this digest, word by word. */
  void XOR(const SHA_HASH& h) {
    for (int i = 0; i < 5; i++) {
      item[i] ^= h.item[i];
    }
  }

  /** Renders the five words in decimal, joined by '-'. */
  std::string toString() const {
    return (std::to_string(item[0]) + "-" + std::to_string(item[1]) + "-" +
            std::to_string(item[2]) + "-" + std::to_string(item[3]) + "-" +
            std::to_string(item[4]));
  }
};

/** When request is received by the replica, it will be first converted to
 * RequestBody, which includes all the useful information of the request */
struct RequestBody {
  uint64_t deadline;
  uint64_t reqKey;  // reqKey uniquely identifies the request on this replica,
                    // it is concatenated from the clientId and reqId. With
                    // reqKey, the replica can easily check whether this
                    // request has been previously received or not.
  uint32_t opKey;   // opKey indicates which key the request is operating on (
                    // imagine we are working on a database system and different
                    // requests will read/write different keys). opKey is
                    // important for commutativity optimization.
  uint64_t proxyId;     // proxyId indicates which proxy delivers the request to
                        // the replica, and later replicas will send the
                        // corresponding reply to the proxy.
  std::string command;  // command is the content to execute
  bool isWrite;

  /** Creates an empty request: every numeric field is 0 and isWrite is false,
   * so a default-constructed body never carries indeterminate values. */
  RequestBody()
      : deadline(0),
        reqKey(0),
        opKey(0),
        proxyId(0),
        command(),
        isWrite(false) {}
  RequestBody(const uint64_t d, const uint64_t r, const uint32_t ok,
              const uint64_t p, const std::string& cmd, const bool isw)
      : deadline(d),
        reqKey(r),
        opKey(ok),
        proxyId(p),
        command(cmd),
        isWrite(isw) {}

  /** The following methods are used to compare different requests so as to
   * decide their order. Requests are ordered by <deadline, reqKey>; the pair
   * overloads take <deadline, reqKey> as <first, second>. They do not modify
   * the request, so they are const and usable on const objects. */
  bool LessThan(const RequestBody& bigger) const {
    return (deadline < bigger.deadline ||
            (deadline == bigger.deadline && reqKey < bigger.reqKey));
  }
  bool LessThan(const std::pair<uint64_t, uint64_t>& bigger) const {
    return (deadline < bigger.first ||
            (deadline == bigger.first && reqKey < bigger.second));
  }
  bool LessOrEqual(const RequestBody& bigger) const {
    return (deadline < bigger.deadline ||
            (deadline == bigger.deadline && reqKey <= bigger.reqKey));
  }
  bool LessOrEqual(const std::pair<uint64_t, uint64_t>& bigger) const {
    return (deadline < bigger.first ||
            (deadline == bigger.first && reqKey <= bigger.second));
  }
};

/**
 * After RequestBody is processed and eventually replied, it will be converted
 * into a LogEntry, and stored in the replica.
 * LogEntry, compared with RequestBody, includes more information
 */
struct LogEntry {
  // Request Body
  RequestBody body;
  SHA_HASH entryHash;  // The hash value of this **single** entry
  SHA_HASH logHash;  // The accumulative hash, which is calculated based on all
                     // the log entries from the beginning to this entry

  /** prevNonCommutative and nextNonCommutative organize the LogEntries as a
   * skiplist, and easier and more efficient to traverse/modify/delete */
  LogEntry* prevNonCommutative;  // The previous non-commutative entry
  LogEntry* nextNonCommutative;  // The next non-commutative entry

  LogEntry* prevNonCommutativeWrite;  // The entry's prevNonCommutative may be a
                                      // write, or may be a read
  // But only the prevNonCommutativeWrite is used to calculate the incremental
  // hash, see Sec 8.2 of Nezha's Technical Report
  LogEntry* nextNonCommutativeWrite;

  /** prev and next organizes the LogEntries as a link list, and easier to
   * traverse/modify/delete */

  LogEntry* prev;  // The previous LogEntry pointer
  LogEntry* next;  // The next LogEntry pointer

  std::string result;  // The execution result of the LogEntry
  char status;         //
  uint32_t logId;  // The logId (the position of the LogEntry in the list) of
                   // the entry

  LogEntry()
      : prevNonCommutative(NULL),
        nextNonCommutative(NULL),
        prevNonCommutativeWrite(NULL),
        nextNonCommutativeWrite(NULL),
        prev(NULL),
        next(NULL),
        result(""),
        status(EntryStatus::INITIAL),
        logId(0) {}
  LogEntry(const RequestBody& rb, const SHA_HASH& eh, const SHA_HASH& h,
           LogEntry* prevNonComm = NULL, LogEntry* nextNonComm = NULL,
           LogEntry* preNonCOmmW = NULL, LogEntry* nextNonCommW = NULL,
           LogEntry* pre = NULL, LogEntry* nxt = NULL,
           const std::string& re = "", const char sts = EntryStatus::INITIAL,
           const uint32_t lid = 0)
      : body(rb),
        entryHash(eh),
        logHash(h),
        prevNonCommutative(prevNonComm),
        nextNonCommutative(nextNonComm),
        prevNonCommutativeWrite(preNonCOmmW),
        nextNonCommutativeWrite(nextNonCommW),
        prev(pre),
        next(nxt),
        result(re),
        status(sts),
        logId(lid) {}
  LogEntry(const uint64_t d, const uint64_t r, const uint32_t ok,
           const uint64_t p, const std::string& cmd, const bool& isw,
           const SHA_HASH& eh, const SHA_HASH& h, LogEntry* prevNonComm = NULL,
           LogEntry* nextNonComm = NULL, LogEntry* preNonCOmmW = NULL,
           LogEntry* nextNonCommW = NULL, LogEntry* pre = NULL,
           LogEntry* nxt = NULL, const std::string& re = "",
           const char sts = EntryStatus::INITIAL, const uint32_t lid = 0)
      : body(d, r, ok, p, cmd, isw),
        entryHash(eh),
        logHash(h),
        prevNonCommutative(prevNonComm),
        nextNonCommutative(nextNonComm),
        prevNonCommutativeWrite(preNonCOmmW),
        nextNonCommutativeWrite(nextNonCommW),
        prev(pre),
        next(nxt),
        result(re),
        status(sts),
        logId(lid) {}

  bool LessThan(const LogEntry& bigger) { return body.LessThan(bigger.body); }
  bool LessThan(const std::pair<uint64_t, uint64_t>& bigger) {
    return body.LessThan(bigger);
  }
  bool LessOrEqual(const LogEntry& bigger) {
    return body.LessOrEqual(bigger.body);
  }
  bool LessOrEqual(const std::pair<uint64_t, uint64_t>& bigger) {
    return body.LessOrEqual(bigger);
  }
};

/**
 * CrashVectorStruct is necessary for Nezha to avoid stray messages, details in
 * Appendix A.1 and Appendix J of our paper
 */
struct CrashVectorStruct {
  // The crash vector itself
  std::vector<uint32_t> cv_;
  // Newer crash vector will have a larger version_
  uint32_t version_;
  // SHA1 digest computed over the raw bytes of the vector passed to the
  // constructor
  SHA_HASH cvHash_;

  /** Stores a copy of the vector and its version, and hashes the vector's
   * raw bytes (size() * sizeof(uint32_t) bytes) into cvHash_. */
  CrashVectorStruct(const std::vector<uint32_t>& c, const uint32_t v)
      : cv_(c), version_(v) {
    const unsigned char* rawBytes =
        reinterpret_cast<const unsigned char*>(c.data());
    const uint32_t rawLen = c.size() * sizeof(uint32_t);
    SHA1(rawBytes, rawLen, cvHash_.hash);
  }

  /** Member-wise copy; the hash is copied rather than recomputed. */
  CrashVectorStruct(const CrashVectorStruct& c)
      : cv_(c.cv_), version_(c.version_), cvHash_(c.cvHash_) {}
};

#endif

================================================
FILE: lib/common_type.h
================================================
#ifndef NEZHA_COMMON_TYPE_H
#define NEZHA_COMMON_TYPE_H

/** Kinds of endpoint. Only the UDP endpoint is supported at the moment; the
 * GRPC endpoint is planned for the near future. */
enum EndpointType {
  UDP_ENDPOINT = 1,
  GRPC_ENDPOINT = 2  // To be supported
};

/** The statuses a replica can be in. Refer to Sec 5 of our paper for the
 * detailed explanation of each of them. */
enum ReplicaStatus {
  NORMAL = 1,
  VIEWCHANGE = 2,
  RECOVERING = 3,
  TERMINATED = 4
};

/** Life cycle of a LogEntry. An entry starts as INITIAL, then it may either go
 * through IN_PROCESS->PROCESSED->REPLIED or switch directly to IN_LATEBUFFER.
 * NOTE(review): where TO_SLOW_REPLY fits in this life cycle is not described
 * here -- confirm against the replica code. */
enum EntryStatus {
  INITIAL = 1,
  IN_PROCESS = 2,
  IN_LATEBUFFER = 3,
  PROCESSED = 4,
  TO_SLOW_REPLY = 5,
  REPLIED = 6
};

/**
 * The message types are defined according to the proto files. The type is
 * included in each message so that the receiver knows how to
 * serialize/deserialize the proto message. The numeric values are spelled out
 * explicitly because they identify the message kind between processes.
 */
enum MessageType {
  CLIENT_REQUEST = 1,
  LEADER_REQUEST = 2,
  SYNC_INDEX = 3,
  MISSED_INDEX_ASK = 4,
  MISSED_REQ_ASK = 5,
  FAST_REPLY = 6,
  SLOW_REPLY = 7,
  COMMIT_REPLY = 8,
  MISSED_REQ = 9,
  VIEWCHANGE_REQ = 10,
  VIEWCHANGE_MSG = 11,
  START_VIEW = 12,
  STATE_TRANSFER_REQUEST = 13,
  STATE_TRANSFER_REPLY = 14,
  CRASH_VECTOR_REQUEST = 15,
  CRASH_VECTOR_REPLY = 16,
  RECOVERY_REQUEST = 17,
  RECOVERY_REPLY = 18,
  SYNC_STATUS_REPORT = 19,
  COMMIT_INSTRUCTION = 20,
  SUSPEND_REPLY = 21,
  ERROR_MSG = 22
};

#endif

================================================
FILE: lib/endpoint.cc
================================================
#include "lib/endpoint.h"

/**
 * Builds the endpoint address from sip:sport and creates the libev loop: the
 * default loop when isMasterReceiver is true, a new loop otherwise.
 *
 * fd_ starts as -1 (no socket yet; derived classes create it) and epId_ as 0,
 * so neither member is ever read while indeterminate. If the loop cannot be
 * created, an error is logged and evLoop_ stays NULL.
 */
Endpoint::Endpoint(const std::string& sip, const int sport,
                   const bool isMasterReceiver)
    : addr_(sip, sport), fd_(-1), evLoop_(NULL), epId_(0) {
  evLoop_ = isMasterReceiver ? ev_default_loop() : ev_loop_new();
  if (!evLoop_) {
    LOG(ERROR) << "Event Loop error";
  }
}

/**
 * Stops all timers, breaks the loop and destroys it.
 * The constructor may leave evLoop_ as NULL when loop creation fails; in that
 * case no timer can have been registered and there is nothing to tear down,
 * so we return instead of handing a NULL loop to libev.
 */
Endpoint::~Endpoint() {
  if (evLoop_ == NULL) {
    return;
  }
  LoopBreak();
  ev_loop_destroy(evLoop_);
  evLoop_ = NULL;
}

/**
 * Attaches the timer to this endpoint and starts it on the endpoint's loop.
 * Returns false (and logs) when there is no loop or when the timer is already
 * registered on this endpoint; returns true otherwise.
 */
bool Endpoint::RegisterTimer(Timer* timer) {
  if (!evLoop_) {
    LOG(ERROR) << "No evLoop!";
    return false;
  }

  const bool alreadyRegistered = isTimerRegistered(timer);
  if (alreadyRegistered) {
    LOG(ERROR) << "This timer has already been registered";
    return false;
  }

  // Remember the owner so that the timer callback can reach this endpoint
  timer->attachedEndpoint_ = this;
  eventTimers_.insert(timer);
  // ev_timer_again (re)starts the watcher using its repeat value
  ev_timer_again(evLoop_, timer->evTimer_);
  return true;
}

/**
 * Stops the timer and forgets it.
 * Returns false (and logs) when there is no loop or when the timer was not
 * registered on this endpoint; returns true otherwise.
 */
bool Endpoint::UnRegisterTimer(Timer* timer) {
  if (!evLoop_) {
    LOG(ERROR) << "No evLoop!";
    return false;
  }
  const bool known = isTimerRegistered(timer);
  if (!known) {
    LOG(ERROR) << "The timer has not been registered ";
    return false;
  }
  ev_timer_stop(evLoop_, timer->evTimer_);
  eventTimers_.erase(timer);
  return true;
}

void Endpoint::UnRegisterAllTimers() {
  for (auto& t : eventTimers_) {
    ev_timer_stop(evLoop_, t->evTimer_);
  }
  eventTimers_.clear();
}

/** Returns true when the timer is currently in this endpoint's timer set. */
bool Endpoint::isTimerRegistered(Timer* timer) {
  return eventTimers_.count(timer) > 0;
}

void Endpoint::LoopRun() { ev_run(evLoop_, 0); }

void Endpoint::LoopBreak() {
  UnRegisterAllTimers();
  ev_break(evLoop_, EVBREAK_ALL);
}


================================================
FILE: lib/endpoint.h
================================================
#ifndef NEZHA_ENDPOINT_H
#define NEZHA_ENDPOINT_H

#include <arpa/inet.h>
#include <ev.h>
#include <fcntl.h>
#include <glog/logging.h>
#include <google/protobuf/message.h>
#include <netinet/in.h>
#include <functional>
#include <set>
#include <string>
#include "lib/address.h"
#include "lib/common_struct.h"
#include "lib/message_handler.h"
#include "lib/timer.h"

/**
 * Endpoint is the basic abstraction, and it can be derived to more specific
 * endpoints, based on the communication primitive (e.g., UDPSocketEndpoint)
 *
 * An Endpoint supports three major functionalities:
 * (1) Send/Receive messages;
 * (2) Process the received messages according to (pre-registered) customized
 * message handlers;
 * (3) Conduct periodical actions according to (pre-registered)
 * customized timer functions.
 */
class Endpoint {
 protected:
  /** The address of this endpoint */
  Address addr_;
  /** The socket fd it uses to send/recv messages. The base class does not
   * create it; derived classes (e.g., UDPSocketEndpoint) do. */
  int fd_;
  /** The ev_loop struct from libev, which is used to handle io/timer events.
   * It is NULL when the loop could not be created. */
  struct ev_loop* evLoop_;
  /** One endpoint can have multiple timers registered. We maintain a set to
   * avoid duplicate registration and check whether a specific timer has been
   * registered or not.*/
  std::set<struct Timer*> eventTimers_;

 public:
  int epId_;  // The id of the endpoint, mainly for debug
  /** The endpoint accepts an ip and port, from which addr_ is built. The base
   * class itself does not bind anything; a derived class (e.g.,
   * UDPSocketEndpoint) binds its socket fd to the ip:port when both are
   * valid. If isMasterReceiver is true, it creates the
   * default loop with libev, otherwise, it creates new loop (refer to libev
   * documentation for detailed explanation at
   * https://metacpan.org/dist/EV/view/libev/ev.pod) */
  Endpoint(const std::string& ip = "", const int port = -1,
           const bool isMasterReceiver = false);
  /** Unregisters all timers, breaks the loop and destroys it */
  virtual ~Endpoint();

  /** Send the message to the specific destination. The method needs to know the
   * message type (3rd parameter) and include such information in the buffer */
  virtual int SendMsgTo(const Address& dstAddr,
                        const google::protobuf::Message& msg,
                        const char msgType) = 0;

  /** An endpoint potentially can have multiple message handlers registered, but
   * our UDPSocketEndpoint implementation only supports at most one
   * message handler for one endpoint. So we make them as virtual functions and
   * different derived classes have their own implementation of the methods */
  virtual bool RegisterMsgHandler(MessageHandler* msgHdl) = 0;
  virtual bool UnRegisterMsgHandler(MessageHandler* msgHdl) = 0;
  virtual bool isMsgHandlerRegistered(MessageHandler* msgHdl) = 0;
  virtual void UnRegisterAllMsgHandlers() = 0;

  /** Return true if the timer is successfully registered, otherwise (e.g. it
   * has been registered before and has not been unregistered), return false */
  bool RegisterTimer(Timer* timer);
  /** Return true if the timer is successfully unregistered, otherwise (e.g. the
   * timer has not been registered before), return false */
  bool UnRegisterTimer(Timer* timer);
  /** Check whether the timer has been registered */
  bool isTimerRegistered(Timer* timer);
  /** Stop every registered timer and forget all of them */
  void UnRegisterAllTimers();

  /** Run the event loop of this endpoint */
  void LoopRun();
  /** Unregister all timers and break out of the event loop */
  void LoopBreak();
};

#endif

================================================
FILE: lib/message_handler.h
================================================

#ifndef NEZHA_MESSAGE_HANDLER_H
#define NEZHA_MESSAGE_HANDLER_H

#include <arpa/inet.h>
#include <ev.h>
#include <fcntl.h>
#include <glog/logging.h>
#include <google/protobuf/message.h>
#include <netinet/in.h>
#include <functional>
#include <set>
#include <string>
#include "lib/address.h"
#include "lib/common_type.h"

/**
 * MessageHandler is an encapsulation of libev-based message handler (i.e.
 * ev_io).
 *
 * After the message handler is created, it will be registered to a
 * specific endpoint. Then, the callback func (i.e., MessageHandlerFunc) will be
 * called every time this endpoint receives some messages.
 *
 * Currently, we only support UDP communication. Therefore, we only have one
 * derived struct (UDPMsgHandler) from MessageHandler
 *
 * We will continue to support other types of endpoints. Correspondingly, there
 * will be more derived struct added later
 * **/

/**
 * Signature of the callback invoked for each received message.
 * Para-1: MessageHeader* describes the type and length of the received message
 * Para-2: char* is the payload of the message
 * Para-3: Address* is the address of the sender
 * Para-4: void* points to the (optional) context that is needed by the callback
 * function (i.e., MessageHandlerFunc)
 */
using MessageHandlerFunc =
    std::function<void(MessageHeader*, char*, Address*, void*)>;

/**
 * Base of all message handlers: it bundles the user callback, its optional
 * context, the address of the most recent sender and the libev io watcher.
 *
 * The handler owns evWatcher_ (allocated in the constructor, freed in the
 * destructor). NOTE: copying a handler would make two objects delete the same
 * watcher, so handlers should only be passed around by pointer.
 */
struct MessageHandler {
  MessageHandlerFunc msgHandler_;  // The user callback
  void* context_;                  // Opaque context passed to the callback
  Address sender_;                 // Filled with the sender of each message
  struct ev_io* evWatcher_;        // The libev watcher; its data points to this
  MessageHandler(MessageHandlerFunc msghdl, void* ctx = NULL)
      : msgHandler_(msghdl), context_(ctx) {
    evWatcher_ = new ev_io();
    // Lets the libev callback find its way back to this handler
    evWatcher_->data = (void*)this;
  }
  // Virtual, because derived handlers (e.g., UDPMsgHandler) are created by
  // the factory and handed out as MessageHandler*; deleting them through the
  // base pointer must run the derived destructor.
  virtual ~MessageHandler() { delete evWatcher_; }
};

/**
 * Message handler for UDP endpoints. When the watched fd becomes readable, it
 * reads one datagram into buffer_, validates the MessageHeader at its front
 * and, if the datagram is well-formed, hands header + payload to the user
 * callback.
 */
struct UDPMsgHandler : MessageHandler {
  char buffer_[UDP_BUFFER_SIZE];  // Receive buffer for one datagram
  UDPMsgHandler(MessageHandlerFunc msghdl, void* ctx = NULL)
      : MessageHandler(msghdl, ctx) {
    ev_init(evWatcher_, [](struct ev_loop* loop, struct ev_io* w, int revents) {
      UDPMsgHandler* m = (UDPMsgHandler*)(w->data);
      socklen_t sockLen = sizeof(struct sockaddr_in);
      int msgLen = recvfrom(w->fd, m->buffer_, UDP_BUFFER_SIZE, 0,
                            (struct sockaddr*)(&(m->sender_.addr_)), &sockLen);
      // A valid datagram carries the header plus a non-empty payload
      if (msgLen > 0 && (uint32_t)msgLen > sizeof(MessageHeader)) {
        MessageHeader* msgHeader = (MessageHeader*)(void*)(m->buffer_);
        // The header comes from the network and cannot be trusted: the
        // payload length it declares must fit in the bytes actually
        // received, otherwise the callback would parse stale or
        // out-of-bounds memory. Such datagrams are dropped.
        if (msgHeader->msgLen <= (uint32_t)msgLen - sizeof(MessageHeader)) {
          m->msgHandler_(msgHeader, m->buffer_ + sizeof(MessageHeader),
                         &(m->sender_), m->context_);
        }
      }
    });
  }
  ~UDPMsgHandler() {}
};

#endif

================================================
FILE: lib/message_type.cc
================================================
#include "lib/message_type.h"



// Definitions of the message type identifiers declared (extern) in
// lib/message_type.h.
namespace MessageType {
char CLIENT_REQUEST = 1;
char LEADER_REQUEST = 2;
char SYNC_INDEX = 3;
char MISSED_INDEX_ASK = 4;
char MISSED_REQ_ASK = 5;
char FAST_REPLY = 6;
char SLOW_REPLY = 7;
char COMMIT_REPLY = 8;
char MISSED_REQ = 9;
char VIEWCHANGE_REQ = 10;
char VIEWCHANGE = 11;
char START_VIEW = 12;
char STATE_TRANSFER_REQUEST = 13;
char STATE_TRANSFER_REPLY = 14;
char CRASH_VECTOR_REQUEST = 15;
char CRASH_VECTOR_REPLY = 16;
char RECOVERY_REQUEST = 17;
char RECOVERY_REPLY = 18;
char SYNC_STATUS_REPORT = 19;
char COMMIT_INSTRUCTION = 20;
char SUSPEND_REPLY = 21;
char ERROR_MSG = 22;
}  // namespace MessageType

================================================
FILE: lib/message_type.h
================================================
#include <stdint.h>
#ifndef NEZHA_MESSAGE_TYPE_H
#define NEZHA_MESSAGE_TYPE_H

// First usable index of the concurrent map
#define CONCURRENT_MAP_START_INDEX (2u)
// Packs two 32-bit values into one uint64: a is the high half, b the low half.
// Every macro argument is parenthesized so that expression arguments (e.g.
// x | y, c ? p : q) are evaluated as a whole before the cast/shift applies.
#define CONCAT_UINT32(a, b) ((((uint64_t)(a)) << 32u) | (uint32_t)(b))
// High / low 32 bits of a uint64
#define HIGH_32BIT(a) ((uint32_t)((a) >> 32))
#define LOW_32BIT(a) ((uint32_t)(a))

// Header describing a message: its type tag and its length.
struct MessageHeader {
    char msgType;     // Type of the message
    uint32_t msgLen;  // Length of the message

    MessageHeader(const char type, const uint32_t length) {
        msgType = type;
        msgLen = length;
    }
};


// Message type identifiers. They are only declared here; the definitions (and
// their numeric values) live in lib/message_type.cc.
namespace MessageType {
extern char CLIENT_REQUEST;
extern char LEADER_REQUEST;
extern char SYNC_INDEX;
extern char MISSED_INDEX_ASK;
extern char MISSED_REQ_ASK;
extern char FAST_REPLY;
extern char SLOW_REPLY;
extern char COMMIT_REPLY;
extern char MISSED_REQ;
extern char VIEWCHANGE_REQ;
extern char VIEWCHANGE;
extern char START_VIEW;
extern char STATE_TRANSFER_REQUEST;
extern char STATE_TRANSFER_REPLY;
extern char CRASH_VECTOR_REQUEST;
extern char CRASH_VECTOR_REPLY;
extern char RECOVERY_REQUEST;
extern char RECOVERY_REPLY;
extern char SYNC_STATUS_REPORT;
extern char COMMIT_INSTRUCTION;
extern char SUSPEND_REPLY;
extern char ERROR_MSG;
}  // namespace MessageType

#endif

================================================
FILE: lib/timer.h
================================================
#ifndef NEZHA_TIMER_
#define NEZHA_TIMER_

#include <arpa/inet.h>
#include <ev.h>
#include <fcntl.h>
#include <glog/logging.h>
#include <google/protobuf/message.h>
#include <netinet/in.h>
#include <functional>
#include <set>
#include <string>
#include "lib/address.h"
#include "lib/common_type.h"

/**
 * Timer is an encapsulation of the libev-based timer watcher (i.e.
 * ev_timer).
 *
 * The timer is created with a period (measured in milliseconds) and is then
 * registered to a specific endpoint. After that,
 * the callback func (i.e., TimerFunc) will be called periodically until the
 * timer is unregistered
 * **/

/**
 * Signature of the callback invoked every time a timer fires.
 * Para-1: The first void* points to the context, that may be needed by the
 * callback function (i.e., TimerFunc)
 * Para-2: The second void* points to the endpoint that this timer is attached
 * to. It can be passed into the function as NULL if the TimerFunc does not need
 * it. But some TimerFunc (e.g., monitorTimer in replica) callback needs to know
 * the endpoint it has attached to.
 */

using TimerFunc = std::function<void(void*, void*)>;

struct Timer {
  std::function<void(void*, void*)> timerFunc_;
  void* context_;
  void* attachedEndpoint_;
  struct ev_timer* evTimer_;

  Timer(TimerFunc timerf, uint32_t periodMs = 1, void* ctx = NULL,
        void* aep = NULL)
      : timerFunc_(timerf), context_(ctx), attachedEndpoint_(aep) {
    evTimer_ = new ev_timer();
    evTimer_->data = (void*)this;
    evTimer_->repeat = periodMs * 1e-3;
    ev_init(evTimer_,
            [](struct ev_loop* loop, struct ev_timer* w, int revents) {
              Timer* t = (Timer*)(w->data);
              t->timerFunc_(t->context_, t->attachedEndpoint_);
            });
  }
  ~Timer() { delete evTimer_; }
};

#endif

================================================
FILE: lib/udp_socket_endpoint.cc
================================================
#include "lib/udp_socket_endpoint.h"

#include <unistd.h>

/**
 * Creates a non-blocking UDP socket and, when both ip and port are given
 * (non-empty ip, non-negative port), binds the socket to ip:port. Failures
 * are logged; the constructor never throws.
 */
UDPSocketEndpoint::UDPSocketEndpoint(const std::string& ip, const int port,
                                     const bool isMasterReceiver)
    : Endpoint(ip, port, isMasterReceiver), msgHandler_(NULL) {
  fd_ = socket(PF_INET, SOCK_DGRAM, 0);
  if (fd_ < 0) {
    LOG(ERROR) << "Receiver Fd fail ";
    return;
  }

  // Set Non-Blocking: add O_NONBLOCK on top of the current flags
  const int currentFlags = fcntl(fd_, F_GETFL, 0);
  if (fcntl(fd_, F_SETFL, currentFlags | O_NONBLOCK) < 0) {
    LOG(ERROR) << " Set NonBlocking Fail";
  }

  // Without a usable ip:port the socket stays unbound
  const bool hasLocalAddress = !ip.empty() && port >= 0;
  if (!hasLocalAddress) {
    return;
  }

  // Bind socket to Address
  struct sockaddr_in localAddr = sockaddr_in();
  localAddr.sin_family = AF_INET;
  localAddr.sin_port = htons(port);
  localAddr.sin_addr.s_addr = inet_addr(ip.c_str());
  const int bindRet =
      bind(fd_, (struct sockaddr*)&localAddr, sizeof(localAddr));
  if (bindRet != 0) {
    LOG(ERROR) << "bind error\t" << bindRet << "\t port=" << port;
  }
}

/**
 * Releases the UDP socket created by the constructor, so that destroying an
 * endpoint does not leak a file descriptor. fd_ is negative when socket()
 * failed, in which case there is nothing to close.
 * The event loop itself is torn down afterwards by ~Endpoint.
 */
UDPSocketEndpoint::~UDPSocketEndpoint() {
  if (fd_ >= 0) {
    close(fd_);
    fd_ = -1;
  }
}

/**
 * Serializes msg, prepends a MessageHeader (msgType + payload length) and
 * sends the result to dstAddr as one UDP datagram.
 *
 * Returns the value of sendto() on a send attempt, or -1 when the message
 * serializes to an empty string or does not fit into UDP_BUFFER_SIZE together
 * with its header.
 */
int UDPSocketEndpoint::SendMsgTo(const Address& dstAddr,
                                 const google::protobuf::Message& msg,
                                 const char msgType) {
  const std::string serializedString = msg.SerializeAsString();
  if (serializedString.length() + sizeof(MessageHeader) > UDP_BUFFER_SIZE) {
    LOG(ERROR) << "Msg too large " << (uint32_t)msgType
               << "\t length=" << serializedString.length();
    return -1;
  }
  if (serializedString.empty()) {
    // Nothing was serialized, so there is nothing to send
    return -1;
  }

  // The buffer is viewed as a MessageHeader, so give it that alignment
  alignas(MessageHeader) char buffer[UDP_BUFFER_SIZE];
  // Zero the header area first: MessageHeader has padding between msgType and
  // msgLen, and those bytes would otherwise carry stale stack content onto
  // the wire.
  memset(buffer, 0, sizeof(MessageHeader));
  MessageHeader* msgHdr = (MessageHeader*)(void*)buffer;
  msgHdr->msgType = msgType;
  msgHdr->msgLen = serializedString.length();

  // Prepend MessageHeader to the serialized string
  memcpy(buffer + sizeof(MessageHeader), serializedString.c_str(),
         msgHdr->msgLen);
  int ret = sendto(fd_, buffer, msgHdr->msgLen + sizeof(MessageHeader), 0,
                   (struct sockaddr*)(&(dstAddr.addr_)), sizeof(sockaddr_in));
  if (ret < 0) {
    VLOG(1) << pthread_self() << "\tSend Fail ret =" << ret;
  }
  return ret;
}

/**
 * Registers msgHdl as the handler of this endpoint and starts watching the
 * socket for readability.
 * Returns false (and logs) when there is no loop, when msgHdl is NULL or when
 * msgHdl is already the registered handler; returns true otherwise.
 *
 * NOTE(review): this endpoint tracks a single handler only. Registering a
 * different handler while one is active replaces msgHandler_ without stopping
 * the previous watcher -- callers should unregister first.
 */
bool UDPSocketEndpoint::RegisterMsgHandler(MessageHandler* msgHdl) {
  if (evLoop_ == NULL) {
    LOG(ERROR) << "No evLoop!";
    return false;
  }
  if (msgHdl == NULL) {
    // Without this check a NULL handler would be dereferenced below
    LOG(ERROR) << "Cannot register a NULL msgHdl";
    return false;
  }
  if (isMsgHandlerRegistered(msgHdl)) {
    LOG(ERROR) << "This msgHdl has already been registered";
    return false;
  }

  UDPMsgHandler* udpMsgHdl = (UDPMsgHandler*)msgHdl;
  msgHandler_ = udpMsgHdl;
  ev_io_set(udpMsgHdl->evWatcher_, fd_, EV_READ);
  ev_io_start(evLoop_, udpMsgHdl->evWatcher_);

  return true;
}

/**
 * Stops watching the socket on behalf of msgHdl and forgets the handler.
 * Returns false (and logs) when there is no loop, or when msgHdl is NULL or
 * is not the registered handler; returns true otherwise.
 */
bool UDPSocketEndpoint::UnRegisterMsgHandler(MessageHandler* msgHdl) {
  if (evLoop_ == NULL) {
    LOG(ERROR) << "No evLoop!";
    return false;
  }
  // The explicit NULL test matters: when nothing is registered, msgHandler_ is
  // NULL as well, so a NULL argument would otherwise compare as "registered"
  // and be dereferenced below.
  if (msgHdl == NULL || !isMsgHandlerRegistered(msgHdl)) {
    LOG(ERROR) << "The handler has not been registered ";
    return false;
  }
  UDPMsgHandler* udpMsgHdl = (UDPMsgHandler*)msgHdl;
  ev_io_stop(evLoop_, udpMsgHdl->evWatcher_);
  msgHandler_ = NULL;
  return true;
}

/** Returns true when msgHdl is the handler currently tracked by this endpoint
 * (a plain pointer comparison with msgHandler_). */
bool UDPSocketEndpoint::isMsgHandlerRegistered(MessageHandler* msgHdl) {
  UDPMsgHandler* candidate = (UDPMsgHandler*)msgHdl;
  return msgHandler_ == candidate;
}

/**
 * Stops the registered handler's watcher (if any) and forgets the handler.
 * It is a no-op when no handler is registered; without that guard the NULL
 * msgHandler_ would be dereferenced.
 */
void UDPSocketEndpoint::UnRegisterAllMsgHandlers() {
  if (msgHandler_ == NULL) {
    return;
  }
  if (evLoop_ != NULL) {
    ev_io_stop(evLoop_, msgHandler_->evWatcher_);
  }
  msgHandler_ = NULL;
}


================================================
FILE: lib/udp_socket_endpoint.h
================================================
#ifndef NEZHA_UDP_SOCKET_SENDER_H
#define NEZHA_UDP_SOCKET_SENDER_H

#include "lib/endpoint.h"

/**
 * UDPSocketEndpoint is the UDP-based implementation of Endpoint: it sends
 * proto messages as UDP datagrams and supports at most one registered message
 * handler at a time.
 */
class UDPSocketEndpoint : public Endpoint {
 private:
  /** The currently registered message handler, or NULL when there is none */
  struct UDPMsgHandler* msgHandler_;

 public:
  /** Creates the (non-blocking) UDP socket. If ip is non-empty and port is
   * non-negative, the socket is also bound to ip:port. isMasterReceiver is
   * forwarded to Endpoint to choose the event loop. */
  UDPSocketEndpoint(const std::string& ip = "", const int port = -1,
                    const bool isMasterReceiver = false);
  ~UDPSocketEndpoint();

  /** Serializes msg, prepends a MessageHeader carrying msgType and the payload
   * length, and sends it to dstAddr. Returns the result of sendto(), or -1
   * when the message is empty or too large for the buffer. */
  int SendMsgTo(const Address& dstAddr, const google::protobuf::Message& msg,
                const char msgType) override;
  /** Starts watching the socket with msgHdl; returns false when there is no
   * event loop or msgHdl is already registered. */
  bool RegisterMsgHandler(MessageHandler* msgHdl) override;
  /** Stops watching the socket with msgHdl; returns false when there is no
   * event loop or msgHdl is not the registered handler. */
  bool UnRegisterMsgHandler(MessageHandler* msgHdl) override;
  /** Returns true when msgHdl is the currently registered handler */
  bool isMsgHandlerRegistered(MessageHandler* msgHdl) override;
  /** Stops the watcher of the registered handler and forgets the handler */
  void UnRegisterAllMsgHandlers() override;
};

#endif

================================================
FILE: lib/utils.cc
================================================
#include "lib/utils.h"

/**
 * Returns the SHA1 digest of the 16 bytes formed by the in-memory
 * representation of deadline followed by that of reqKey.
 */
SHA_HASH CalculateHash(uint64_t deadline, uint64_t reqKey) {
  // deadline occupies the first 8 bytes, reqKey the following 8
  unsigned char content[sizeof(deadline) + sizeof(reqKey)];
  memcpy(content, &deadline, sizeof(deadline));
  memcpy(content + sizeof(deadline), &reqKey, sizeof(reqKey));

  SHA_HASH digest;
  SHA1(content, sizeof(content), digest.hash);
  return digest;
}

// Returns the current time of std::chrono::system_clock, expressed as the
// number of microseconds elapsed since the clock's epoch
uint64_t GetMicrosecondTimestamp() {
  const std::chrono::system_clock::time_point now =
      std::chrono::system_clock::now();
  const std::chrono::microseconds sinceEpoch =
      std::chrono::duration_cast<std::chrono::microseconds>(
          now.time_since_epoch());
  return sinceEpoch.count();
}

/**
 * Factory for endpoints. Returns a new UDPSocketEndpoint for
 * EndpointType::UDP_ENDPOINT (the caller takes ownership of it), and NULL for
 * the not-yet-supported GRPC endpoint or for an unknown type.
 */
Endpoint* CreateEndpoint(const char endpointType, const std::string& sip,
                         const int sport, const bool isMasterReceiver) {
  if (endpointType == EndpointType::UDP_ENDPOINT) {
    return new UDPSocketEndpoint(sip, sport, isMasterReceiver);
  } else if (endpointType == EndpointType::GRPC_ENDPOINT) {
    // To support GRPC later
    return NULL;
  } else {
    // endpointType is a char: stream it as a number, otherwise the log would
    // contain a raw (usually unprintable) character
    LOG(ERROR) << "Unknown endpoint type: " << static_cast<int>(endpointType);
    return NULL;
  }
}

/**
 * Factory for message handlers. Returns a new UDPMsgHandler wrapping msghdl
 * and ctx for EndpointType::UDP_ENDPOINT (the caller takes ownership of it),
 * and NULL for the not-yet-supported GRPC endpoint or for an unknown type.
 */
MessageHandler* CreateMsgHandler(const char endpointType,
                                 MessageHandlerFunc msghdl, void* ctx) {
  if (endpointType == EndpointType::UDP_ENDPOINT) {
    return new UDPMsgHandler(msghdl, ctx);
  } else if (endpointType == EndpointType::GRPC_ENDPOINT) {
    // To support GRPC later
    return NULL;
  } else {
    // endpointType is a char: stream it as a number, otherwise the log would
    // contain a raw (usually unprintable) character
    LOG(ERROR) << "Unknown endpoint type: " << static_cast<int>(endpointType);
    return NULL;
  }
}


================================================
FILE: lib/utils.h
================================================
#ifndef NEZHA_UTILS_H
#define NEZHA_UTILS_H

#include <arpa/inet.h>
#include <ev.h>
#include <glog/logging.h>
#include <junction/ConcurrentMap_Leapfrog.h>
#include <netinet/in.h>
#include <openssl/sha.h>
#include <stdio.h>
#include <stdlib.h>
#include <strings.h>
#include <unistd.h>
#include <chrono>
#include <cstring>
#include "concurrentqueue.h"
#include "gflags/gflags.h"
#include "lib/udp_socket_endpoint.h"

/** Short project-wide names for the third-party concurrent containers:
 * ConcurrentQueue<T> is moodycamel::ConcurrentQueue<T> and
 * ConcurrentMap<K, V> is junction::ConcurrentMap_Leapfrog<K, V>. */
template <typename T1>
using ConcurrentQueue = moodycamel::ConcurrentQueue<T1>;
template <typename T1, typename T2>
using ConcurrentMap = junction::ConcurrentMap_Leapfrog<T1, T2>;

/** The concurrent map we used (i.e.junction::ConcurrentMap) reserves 0 and 1 ,
 * so the start value should be 2 */
#define CONCURRENT_MAP_START_INDEX (2u)
/** Pack two 32-bit values into one uint64: a is the high half, b the low
 * half. Every macro argument is parenthesized so that expression arguments
 * (e.g. x | y, c ? p : q) are evaluated as a whole before the cast/shift
 * applies. */
#define CONCAT_UINT32(a, b) ((((uint64_t)(a)) << 32u) | (uint32_t)(b))
/** Get the high/low 32bits of a uint64 */
#define HIGH_32BIT(a) ((uint32_t)((a) >> 32))
#define LOW_32BIT(a) ((uint32_t)(a))

// Since <deadline, reqKey> is sufficient to uniquely identify one request, we
// calculate hash based on them to represent the corresponding request/log.
// The hash is the SHA1 digest of the bytes of deadline followed by the bytes
// of reqKey.
SHA_HASH CalculateHash(uint64_t deadline, uint64_t reqKey);

// Get Current Microsecond Timestamp (system clock, microseconds since epoch)
uint64_t GetMicrosecondTimestamp();

// Factory function, to create different types of endpoints and msghandlers.
// endpointType is one of the EndpointType values; NULL is returned for
// unsupported (GRPC) or unknown types. The returned object is allocated with
// new.
Endpoint* CreateEndpoint(const char endpointType, const std::string& sip = "",
                         const int sport = -1,
                         const bool isMasterReceiver = false);

// msghdl is the callback invoked for each received message and ctx is the
// opaque context handed to it. NULL is returned for unsupported (GRPC) or
// unknown endpoint types.
MessageHandler* CreateMsgHandler(
    const char endpointType,
    std::function<void(MessageHeader*, char*, Address*, void*)> msghdl,
    void* ctx = NULL);

#endif

================================================
FILE: lib/zipfian.h
================================================
/*
 * MIT License
 *
 * Copyright (c) 2017 Lucas Lersch
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

 /* Implementation derived from:
  * "Quickly Generating Billion-Record Synthetic Databases", Jim Gray et al,
  * SIGMOD 1994
  */

  /*
   * The zipfian_int_distribution class is intended to be compatible with other
   * distributions introduced in #include <random> by the C++11 standard.
   *
   * Usage example:
   * #include <random>
   * #include "zipfian_int_distribution.h"
   * int main()
   * {
   *   std::default_random_engine generator;
   *   zipfian_int_distribution<int> distribution(1, 10, 0.99);
   *   int i = distribution(generator);
   * }
   */

   /*
    * IMPORTANT: constructing the distribution object requires calculating the zeta
    * value which becomes prohibitively expensive for very large ranges. As an
    * alternative for such cases, the user can pass the pre-calculated values and
    * avoid the calculation every time.
    *
    * Usage example:
    * #include <random>
    * #include "zipfian_int_distribution.h"
    * int main()
    * {
    *   std::default_random_engine generator;
    *   zipfian_int_distribution<int>::param_type p(1, 1e6, 0.99, 27.000);
    *   zipfian_int_distribution<int> distribution(p);
    *   int i = distribution(generator);
    * }
    */

#include <cmath>
#include <limits>
#include <random>
#include <cassert>

/**
 * @brief Random number distribution producing Zipfian-distributed integers
 *        in the closed range [a, b].
 *
 * The interface mirrors the distributions in <random>: a nested param_type,
 * a()/b()/min()/max() accessors, param() getter/setter, reset(), and two
 * operator() overloads taking a uniform random number generator.
 *
 * @tparam _IntType  Integral result type.
 */
template<typename _IntType = int>
class zipfian_int_distribution
{
    static_assert(std::is_integral<_IntType>::value, "Template argument not an integral type.");

public:
    /** The type of the range of the distribution. */
    typedef _IntType result_type;

    /** Parameter type: bounds, skew factor and the cached zeta values. */
    struct param_type
    {
        typedef zipfian_int_distribution<_IntType> distribution_type;

        /**
         * @brief Builds a parameter set and computes zeta for the range.
         *
         * The zeta computation is linear in the size of the range
         * (b - a + 1), so it is expensive for very large ranges.
         *
         * @param __a [IN]  The inclusive lower bound.
         * @param __b [IN]  The inclusive upper bound.
         * @param __theta [IN]  The skew factor; must be in (0, 1).
         */
        explicit param_type(_IntType __a = 0, _IntType __b = std::numeric_limits<_IntType>::max(), double __theta = 0.99)
            : _M_a(__a), _M_b(__b), _M_theta(__theta),
            _M_zeta(zeta(domain_size(__a, __b), __theta)), _M_zeta2theta(zeta(2, __theta))
        {
            assert(_M_a <= _M_b && _M_theta > 0.0 && _M_theta < 1.0);
        }

        /**
         * @brief Builds a parameter set from a pre-computed zeta value,
         *        skipping the linear-time zeta computation.
         *
         * @param __a [IN]  The inclusive lower bound.
         * @param __b [IN]  The inclusive upper bound.
         * @param __theta [IN]  The skew factor; must be in (0, 1).
         * @param __zeta [IN]  zeta(b - a + 1, theta), supplied by the caller.
         */
        explicit param_type(_IntType __a, _IntType __b, double __theta, double __zeta)
            : _M_a(__a), _M_b(__b), _M_theta(__theta), _M_zeta(__zeta),
            _M_zeta2theta(zeta(2, __theta))
        {
            // Plain assert (not the libstdc++-internal __glibcxx_assert) so the
            // header builds with any standard library.
            assert(_M_a <= _M_b && _M_theta > 0.0 && _M_theta < 1.0);
        }

        result_type a() const { return _M_a; }

        result_type b() const { return _M_b; }

        double theta() const { return _M_theta; }

        double zeta() const { return _M_zeta; }

        double zeta2theta() const { return _M_zeta2theta; }

        friend bool operator==(const param_type& __p1, const param_type& __p2)
        {
            return __p1._M_a == __p2._M_a
                && __p1._M_b == __p2._M_b
                && __p1._M_theta == __p2._M_theta
                && __p1._M_zeta == __p2._M_zeta
                && __p1._M_zeta2theta == __p2._M_zeta2theta;
        }

        friend bool operator!=(const param_type& __p1, const param_type& __p2)
        {
            return !(__p1 == __p2);
        }

    private:
        _IntType _M_a;
        _IntType _M_b;
        double _M_theta;
        double _M_zeta;
        double _M_zeta2theta;

        /**
         * @brief Number of values in [__a, __b], computed in unsigned
         *        arithmetic so that b - a + 1 cannot overflow _IntType.
         *
         * The range is validated here, before the (potentially very long)
         * zeta loop runs on a nonsensical size.
         */
        static unsigned long domain_size(_IntType __a, _IntType __b)
        {
            assert(__a <= __b);
            return static_cast<unsigned long>(__b) - static_cast<unsigned long>(__a) + 1ul;
        }

        /**
         * @brief Calculates zeta: the sum of (1/i)^theta for i in [1, n].
         *
         * @param __n [IN]  The size of the domain.
         * @param __theta [IN]  The skew factor of the distribution.
         */
        static double zeta(unsigned long __n, double __theta)
        {
            double ans = 0.0;
            for (unsigned long i = 1; i <= __n; ++i)
                ans += std::pow(1.0 / i, __theta);
            return ans;
        }
    };

public:
    /**
     * @brief Constructs a zipfian_int_distribution object.
     *
     * @param __a [IN]  The lower bound of the distribution.
     * @param __b [IN]  The upper bound of the distribution.
     * @param __theta [IN]  The skew factor of the distribution.
     */
    explicit zipfian_int_distribution(_IntType __a = _IntType(0), _IntType __b = _IntType(1), double __theta = 0.99)
        : _M_param(__a, __b, __theta)
    { }

    /**
     * @brief Constructs a zipfian_int_distribution from an existing
     *        parameter set (e.g. one built with a pre-computed zeta).
     */
    explicit zipfian_int_distribution(const param_type& __p) : _M_param(__p)
    { }

    /**
     * @brief Resets the distribution state.
     *
     * Does nothing for the zipfian int distribution.
     */
    void reset() { }

    result_type a() const { return _M_param.a(); }

    result_type b() const { return _M_param.b(); }

    double theta() const { return _M_param.theta(); }

    /**
     * @brief Returns the parameter set of the distribution.
     */
    param_type param() const { return _M_param; }

    /**
     * @brief Sets the parameter set of the distribution.
     * @param __param The new parameter set of the distribution.
     */
    void param(const param_type& __param) { _M_param = __param; }

    /**
     * @brief Returns the inclusive lower bound of the distribution range.
     */
    result_type min() const { return this->a(); }

    /**
     * @brief Returns the inclusive upper bound of the distribution range.
     */
    result_type max() const { return this->b(); }

    /**
     * @brief Draws one value using the distribution's own parameters.
     */
    template<typename _UniformRandomNumberGenerator>
    result_type operator()(_UniformRandomNumberGenerator& __urng)
    {
        return this->operator()(__urng, _M_param);
    }

    /**
     * @brief Draws one value using the supplied parameter set.
     *
     * @param __urng [IN]  Uniform random number generator.
     * @param __p [IN]  Parameter set to draw with.
     * @return A value in [__p.a(), __p.b()].
     */
    template<typename _UniformRandomNumberGenerator>
    result_type operator()(_UniformRandomNumberGenerator& __urng, const param_type& __p)
    {
        // Domain size in floating point: avoids overflowing _IntType when the
        // range spans (nearly) the whole type.
        const double n = static_cast<double>(__p.b()) - static_cast<double>(__p.a()) + 1.0;
        const double alpha = 1 / (1 - __p.theta());
        const double eta = (1 - std::pow(2.0 / n, 1 - __p.theta())) / (1 - __p.zeta2theta() / __p.zeta());

        // u is uniform in [0, 1).
        double u = std::generate_canonical<double, std::numeric_limits<double>::digits, _UniformRandomNumberGenerator>(__urng);

        // The two most frequent values are handled explicitly.
        double uz = u * __p.zeta();
        if (uz < 1.0) return __p.a();
        if (uz < 1.0 + std::pow(0.5, __p.theta())) return __p.a() + 1;

        // Remaining values: closed-form approximation; the fractional part is
        // truncated by the conversion to result_type.
        return static_cast<result_type>(__p.a() + (n * std::pow(eta * u - eta + 1, alpha)));
    }

    /**
     * @brief Return true if two zipfian int distributions have
     *        the same parameters.
     */
    friend bool operator==(const zipfian_int_distribution& __d1, const zipfian_int_distribution& __d2)
    {
        return __d1._M_param == __d2._M_param;
    }

    /**
     * @brief Return true if two zipfian int distributions have
     *        different parameters.
     */
    friend bool operator!=(const zipfian_int_distribution& __d1, const zipfian_int_distribution& __d2)
    {
        return !(__d1 == __d2);
    }

private:
    param_type _M_param;
};


================================================
FILE: license.md
================================================
MIT License

Copyright (c) 2022-2024 Jinkun Geng

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

================================================
FILE: micro-bench/BUILD
================================================
# Micro-benchmark binary built from bench_sender.cc.
# Depends on the project's proto, utils, address and zipfian libraries, and
# links against libraries looked up under /usr/local/lib (libev, turf,
# junction, crypto, gflags, glog, yaml-cpp).
cc_binary(
    name = "bench_sender",
    srcs = ["bench_sender.cc"],
    deps = [
        "//proto:nezha_cc_proto",
        "//lib:utils",
        "//lib:address",
        "//lib:zipfian",
    ],
    copts = [
        "-I/usr/local/include"
    ],
    linkopts = [ "-L/usr/local/lib", "-lev", "-ldl", "-lturf", "-ljunction", 
                "-lcrypto", "-lgflags",  "-lglog",  "-lyaml-cpp", "-pthread" ],

)


# Micro-benchmark binary built from bench_receiver.cc.
# Uses the same deps, include path and link options as the other
# micro-bench targets in this file.
cc_binary(
    name = "bench_receiver",
    srcs = ["bench_receiver.cc"],
    deps = [
        "//proto:nezha_cc_proto",
        "//lib:utils",
        "//lib:address",
        "//lib:zipfian",
    ],
    copts = [
        "-I/usr/local/include"
    ],
    linkopts = [ "-L/usr/local/lib", "-lev", "-ldl", "-lturf", "-ljunction", 
                "-lcrypto", "-lgflags",  "-lglog",  "-lyaml-cpp", "-pthread" ],

)




# Offline trace-analysis binary built from analysis.cc.
# Uses the same deps, include path and link options as the other
# micro-bench targets in this file.
cc_binary(
    name = "analysis",
    srcs = ["analysis.cc"],
    deps = [
        "//proto:nezha_cc_proto",
        "//lib:utils",
        "//lib:address",
        "//lib:zipfian",
    ],
    copts = [
        "-I/usr/local/include"
    ],
    linkopts = [ "-L/usr/local/lib", "-lev", "-ldl", "-lturf", "-ljunction", 
                "-lcrypto", "-lgflags",  "-lglog",  "-lyaml-cpp", "-pthread" ],

)



================================================
FILE: micro-bench/analysis.cc
================================================
#include <fstream>
#include <iostream>
#include "lib/utils.h"
#include "lib/zipfian.h"
#include "proto/nezha_proto.pb.h"

// Directory holding the per-replica trace files; main() reads
// "<folder>/Replica-Stats-<id>.csv" from it.
DEFINE_string(folder, "/home/steam1994/micro-stats/2-10000-0-50",
              "The folder of the csv");

// Number of trace files to process: file 0 is the baseline, files
// 1 .. replica_num-1 are compared against it.
DEFINE_int32(replica_num, 2, "The number of replicas");

int main(int argc, char* argv[]) {
  gflags::ParseCommandLineFlags(&argc, &argv, true);
  google::InitGoogleLogging(argv[0]);
  FLAGS_logtostderr = 1;
  //   std::vector<uint32_t> zipfianKeys;
  //   uint32_t keyNum = 1000000;
  //   zipfianKeys.resize(1000000, 0);
  //   uint32_t skewFactor = 0.5;
  //   if (keyNum > 1) {
  //     std::default_random_engine generator(1);  // clientId as the seed
  //     zipfian_int_distribution<uint32_t> zipfianDistribution(0, keyNum - 1,
  //                                                            skewFactor);
  //     for (uint32_t i = 0; i < zipfianKeys.size(); i++) {
  //       zipfianKeys[i] = zipfianDistribution(generator);
  //     }
  //   }

  std::string r0Fname = FLAGS_folder + "/" + "Replica-Stats-0.csv";
  std::ifstream ifs1(r0Fname);
  LOG(INFO) << "fname=" << r0Fname;
  uint32_t clientId, reqId;
  uint32_t id = 0;
  std::map<uint64_t, uint32_t> mapIdx;
  std::map<uint64_t, uint64_t> mapKey;
  while (ifs1 >> clientId >> reqId) {
    uint64_t reqKey = CONCAT_UINT32(clientId, reqId);
    mapIdx[reqKey] = id;
    id++;
  }

  for (int i = 1; i < FLAGS_replica_num; i++) {
    std::string r1Fname =
        FLAGS_folder + "/" + "Replica-Stats-" + std::to_string(i) + ".csv";
    std::ifstream ifs2(r1Fname);
    std::vector<uint64_t> reqKeys;
    reqKeys.reserve(100000);
    std::vector<uint32_t> mappedIds;
    mappedIds.reserve(100000);
    while (ifs2 >> clientId >> reqId) {
      uint64_t reqKey = CONCAT_UINT32(clientId, reqId);
      reqKeys.push_back(reqKey);
      mappedIds.push_back(mapIdx[reqKey]);
    }
    uint32_t reorderedCase = 0;
    for (uint32_t i = 1; i < reqKeys.size(); i++) {
      if (mappedIds[i] == 0 || mappedIds[i] < mappedIds[i - 1]) {
        reorderedCase++;
      }
    }
    LOG(INFO) << "reorderedCase=" << reorderedCase << "\t"
              << "total=" << id << "\t rate=" << reorderedCase * 1.0 / id;
  }
}

================================================
FILE: micro-bench/bench_receiver.cc
================================================
#include <fstream>
#include <iostream>
#include "lib/utils.h"
#include "lib/zipfian.h"
#include "proto/nezha_proto.pb.h"
// Command-line flags of the benchmark receiver.
DEFINE_string(receiver_ip, "127.0.0.1", "The ip address of the receiver");

DEFINE_int32(receiver_port, 33333, "The port of the receiver");

// replica_id names the output file (Replica-Stats-<id>.csv) and is stamped
// into the replies sent by OWDTd.
DEFINE_int32(replica_id, 1, "The id of the replica");
// When 1, ProcessTd buffers requests and releases them in deadline order;
// otherwise requests are recorded in the order they are dequeued.
DEFINE_int32(enable_dom, 0, "Whether enable DOM");
DEFINE_int32(percentile, 50, "The percentile of the owd estimation");

// Port paired with the sender's IP when MsgHandlerFunc records a client's
// address.
DEFINE_int32(client_port, 33336,
             "The port of the client listens for OWD reply");

// clientId -> address (sender IP + FLAGS_client_port), filled by
// MsgHandlerFunc the first time a client is seen.
ConcurrentMap<uint32_t, Address*> clientAddrs;
// (clientId, one-way delay in microseconds) samples produced by
// MsgHandlerFunc and consumed by OWDTd.
ConcurrentQueue<std::pair<uint32_t, uint32_t>> owdQu;
// Parsed requests handed from MsgHandlerFunc to ProcessTd.
ConcurrentQueue<nezha::proto::Request> processQu;
// (clientId, reqId) of every request in the order ProcessTd recorded it;
// dumped to the CSV file on termination.
std::vector<std::pair<uint32_t, uint32_t>> traceVec;
// Receive callback for incoming messages.
//
// Only non-empty CLIENT_REQUEST messages that parse as nezha::proto::Request
// are handled; everything else is dropped silently. For a handled request:
//   1. the client's address (sender IP + FLAGS_client_port) is recorded the
//      first time the client id is seen;
//   2. the request is queued for ProcessTd;
//   3. if the local clock is ahead of the request's send time, the difference
//      is queued as a one-way-delay sample for OWDTd.
void MsgHandlerFunc(MessageHeader* msgHeader, char* msgBuffer, Address* sender,
                    void* context) {
  if (msgHeader->msgType != MessageType::CLIENT_REQUEST) {
    return;
  }
  if (!(msgHeader->msgLen > 0)) {
    return;
  }

  nezha::proto::Request incoming;
  if (!incoming.ParseFromArray(msgBuffer, msgHeader->msgLen)) {
    return;
  }

  // Remember where to reach this client on first contact.
  if (clientAddrs.get(incoming.clientid()) == NULL) {
    Address* replyAddr =
        new Address(sender->GetIPAsString(), FLAGS_client_port);
    clientAddrs.assign(incoming.clientid(), replyAddr);
  }

  processQu.enqueue(incoming);

  // Only a positive delay is reported as a sample.
  const uint64_t arrivalTime = GetMicrosecondTimestamp();
  if (arrivalTime > incoming.sendtime()) {
    uint32_t delay = arrivalTime - incoming.sendtime();
    owdQu.enqueue({incoming.clientid(), delay});
  }
}
// Processing thread: drains processQu, records the (clientId, reqId) order
// of requests in traceVec, and on termination writes the trace to
// "Replica-Stats-<replica_id>.csv" and exits the process.
//
// With FLAGS_enable_dom == 1, each request is held in an early buffer keyed
// by (sendtime + bound, request key) and is only recorded once that deadline
// is not later than the current time, so requests are recorded in deadline
// order. Otherwise requests are recorded in the order they are dequeued.
//
// Termination happens after 60 seconds or once the trace reaches
// kMaxTraceEntries entries, whichever comes first.
void ProcessTd() {
  // Upper bound on recorded entries; also the up-front reservation size.
  constexpr size_t kMaxTraceEntries = 10000000ul;
  // Maximum running time of the thread, in microseconds.
  constexpr uint64_t kMaxRunTimeUs = 60 * 1000ul * 1000ul;

  traceVec.reserve(kMaxTraceEntries);
  nezha::proto::Request request;
  // Ordered by (deadline, request key), so begin() is the earliest deadline.
  std::map<std::pair<uint64_t, uint64_t>, nezha::proto::Request> earlyBuffer;
  uint64_t startTime = GetMicrosecondTimestamp();
  LOG(INFO) << "FLAGS_enable_dom=" << FLAGS_enable_dom;
  while (true) {
    if (FLAGS_enable_dom == 1) {
      // At most one request is moved into the early buffer per iteration.
      if (processQu.try_dequeue(request)) {
        uint64_t deadline = request.sendtime() + request.bound();
        uint64_t reqKey = CONCAT_UINT32(request.clientid(), request.reqid());
        earlyBuffer.insert({{deadline, reqKey}, request});
      }
      // Release every buffered request whose deadline has passed.
      uint64_t nowTime = GetMicrosecondTimestamp();
      while (earlyBuffer.empty() == false &&
             earlyBuffer.begin()->first.first <= nowTime) {
        traceVec.push_back({earlyBuffer.begin()->second.clientid(),
                            earlyBuffer.begin()->second.reqid()});
        earlyBuffer.erase(earlyBuffer.begin());
        if (traceVec.size() >= kMaxTraceEntries) {
          break;
        }
      }
    } else {
      while (processQu.try_dequeue(request)) {
        traceVec.push_back({request.clientid(), request.reqid()});
        if (traceVec.size() >= kMaxTraceEntries) {
          break;
        }
      }
    }
    uint64_t nowTime = GetMicrosecondTimestamp();

    if (nowTime - startTime >= kMaxRunTimeUs ||
        traceVec.size() >= kMaxTraceEntries) {
      LOG(INFO) << "To terminated ..." << traceVec.size();
      std::ofstream ofs("Replica-Stats-" + std::to_string(FLAGS_replica_id) +
                        ".csv");
      // ofs << "ClientID,ReqID" << std::endl;
      // '\n' instead of std::endl: same file contents without forcing a
      // flush for each of up to kMaxTraceEntries rows; close() flushes once.
      for (auto& p : traceVec) {
        ofs << p.first << "\t" << p.second << '\n';
      }
      ofs.close();
      exit(0);
    }
  }
}

void OWDTd() {
  std::pair<uint32_t, uint32_t> owdSample;
  std::map<uint32_t, std::vector<uint32_t>> owdMap;
  std::map<uint32_t, uint32_t> owdCnt;
  UDPSocketEndpoint* replyEP = dynamic_cast<UDPSocketEndpoint*>(
      CreateEndpoint(EndpointType::UDP_ENDPOINT));
  nezha::proto::Reply reply;
  reply.set_replicaid(FLAGS_replica_id);
  while (true) {
    if (owdQu.try_dequeue(owdSample)) {
      uint32_t senderId = owdSample.first;
      uint32_t owd = owdSample.second;
      if (owdMap.find(senderId) == owdMap.end()) {
        owdMap[sen
Download .txt
gitextract_rknzum1x/

├── .github/
│   └── workflows/
│       └── build.yaml
├── .gitignore
├── .vscode/
│   └── settings.json
├── README.md
├── WORKSPACE
├── client/
│   ├── BUILD
│   ├── client.cc
│   ├── client.h
│   ├── client_config.h
│   └── client_run.cc
├── configs/
│   ├── dist/
│   │   ├── nezha-client-config.yaml
│   │   ├── nezha-proxy-config.yaml
│   │   ├── nezha-replica-config-0.yaml
│   │   ├── nezha-replica-config-1.yaml
│   │   ├── nezha-replica-config-2.yaml
│   │   └── nezha-replica-config.yaml
│   ├── local/
│   │   ├── nezha-client-config.yaml
│   │   ├── nezha-proxy-config.yaml
│   │   ├── nezha-replica-config-0.yaml
│   │   ├── nezha-replica-config-1.yaml
│   │   └── nezha-replica-config-2.yaml
│   ├── nezha-client-config-template.yaml
│   ├── nezha-proxy-config-template.yaml
│   └── nezha-replica-config-template.yaml
├── docs/
│   ├── Nezha.tla
│   ├── demo.md
│   └── tla-intro.md
├── external/
│   ├── gogoprotobuf.BUILD
│   └── googleapi.BUILD
├── lib/
│   ├── BUILD
│   ├── Rules.mk
│   ├── address.cc
│   ├── address.h
│   ├── common_struct.h
│   ├── common_type.h
│   ├── endpoint.cc
│   ├── endpoint.h
│   ├── message_handler.h
│   ├── message_type.cc
│   ├── message_type.h
│   ├── timer.h
│   ├── udp_socket_endpoint.cc
│   ├── udp_socket_endpoint.h
│   ├── utils.cc
│   ├── utils.h
│   └── zipfian.h
├── license.md
├── micro-bench/
│   ├── BUILD
│   ├── analysis.cc
│   ├── bench_receiver.cc
│   ├── bench_sender.cc
│   └── launch_micro.py
├── proto/
│   ├── BUILD
│   └── nezha_proto.proto
├── proxy/
│   ├── BUILD
│   ├── proxy.cc
│   ├── proxy.h
│   ├── proxy_config.h
│   └── proxy_run.cc
├── replica/
│   ├── BUILD
│   ├── replica.cc
│   ├── replica.h
│   ├── replica_config.h
│   └── replica_run.cc
├── scripts/
│   ├── analysis.py
│   ├── launch.py
│   ├── local_test.sh
│   └── ttcs-agent.cfg
├── third_party/
│   ├── concurrentqueue/
│   │   └── BUILD.bazel
│   ├── glog/
│   │   ├── BUILD.bazel
│   │   ├── BUILD.glog
│   │   └── glog.bzl
│   ├── junction/
│   │   ├── BUILD.bazel
│   │   └── junction.patch
│   ├── libev/
│   │   └── BUILD.bazel
│   ├── openssl/
│   │   └── BUILD.bazel
│   └── turf/
│       └── BUILD.bazel
└── ttcs-agent.cfg
Download .txt
SYMBOL INDEX (102 symbols across 30 files)

FILE: client/client.cc
  type nezha (line 3) | namespace nezha {

FILE: client/client.h
  function namespace (line 9) | namespace nezha {

FILE: client/client_config.h
  function catch (line 7) | struct ClientConfig {

FILE: client/client_run.cc
  function Terminate (line 4) | void Terminate(int para) {
  function main (line 7) | int main(int argc, char* argv[]) {

FILE: lib/address.h
  function class (line 20) | class Address {

FILE: lib/common_struct.h
  function else (line 22) | struct MessageHeader {
  function XOR (line 49) | void XOR(const SHA_HASH& h) {
  function LessThan (line 65) | struct RequestBody {
  function LessOrEqual (line 100) | bool LessOrEqual(const RequestBody& bigger) {
  function LessOrEqual (line 104) | bool LessOrEqual(const std::pair<uint64_t, uint64_t>& bigger) {
  function LessThan (line 193) | bool LessThan(const std::pair<uint64_t, uint64_t>& bigger) {
  function LessOrEqual (line 196) | bool LessOrEqual(const LogEntry& bigger) {
  function LessOrEqual (line 199) | bool LessOrEqual(const std::pair<uint64_t, uint64_t>& bigger) {
  type CrashVectorStruct (line 208) | struct CrashVectorStruct {

FILE: lib/common_type.h
  type EndpointType (line 6) | enum EndpointType {
  type ReplicaStatus (line 13) | enum ReplicaStatus { NORMAL = 1, VIEWCHANGE, RECOVERING, TERMINATED }
  type EntryStatus (line 17) | enum EntryStatus {
  type MessageType (line 31) | enum MessageType {

FILE: lib/endpoint.h
  function class (line 29) | class Endpoint {

FILE: lib/message_handler.h
  type std (line 39) | typedef std::function<void(MessageHeader*, char*, Address*, void*)>
  function MessageHandler (line 42) | struct MessageHandler {
  function MessageHandler (line 55) | struct UDPMsgHandler : MessageHandler {

FILE: lib/message_type.cc
  type MessageType (line 5) | namespace MessageType {

FILE: lib/message_type.h
  type MessageHeader (line 10) | struct MessageHeader {

FILE: lib/timer.h
  type std (line 35) | typedef std::function<void(void*, void*)> TimerFunc;
  type Timer (line 37) | struct Timer {

FILE: lib/udp_socket_endpoint.cc
  type sockaddr_in (line 19) | struct sockaddr_in
  type sockaddr (line 25) | struct sockaddr
  type sockaddr (line 53) | struct sockaddr

FILE: lib/udp_socket_endpoint.h
  function class (line 6) | class UDPSocketEndpoint : public Endpoint {

FILE: lib/utils.cc
  function SHA_HASH (line 3) | SHA_HASH CalculateHash(uint64_t deadline, uint64_t reqKey) {
  function GetMicrosecondTimestamp (line 15) | uint64_t GetMicrosecondTimestamp() {
  function Endpoint (line 20) | Endpoint* CreateEndpoint(const char endpointType, const std::string& sip,
  function MessageHandler (line 33) | MessageHandler* CreateMsgHandler(const char endpointType,

FILE: lib/zipfian.h
  type param_type (line 77) | struct param_type
  function zeta (line 127) | double zeta(unsigned long __n, double __theta)
  function explicit (line 148) | explicit zipfian_int_distribution(const param_type& __p) : _M_param(__p)
  function reset (line 156) | void reset() { }
  function param (line 173) | void param(const param_type& __param) { _M_param = __param; }

FILE: micro-bench/analysis.cc
  function main (line 12) | int main(int argc, char* argv[]) {

FILE: micro-bench/bench_receiver.cc
  function MsgHandlerFunc (line 21) | void MsgHandlerFunc(MessageHeader* msgHeader, char* msgBuffer, Address* ...
  function ProcessTd (line 41) | void ProcessTd() {
  function OWDTd (line 89) | void OWDTd() {
  function main (line 125) | int main(int argc, char* argv[]) {

FILE: micro-bench/bench_sender.cc
  function ReplyHandlerFunc (line 34) | void ReplyHandlerFunc(MessageHeader* msgHeader, char* msgBuffer,
  function OWDUpdate (line 53) | void OWDUpdate() {
  function main (line 63) | int main(int argc, char* argv[]) {

FILE: micro-bench/launch_micro.py
  function generate_ttcs_cfg_file (line 24) | def generate_ttcs_cfg_file(internal_ip, is_reference=False, use_ntp=False):
  function retry_proc_error (line 69) | def retry_proc_error(procs_list):
  function start_ttcs_node (line 79) | def start_ttcs_node(internal_ip, is_reference, use_ntp=False):
  function launch_ttcs (line 118) | def launch_ttcs(server_ip_list):
  function scp_files (line 132) | def scp_files(server_ip_list, local_path_to_file, remote_dir, to_remote):
  function run_command (line 197) | def run_command(server_ip_list, cmd, in_background=True):
  function create_instance (line 262) | def create_instance(instance_name,
  function del_instance_list (line 307) | def del_instance_list(instance_list, zone="us-central1-a"):
  function stop_instance_list (line 314) | def stop_instance_list(instance_list, zone="us-central1-a"):
  function start_instance_list (line 322) | def start_instance_list(instance_list, zone="us-central1-a"):

FILE: proxy/proxy.cc
  type nezha (line 3) | namespace nezha {
    type sockaddr_in (line 43) | struct sockaddr_in
    type sockaddr_in (line 79) | struct sockaddr_in
    type sockaddr (line 85) | struct sockaddr
    type sockaddr_in (line 174) | struct sockaddr_in
    type sockaddr (line 183) | struct sockaddr
    type sockaddr_in (line 269) | struct sockaddr_in
    type sockaddr (line 278) | struct sockaddr
    type sockaddr_in (line 300) | struct sockaddr_in
    type sockaddr (line 309) | struct sockaddr
    function Reply (line 351) | Reply* Proxy::isQuorumReady(std::vector<uint64_t>& replicaSyncedPoint,
    type sockaddr_in (line 423) | struct sockaddr_in
    type sockaddr (line 431) | struct sockaddr
    type sockaddr_in (line 449) | struct sockaddr_in
    type sockaddr_in (line 458) | struct sockaddr_in
    type sockaddr (line 463) | struct sockaddr
    type sockaddr_in (line 530) | struct sockaddr_in
    type sockaddr_in (line 531) | struct sockaddr_in

FILE: proxy/proxy.h
  function namespace (line 7) | namespace nezha {

FILE: proxy/proxy_config.h
  function catch (line 7) | struct ProxyConfig {

FILE: proxy/proxy_run.cc
  function Terminate (line 5) | void Terminate(int para) {
  function main (line 8) | int main(int argc, char* argv[]) {

FILE: replica/replica.cc
  type nezha (line 3) | namespace nezha {
    type timespec (line 653) | struct timespec
    type timespec (line 1070) | struct timespec
    type timespec (line 1117) | struct timespec
    type timespec (line 1125) | struct timespec

FILE: replica/replica.h
  function namespace (line 14) | namespace nezha {
  function class (line 55) | class Replica {

FILE: replica/replica_config.h
  function catch (line 7) | struct ReplicaConfig {

FILE: replica/replica_run.cc
  function Terminate (line 6) | void Terminate(int para) {
  function main (line 10) | int main(int argc, char* argv[]) {

FILE: scripts/analysis.py
  function throughput_apply_func (line 12) | def throughput_apply_func(group):
  function ThroughputAnalysis (line 18) | def ThroughputAnalysis(merge_df):

FILE: scripts/launch.py
  function generate_ttcs_cfg_file (line 24) | def generate_ttcs_cfg_file(internal_ip, is_reference=False, use_ntp=False):
  function retry_proc_error (line 69) | def retry_proc_error(procs_list):
  function start_ttcs_node (line 79) | def start_ttcs_node(internal_ip, is_reference, use_ntp=False):
  function launch_ttcs (line 118) | def launch_ttcs(server_ip_list):
  function scp_files (line 132) | def scp_files(server_ip_list, local_path_to_file, remote_dir, to_remote):
  function run_command (line 197) | def run_command(server_ip_list, cmd, in_background=True):
  function create_instance (line 262) | def create_instance(instance_name,
  function del_instance_list (line 307) | def del_instance_list(instance_list, zone="us-central1-a"):
  function stop_instance_list (line 314) | def stop_instance_list(instance_list, zone="us-central1-a"):
  function start_instance_list (line 322) | def start_instance_list(instance_list, zone="us-central1-a"):
Condensed preview — 78 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (446K chars).
[
  {
    "path": ".github/workflows/build.yaml",
    "chars": 1024,
    "preview": "# Bazel action to build & test specific targets.\nname: Bazel build\n\non:\n  push:\n    branches: [main]\n  pull_request:\n   "
  },
  {
    "path": ".gitignore",
    "chars": 21,
    "preview": "/.obj\n/.bin\n/bazel-*\n"
  },
  {
    "path": ".vscode/settings.json",
    "chars": 2399,
    "preview": "{\n    \"C_Cpp.formatting\": \"clangFormat\",\n    \"C_Cpp.clang_format_fallbackStyle\": \"{BasedOnStyle: Google, IncludeBlocks: "
  },
  {
    "path": "README.md",
    "chars": 8901,
    "preview": "# Nezha\n\n<img src=\"docs/nezha-img.jpeg\" width=\"200\">\n\n----\nNezha (哪吒) is a legendary figure in Chinese mythology. Nezha "
  },
  {
    "path": "WORKSPACE",
    "chars": 5360,
    "preview": "load(\"@bazel_tools//tools/build_defs/repo:http.bzl\", \"http_archive\")\nload(\"@bazel_tools//tools/build_defs/repo:git.bzl\","
  },
  {
    "path": "client/BUILD",
    "chars": 537,
    "preview": "load(\"@rules_proto//proto:defs.bzl\", \"proto_library\")\n\ncc_library(\n    name = \"client_config\",\n    hdrs = [\"client_confi"
  },
  {
    "path": "client/client.cc",
    "chars": 16770,
    "preview": "#include \"client/client.h\"\n\nnamespace nezha {\nClient::Client(const std::string& configFile) {\n  hop3s.reserve(500000);\n "
  },
  {
    "path": "client/client.h",
    "chars": 6094,
    "preview": "#include <yaml-cpp/yaml.h>\n#include <fstream>\n#include <iostream>\n#include \"client_config.h\"\n#include \"lib/utils.h\"\n#inc"
  },
  {
    "path": "client/client_config.h",
    "chars": 2503,
    "preview": "#include <glog/logging.h>\n#include <stdint.h>\n#include <yaml-cpp/yaml.h>\n#include <string>\n#include <vector>\n\nstruct Cli"
  },
  {
    "path": "client/client_run.cc",
    "chars": 486,
    "preview": "#include \"client/client.h\"\nDEFINE_string(config, \"nezhav2/config/nezha-client-config-0.yaml\", \"The config file for the c"
  },
  {
    "path": "configs/dist/nezha-client-config.yaml",
    "chars": 669,
    "preview": "---\nprint-config: true\nproxy-info:\n  proxy-ips:\n    - \"10.128.2.13\"\n  proxy-shards: 1\n  request-port-base: 32000\nclient-"
  },
  {
    "path": "configs/dist/nezha-proxy-config.yaml",
    "chars": 591,
    "preview": "---\nprint-config: true\n# Replica Info\nreplica-info:\n  replica-ips:\n    - \"10.128.2.10\"\n    - \"10.128.2.11\"\n    - \"10.128"
  },
  {
    "path": "configs/dist/nezha-replica-config-0.yaml",
    "chars": 2692,
    "preview": "---\nprint-config: true\nreplica-ips:\n  - \"10.128.2.10\"\n  - \"10.128.2.11\"\n  - \"10.128.2.12\"\nreplica-id: 0\nreceiver-shards:"
  },
  {
    "path": "configs/dist/nezha-replica-config-1.yaml",
    "chars": 2692,
    "preview": "---\nprint-config: true\nreplica-ips:\n  - \"10.128.2.10\"\n  - \"10.128.2.11\"\n  - \"10.128.2.12\"\nreplica-id: 1\nreceiver-shards:"
  },
  {
    "path": "configs/dist/nezha-replica-config-2.yaml",
    "chars": 2692,
    "preview": "---\nprint-config: true\nreplica-ips:\n  - \"10.128.2.10\"\n  - \"10.128.2.11\"\n  - \"10.128.2.12\"\nreplica-id: 2\nreceiver-shards:"
  },
  {
    "path": "configs/dist/nezha-replica-config.yaml",
    "chars": 2692,
    "preview": "---\nprint-config: true\nreplica-ips:\n  - \"10.128.2.10\"\n  - \"10.128.2.11\"\n  - \"10.128.2.12\"\nreplica-id: 0\nreceiver-shards:"
  },
  {
    "path": "configs/local/nezha-client-config.yaml",
    "chars": 824,
    "preview": "---\nclient-id: 1\nclient-ip: \"127.0.0.5\"\nendpoint-type: 1 # 1 for UDP Endpoint, 2 for GRPC Endpoint [to be supported soon"
  },
  {
    "path": "configs/local/nezha-proxy-config.yaml",
    "chars": 665,
    "preview": "---\n# Proxy Info\nproxy-endpoint-type: 1 # 1 for UDP Endpoint, 2 for GRPC Endpoint [to be supported soon]\nproxy-id: 1\npro"
  },
  {
    "path": "configs/local/nezha-replica-config-0.yaml",
    "chars": 3179,
    "preview": "---\nprint-config: true\nreplica-ips:\n  - \"127.0.0.1\"\n  - \"127.0.0.2\"\n  - \"127.0.0.3\"\nendpoint-type: 1 # 1 for UDP Endpoin"
  },
  {
    "path": "configs/local/nezha-replica-config-1.yaml",
    "chars": 3179,
    "preview": "---\nprint-config: true\nreplica-ips:\n  - \"127.0.0.1\"\n  - \"127.0.0.2\"\n  - \"127.0.0.3\"\nendpoint-type: 1 # 1 for UDP Endpoin"
  },
  {
    "path": "configs/local/nezha-replica-config-2.yaml",
    "chars": 3179,
    "preview": "---\nprint-config: true\nreplica-ips:\n  - \"127.0.0.1\"\n  - \"127.0.0.2\"\n  - \"127.0.0.3\"\nendpoint-type: 1 # 1 for UDP Endpoin"
  },
  {
    "path": "configs/nezha-client-config-template.yaml",
    "chars": 880,
    "preview": "---\nprint-config: true\nproxy-info:\n  proxy-ips:\n    - \"127.0.0.4\"\n  proxy-shards: 12\n  request-port-base: 32000\nclient-i"
  },
  {
    "path": "configs/nezha-proxy-config-template.yaml",
    "chars": 683,
    "preview": "---\nprint-config: true\n# Replica Info\nreplica-info:\n  replica-ips:\n    - \"127.0.0.1\"\n    - \"127.0.0.2\"\n    - \"127.0.0.3\""
  },
  {
    "path": "configs/nezha-replica-config-template.yaml",
    "chars": 3494,
    "preview": "---\nprint-config: true\nreplica-ips:\n  - \"127.0.0.1\"\n  - \"127.0.0.2\"\n  - \"127.0.0.3\"\nendpoint-type: 1 # 1 for UDP Endpoin"
  },
  {
    "path": "docs/Nezha.tla",
    "chars": 66584,
    "preview": "\n`^\\textbf{\\large N  TLA+ Specification}\\\\^' \n \n------------------------------ MODULE Nezha ----------------------------"
  },
  {
    "path": "docs/demo.md",
    "chars": 6597,
    "preview": "\n## One-Box Demo\nWe have prepared the configuration files in ```configs``` folder, these configuration files will be use"
  },
  {
    "path": "docs/tla-intro.md",
    "chars": 218,
    "preview": "# Nezha TLA+\n\nThis repository includes a model-checked TLA+ specification (both the source file and the pdf version) for"
  },
  {
    "path": "external/gogoprotobuf.BUILD",
    "chars": 186,
    "preview": "package(default_visibility=['//visibility:public'])\n\nproto_library(\n    name = \"gogo_proto\",\n    srcs = [\"gogoproto/gogo"
  },
  {
    "path": "external/googleapi.BUILD",
    "chars": 307,
    "preview": "package(default_visibility=['//visibility:public'])\n\nproto_library(\nname = 'annotations_proto',\nsrcs = ['google/api/anno"
  },
  {
    "path": "lib/BUILD",
    "chars": 2236,
    "preview": "load(\"@rules_proto//proto:defs.bzl\", \"proto_library\")\n\ncc_library(\n    name = \"zipfian\",\n    srcs = [\"zipfian.h\"],\n    h"
  },
  {
    "path": "lib/Rules.mk",
    "chars": 321,
    "preview": "d := $(dir $(lastword $(MAKEFILE_LIST)))\n\nSRCS += $(addprefix $(d), \\\n\taddress.cc utils.cc udp_socket_endpoint.cc)\n\nLIB-"
  },
  {
    "path": "lib/address.cc",
    "chars": 572,
    "preview": "#include \"lib/address.h\"\n\nAddress::Address() : ip_(\"\"), port_(-1), mac_(\"\") {\n  bzero(&addr_, sizeof(addr_));\n}\nAddress:"
  },
  {
    "path": "lib/address.h",
    "chars": 957,
    "preview": "#ifndef NEZHA_ADDRESS\n#define NEZHA_ADDRESS\n#include <arpa/inet.h>\n#include <netinet/in.h>\n#include <sys/socket.h>\n#incl"
  },
  {
    "path": "lib/common_struct.h",
    "chars": 8466,
    "preview": "\n\n#ifndef NEZHA_COMMON_STRUCT_H\n#define NEZHA_COMMON_STRUCT_H\n#include <openssl/sha.h>\n#include <stdio.h>\n#include <stdl"
  },
  {
    "path": "lib/common_type.h",
    "chars": 1268,
    "preview": "#ifndef NEZHA_COMMON_TYPE_H\n#define NEZHA_COMMON_TYPE_H\n\n/** We currently only support UDP endpoint, and GRPC endpoint w"
  },
  {
    "path": "lib/endpoint.cc",
    "chars": 1465,
    "preview": "#include \"lib/endpoint.h\"\n\nEndpoint::Endpoint(const std::string& sip, const int sport,\n                   const bool isM"
  },
  {
    "path": "lib/endpoint.h",
    "chars": 3223,
    "preview": "#ifndef NEZHA_ENDPOINT_H\n#define NEZHA_ENDPOINT_H\n\n#include <arpa/inet.h>\n#include <ev.h>\n#include <fcntl.h>\n#include <g"
  },
  {
    "path": "lib/message_handler.h",
    "chars": 2544,
    "preview": "\n#ifndef NEZHA_MESSAGE_HANDLER_H\n#define NEZHA_MESSAGE_HANDLER_H\n\n#include <arpa/inet.h>\n#include <ev.h>\n#include <fcntl"
  },
  {
    "path": "lib/message_type.cc",
    "chars": 714,
    "preview": "#include \"lib/message_type.h\"\n\n\n\nnamespace MessageType {\n    char CLIENT_REQUEST = 1;\n    char LEADER_REQUEST = 2;\n    c"
  },
  {
    "path": "lib/message_type.h",
    "chars": 1167,
    "preview": "#include <stdint.h>\n#ifndef NEZHA_MESSAGE_TYPE_H\n#define NEZHA_MESSAGE_TYPE_H\n\n#define CONCURRENT_MAP_START_INDEX (2u)\n#"
  },
  {
    "path": "lib/timer.h",
    "chars": 1753,
    "preview": "#ifndef NEZHA_TIMER_\n#define NEZHA_TIMER_\n\n#include <arpa/inet.h>\n#include <ev.h>\n#include <fcntl.h>\n#include <glog/logg"
  },
  {
    "path": "lib/udp_socket_endpoint.cc",
    "chars": 3206,
    "preview": "#include \"lib/udp_socket_endpoint.h\"\n\nUDPSocketEndpoint::UDPSocketEndpoint(const std::string& ip, const int port,\n      "
  },
  {
    "path": "lib/udp_socket_endpoint.h",
    "chars": 726,
    "preview": "#ifndef NEZHA_UDP_SOCKET_SENDER_H\n#define NEZHA_UDP_SOCKET_SENDER_H\n\n#include \"lib/endpoint.h\"\n\nclass UDPSocketEndpoint "
  },
  {
    "path": "lib/utils.cc",
    "chars": 1514,
    "preview": "#include \"lib/utils.h\"\n\nSHA_HASH CalculateHash(uint64_t deadline, uint64_t reqKey) {\n  SHA_HASH hash;\n  const uint32_t c"
  },
  {
    "path": "lib/utils.h",
    "chars": 1663,
    "preview": "#ifndef NEZHA_UTILS_H\n#define NEZHA_UTILS_H\n\n#include <arpa/inet.h>\n#include <ev.h>\n#include <glog/logging.h>\n#include <"
  },
  {
    "path": "lib/zipfian.h",
    "chars": 7324,
    "preview": "/*\n * MIT License\n *\n * Copyright (c) 2017 Lucas Lersch\n *\n * Permission is hereby granted, free of charge, to any perso"
  },
  {
    "path": "license.md",
    "chars": 1072,
    "preview": "MIT License\n\nCopyright (c) 2022-2024 Jinkun Geng\n\nPermission is hereby granted, free of charge, to any person obtaining "
  },
  {
    "path": "micro-bench/BUILD",
    "chars": 1233,
    "preview": "cc_binary(\n    name = \"bench_sender\",\n    srcs = [\"bench_sender.cc\"],\n    deps = [\n        \"//proto:nezha_cc_proto\",\n   "
  },
  {
    "path": "micro-bench/analysis.cc",
    "chars": 2197,
    "preview": "#include <fstream>\n#include <iostream>\n#include \"lib/utils.h\"\n#include \"lib/zipfian.h\"\n#include \"proto/nezha_proto.pb.h\""
  },
  {
    "path": "micro-bench/bench_receiver.cc",
    "chars": 5210,
    "preview": "#include <fstream>\n#include <iostream>\n#include \"lib/utils.h\"\n#include \"lib/zipfian.h\"\n#include \"proto/nezha_proto.pb.h\""
  },
  {
    "path": "micro-bench/bench_sender.cc",
    "chars": 4728,
    "preview": "#include <fstream>\n#include <iostream>\n#include \"lib/utils.h\"\n#include \"lib/zipfian.h\"\n#include \"proto/nezha_proto.pb.h\""
  },
  {
    "path": "micro-bench/launch_micro.py",
    "chars": 20567,
    "preview": "import os\nimport subprocess\nfrom subprocess import PIPE, Popen\nimport time\nimport ruamel.yaml\nfrom termcolor import colo"
  },
  {
    "path": "proto/BUILD",
    "chars": 297,
    "preview": "load(\"@rules_proto//proto:defs.bzl\", \"proto_library\")\n\n\nproto_library(\n    name = \"nezha_proto\",\n    srcs = [\"nezha_prot"
  },
  {
    "path": "proto/nezha_proto.proto",
    "chars": 3298,
    "preview": "syntax = \"proto3\";\npackage nezha.proto;\n\n\nmessage Request {\n    uint64 sendtime = 1;\n    uint32 bound=2;\n    uint32 clie"
  },
  {
    "path": "proxy/BUILD",
    "chars": 474,
    "preview": "cc_library(\n    name = \"proxy_config\",\n    hdrs = [\"proxy_config.h\"],\n    deps = [\n        \"@com_github_jbeder_yaml_cpp/"
  },
  {
    "path": "proxy/proxy.cc",
    "chars": 20088,
    "preview": "#include \"proxy/proxy.h\"\n\nnamespace nezha {\nProxy::Proxy(const std::string& configFile) {\n  std::string error = proxyCon"
  },
  {
    "path": "proxy/proxy.h",
    "chars": 6891,
    "preview": "#include <yaml-cpp/yaml.h>\n#include <fstream>\n#include \"lib/utils.h\"\n#include \"proto/nezha_proto.pb.h\"\n#include \"proxy_c"
  },
  {
    "path": "proxy/proxy_config.h",
    "chars": 2188,
    "preview": "#include <glog/logging.h>\n#include <stdint.h>\n#include <yaml-cpp/yaml.h>\n#include <string>\n#include <vector>\n\nstruct Pro"
  },
  {
    "path": "proxy/proxy_run.cc",
    "chars": 477,
    "preview": "#include \"proxy/proxy.h\"\nDEFINE_string(config, \"nezhav2/config/nezha-proxy-config-0.yaml\", \"The config file for the prox"
  },
  {
    "path": "replica/BUILD",
    "chars": 604,
    "preview": "cc_library(\n    name = \"replica_config\",\n    hdrs = [\"replica_config.h\"],\n    deps = [\n        \"@com_github_jbeder_yaml_"
  },
  {
    "path": "replica/replica.cc",
    "chars": 106717,
    "preview": "#include \"replica/replica.h\"\n\nnamespace nezha {\n// #define GJK_DEBUG\n#ifdef GJK_DEBUG\n#define ASSERT(x) assert(x)\n#else\n"
  },
  {
    "path": "replica/replica.h",
    "chars": 23946,
    "preview": "#ifndef NEZHA_REPLICA_H\n#define NEZHA_REPLICA_H\n\n#include <yaml-cpp/yaml.h>\n#include <boost/uuid/uuid.hpp>\n#include <boo"
  },
  {
    "path": "replica/replica_config.h",
    "chars": 4799,
    "preview": "#include <glog/logging.h>\n#include <stdint.h>\n#include <yaml-cpp/yaml.h>\n#include <string>\n#include <vector>\n\nstruct Rep"
  },
  {
    "path": "replica/replica_run.cc",
    "chars": 726,
    "preview": "#include \"replica/replica.h\"\nDEFINE_string(config, \"nezhav2/config/nezha-replica-config-0.yaml\", \"The config file for th"
  },
  {
    "path": "scripts/analysis.py",
    "chars": 4250,
    "preview": "import pandas as pd\nfrom IPython import embed; \nimport argparse\nimport datetime\n\nLOGIN_PATH = \"/home/steam1994\"\nFAST_REP"
  },
  {
    "path": "scripts/launch.py",
    "chars": 22596,
    "preview": "import os\nimport subprocess\nfrom subprocess import PIPE, Popen\nimport time\nimport ruamel.yaml\nfrom termcolor import colo"
  },
  {
    "path": "scripts/local_test.sh",
    "chars": 1368,
    "preview": "#!/bin/bash\nexport FLAGS_alsologtostderr=1\n\necho \"Launching replica 0...\"\n(./bazel-bin/replica/nezha_replica --config ./"
  },
  {
    "path": "scripts/ttcs-agent.cfg",
    "chars": 258,
    "preview": "management_address: \"10.128.2.15\"\nlog_dir: \"/var/opt/ttcs/log\"\nsubscription_mode: true\ncoordinator_address: \"c-gjk1994gj"
  },
  {
    "path": "third_party/concurrentqueue/BUILD.bazel",
    "chars": 136,
    "preview": "cc_library(\n    name = \"concurrentqueue\", \n    srcs = [\"concurrentqueue.h\"],\n    visibility = [\n        \"//visibility:pu"
  },
  {
    "path": "third_party/glog/BUILD.bazel",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "third_party/glog/BUILD.glog",
    "chars": 93,
    "preview": "licenses(['notice'])\n\nload('@//third_party/glog:glog.bzl', 'glog_library')\n\nglog_library('')\n"
  },
  {
    "path": "third_party/glog/glog.bzl",
    "chars": 4820,
    "preview": "\"\"\"glog library build rule.\"\"\"\n\nload(\"@rules_cc//cc:defs.bzl\", \"cc_library\")\n\ndef glog_library(name, namespace = \"google"
  },
  {
    "path": "third_party/junction/BUILD.bazel",
    "chars": 377,
    "preview": "load(\"@rules_foreign_cc//foreign_cc:cmake.bzl\", \"cmake\")\n\nfilegroup(\n    name = \"all_srcs\",\n    srcs = glob([\"**\"]),\n   "
  },
  {
    "path": "third_party/junction/junction.patch",
    "chars": 743,
    "preview": "diff --git a/CMakeLists.txt b/CMakeLists.txt\nindex 93cf495..686aa50 100644\n--- a/CMakeLists.txt\n+++ b/CMakeLists.txt\n@@ "
  },
  {
    "path": "third_party/libev/BUILD.bazel",
    "chars": 285,
    "preview": "load(\"@rules_foreign_cc//foreign_cc:configure.bzl\", \"configure_make\")\n\nfilegroup(\n    name = \"all_srcs\",\n    srcs = glob"
  },
  {
    "path": "third_party/openssl/BUILD.bazel",
    "chars": 450,
    "preview": "load(\"@rules_foreign_cc//foreign_cc:configure.bzl\", \"configure_make\")\n\nfilegroup(\n    name = \"all_srcs\",\n    srcs = glob"
  },
  {
    "path": "third_party/turf/BUILD.bazel",
    "chars": 161,
    "preview": "load(\"@rules_foreign_cc//foreign_cc:cmake.bzl\", \"cmake\")\n\nfilegroup(\n    name = \"all_srcs\",\n    srcs = glob([\"**\"]),\n   "
  },
  {
    "path": "ttcs-agent.cfg",
    "chars": 258,
    "preview": "management_address: \"10.128.2.13\"\nlog_dir: \"/var/opt/ttcs/log\"\nsubscription_mode: true\ncoordinator_address: \"c-gjk1994gj"
  }
]

About this extraction

This page contains the full source code of the Steamgjk/Nezha GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 78 files (416.5 KB), approximately 112.4k tokens, and a symbol index with 102 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.

Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.

Copied to clipboard!