Repository: Steamgjk/Nezha
Branch: main
Commit: 8db31f04af48
Files: 78
Total size: 416.5 KB

Directory structure:
gitextract_rknzum1x/

├── .github/
│   └── workflows/
│       └── build.yaml
├── .gitignore
├── .vscode/
│   └── settings.json
├── README.md
├── WORKSPACE
├── client/
│   ├── BUILD
│   ├── client.cc
│   ├── client.h
│   ├── client_config.h
│   └── client_run.cc
├── configs/
│   ├── dist/
│   │   ├── nezha-client-config.yaml
│   │   ├── nezha-proxy-config.yaml
│   │   ├── nezha-replica-config-0.yaml
│   │   ├── nezha-replica-config-1.yaml
│   │   ├── nezha-replica-config-2.yaml
│   │   └── nezha-replica-config.yaml
│   ├── local/
│   │   ├── nezha-client-config.yaml
│   │   ├── nezha-proxy-config.yaml
│   │   ├── nezha-replica-config-0.yaml
│   │   ├── nezha-replica-config-1.yaml
│   │   └── nezha-replica-config-2.yaml
│   ├── nezha-client-config-template.yaml
│   ├── nezha-proxy-config-template.yaml
│   └── nezha-replica-config-template.yaml
├── docs/
│   ├── Nezha.tla
│   ├── demo.md
│   └── tla-intro.md
├── external/
│   ├── gogoprotobuf.BUILD
│   └── googleapi.BUILD
├── lib/
│   ├── BUILD
│   ├── Rules.mk
│   ├── address.cc
│   ├── address.h
│   ├── common_struct.h
│   ├── common_type.h
│   ├── endpoint.cc
│   ├── endpoint.h
│   ├── message_handler.h
│   ├── message_type.cc
│   ├── message_type.h
│   ├── timer.h
│   ├── udp_socket_endpoint.cc
│   ├── udp_socket_endpoint.h
│   ├── utils.cc
│   ├── utils.h
│   └── zipfian.h
├── license.md
├── micro-bench/
│   ├── BUILD
│   ├── analysis.cc
│   ├── bench_receiver.cc
│   ├── bench_sender.cc
│   └── launch_micro.py
├── proto/
│   ├── BUILD
│   └── nezha_proto.proto
├── proxy/
│   ├── BUILD
│   ├── proxy.cc
│   ├── proxy.h
│   ├── proxy_config.h
│   └── proxy_run.cc
├── replica/
│   ├── BUILD
│   ├── replica.cc
│   ├── replica.h
│   ├── replica_config.h
│   └── replica_run.cc
├── scripts/
│   ├── analysis.py
│   ├── launch.py
│   ├── local_test.sh
│   └── ttcs-agent.cfg
├── third_party/
│   ├── concurrentqueue/
│   │   └── BUILD.bazel
│   ├── glog/
│   │   ├── BUILD.bazel
│   │   ├── BUILD.glog
│   │   └── glog.bzl
│   ├── junction/
│   │   ├── BUILD.bazel
│   │   └── junction.patch
│   ├── libev/
│   │   └── BUILD.bazel
│   ├── openssl/
│   │   └── BUILD.bazel
│   └── turf/
│       └── BUILD.bazel
└── ttcs-agent.cfg

================================================
FILE CONTENTS
================================================

================================================
FILE: .github/workflows/build.yaml
================================================
# Bazel action to build & test specific targets.
#
# The workflow installs a pinned Bazel release (5.2.0) from Bazel's apt
# repository, builds the replica/proxy/client targets, and then runs the
# repository's local test script.
name: Bazel build

# Triggered by pushes to main and by pull requests that target main.
on:
  push:
    branches: [main]
  pull_request:
    branches: [main]

jobs:
  build:
    name: Bazel build and run local test
    # NOTE(review): the runner image is pinned to ubuntu-20.04; confirm that
    # this hosted image is still offered by GitHub before relying on this job.
    runs-on: ubuntu-20.04
    steps:
      - uses: actions/checkout@v3

      # Registers Bazel's signing key and apt source, installs bazel-5.2.0 and
      # renames the versioned binary so that plain `bazel` resolves to it.
      - name: Setup Bazel
        run: |
          sudo apt install -y apt-transport-https curl gnupg
          curl -fsSL https://bazel.build/bazel-release.pub.gpg | gpg --dearmor >bazel-archive-keyring.gpg
          sudo mv bazel-archive-keyring.gpg /usr/share/keyrings
          echo "deb [arch=amd64 signed-by=/usr/share/keyrings/bazel-archive-keyring.gpg] https://storage.googleapis.com/bazel-apt stable jdk1.8" | sudo tee /etc/apt/sources.list.d/bazel.list
          sudo apt update
          sudo apt install -y bazel-5.2.0
          sudo mv /usr/bin/bazel-5.2.0 /usr/bin/bazel

      # Builds every target under the three core modules.
      - name: Build
        run: |
          bazel build //replica/... //proxy/... //client/...
      # Runs scripts/local_test.sh; the meaning of --github is defined by that
      # script (not shown here).
      - name: Run local test
        run: ./scripts/local_test.sh --github


================================================
FILE: .gitignore
================================================
# Build output directories at the repository root (presumably left over from
# the earlier Makefile-based build mentioned in the README; confirm).
/.obj
/.bin
# Bazel output symlinks created at the repository root (e.g. bazel-bin).
/bazel-*


================================================
FILE: .vscode/settings.json
================================================
{
    "C_Cpp.formatting": "clangFormat",
    "C_Cpp.clang_format_fallbackStyle": "{BasedOnStyle: Google, IncludeBlocks: Preserve, DerivePointerAlignment: false, PointerAlignment: Left}",
    "editor.formatOnSave": true,
    "files.associations": {
        "*.inc": "cpp",
        "cctype": "cpp",
        "clocale": "cpp",
        "cmath": "cpp",
        "cstdarg": "cpp",
        "cstddef": "cpp",
        "cstdio": "cpp",
        "cstdlib": "cpp",
        "cstring": "cpp",
        "ctime": "cpp",
        "cwchar": "cpp",
        "cwctype": "cpp",
        "array": "cpp",
        "atomic": "cpp",
        "bit": "cpp",
        "*.tcc": "cpp",
        "bitset": "cpp",
        "chrono": "cpp",
        "cinttypes": "cpp",
        "condition_variable": "cpp",
        "cstdint": "cpp",
        "deque": "cpp",
        "list": "cpp",
        "map": "cpp",
        "set": "cpp",
        "unordered_map": "cpp",
        "unordered_set": "cpp",
        "vector": "cpp",
        "exception": "cpp",
        "algorithm": "cpp",
        "functional": "cpp",
        "iterator": "cpp",
        "memory": "cpp",
        "memory_resource": "cpp",
        "numeric": "cpp",
        "optional": "cpp",
        "random": "cpp",
        "ratio": "cpp",
        "regex": "cpp",
        "string": "cpp",
        "string_view": "cpp",
        "system_error": "cpp",
        "tuple": "cpp",
        "type_traits": "cpp",
        "utility": "cpp",
        "fstream": "cpp",
        "initializer_list": "cpp",
        "iomanip": "cpp",
        "iosfwd": "cpp",
        "iostream": "cpp",
        "istream": "cpp",
        "limits": "cpp",
        "mutex": "cpp",
        "new": "cpp",
        "ostream": "cpp",
        "shared_mutex": "cpp",
        "sstream": "cpp",
        "stdexcept": "cpp",
        "streambuf": "cpp",
        "thread": "cpp",
        "typeinfo": "cpp",
        "csignal": "cpp",
        "any": "cpp",
        "cfenv": "cpp",
        "forward_list": "cpp",
        "future": "cpp",
        "scoped_allocator": "cpp",
        "typeindex": "cpp",
        "valarray": "cpp",
        "variant": "cpp",
        "hash_map": "cpp",
        "hash_set": "cpp",
        "*.ipp": "cpp",
        "csetjmp": "cpp",
        "strstream": "cpp",
        "charconv": "cpp",
        "codecvt": "cpp",
        "complex": "cpp",
        "source_location": "cpp",
        "rope": "cpp",
        "slist": "cpp"
    }
}

================================================
FILE: README.md
================================================
# Nezha

<img src="docs/nezha-img.jpeg" width="200">

----
Nezha (哪吒) is a legendary figure in Chinese mythology. Nezha has 3 heads and 6 arms, so he/she achieves much better fault tolerance than ordinary people :)

PS: We have created [[an FAQ page](https://github.com/Steamgjk/Nezha/wiki)]. Please take a look for a better understanding of Nezha.

## Paper and Presentation
Nezha: Deployable and High-Performance Consensus Using Synchronized Clocks (VLDB version) [[pdf](https://www.vldb.org/pvldb/vol16/p629-geng.pdf)]


Nezha: Deployable and High-Performance Consensus Using Synchronized Clocks(Technical Report) [[pdf](docs/Nezha-technical-report.pdf)]


An early presentation of Nezha was made at [Stanford Platform Lab Winter Review 2022](https://platformlab.stanford.edu/winter-review/platform-lab-winter-review-2022/) [[slides](https://platformlab.stanford.edu/wp-content/uploads/2022/03/Jinkun-Geng.pdf)]


If you find our work helpful to your research or project, we would greatly appreciate it if you could **add a star** to our repo and/or **cite our papers**. The BibTeX entries for the papers are listed below.

```
@article{vldb23-nezha,
author = {Geng, Jinkun and Sivaraman, Anirudh and Prabhakar, Balaji and Rosenblum, Mendel}, 
title = {Nezha: Deployable and High-Performance Consensus Using Synchronized Clocks},
year = {2023},
journal = {Proceedings of the VLDB Endowment},
url = {https://www.vldb.org/pvldb/vol16/p629-geng.pdf},
publisher = {VLDB Endowment},
issn = {2150-8097},
volume = {16},
pages = {629-642},
numpages = {14}
}

@misc{nezha-tech,
  author = {Geng, Jinkun and Sivaraman, Anirudh and Prabhakar, Balaji and Rosenblum, Mendel},
  title = {Nezha: Deployable and High-Performance Consensus Using Synchronized Clocks},
  doi = {10.48550/ARXIV.2206.03285},
  url = {https://arxiv.org/abs/2206.03285},
  publisher = {arXiv},
  year = {2022},
}

```


## Clone Project

```
git clone --depth=1 https://github.com/Steamgjk/Nezha.git
```


## File Structure
The core part includes three modules (folders), i.e., 
- replica
- proxy
- client 

Each module is composed of three files: 
- a header file (e.g., replica.h), 
- a source implementation file (replica.cc), 
- a launching file (e.g., replica_run.cc). 

Each process reads an independent YAML file (e.g., nezha-replica-config-0.yaml) to get its full configuration. Sample configuration files are placed in the `configs` folder.


## Install Bazel

We use Bazel 5.2.0 for building Nezha.

```
# Install bazel 5.2.0
# Please follow the instructions at https://bazel.build/install/ubuntu#install-on-ubuntu, 
# or simply run the following commands

sudo apt install -y apt-transport-https curl gnupg
curl -fsSL https://bazel.build/bazel-release.pub.gpg | gpg --dearmor >bazel-archive-keyring.gpg
sudo mv bazel-archive-keyring.gpg /usr/share/keyrings
echo "deb [arch=amd64 signed-by=/usr/share/keyrings/bazel-archive-keyring.gpg] https://storage.googleapis.com/bazel-apt stable jdk1.8" | sudo tee /etc/apt/sources.list.d/bazel.list
sudo apt update
sudo apt install -y bazel-5.2.0
sudo mv /usr/bin/bazel-5.2.0 /usr/bin/bazel
bazel --version
```

## Build Nezha with Bazel

Since Bazel is becoming popular, we have migrated Nezha from a Makefile-based build system to the Bazel build system. The Bazel version in use is 5.2.0.

```
cd Nezha && bazel build //replica/... //client/... //proxy/...
```


After building the project successfully, the executable files will be generated in the folder named `bazel-bin`


## Single-Machine Tests

Please refer to [the single-machine instructions](docs/demo.md) to run Nezha under various scenarios (view change, request commit, recovery from failure of replica).

## Multi-Machine Tests

We use [scripts/launch.py](scripts/launch.py) to conduct distributed tests across multiple machines. After the tests have completed, [scripts/analysis.py](scripts/analysis.py) is used to analyze the results to generate performance numbers. The current scripts only support Google Cloud Platform (GCP). They require GCP credentials to create and delete VMs on GCP.


## Important Configuration Parameters
### Replica
- ```replica-ips``` must include 2f+1 ips
- ```replica-id``` starts from 0 to 2f
- ```index-transfer-batch```, ```request-key-transfer-batch```, ```request-transfer-batch```. The values of the three <em>batch parameters</em> should be carefully chosen in order not to overflow the [maximum size of UDP packets](https://stackoverflow.com/questions/1098897/what-is-the-largest-safe-udp-packet-size-on-the-internet). 

### Clients
- We support two types of clients, i.e., open-loop clients and closed-loop clients.
- Open-loop clients generate requests according to a Poisson process configured with a specific rate.
- Closed-loop clients use a sliding window protocol to keep a fixed number of requests in flight at any given time, releasing a new request when an old one is completed.
- ```is-openloop```:  When this flag is true, --poission-rate becomes meaningful.
- ```skew-factor``` and key-number decide the workload, which further affects the commutativity optimization.

### Proxy
- ```shard-num``` decides how many threads will be launched. Each shard includes 1 forwarding thread, which forwards client requests to replicas, and 1 replying thread, which receives replies from replicas and performs the quorum check.
- ```max-owd```  is used in the clamping function to estimate one-way delay; more details are described in Sec 4 [Adaptive latency bound] of the paper.

## Performance Benchmark
Refer to [our paper](https://arxiv.org/pdf/2206.03285.pdf) for the relevant performance stats. Compared with the experimental version, we have refactored the codebase with some higher-performance libraries (e.g. libev instead of libevent) and data structures (e.g., ConcurrentMap and ConcurrentQueue). Besides, we have also conducted further optimization with the pipeline. The performance will be somewhat better than the original version used in the paper. New benchmark data will be updated soon. 


## Authors and Acknowledgment
Nezha project is developed and maintained by [Jinkun Geng](https://steamgjk.github.io/) and his three supervisors, i.e., [Prof. Anirudh Sivaraman](https://cs.nyu.edu/~anirudh/), [Prof. Balaji Prabhakar](https://web.stanford.edu/~balaji/) and [Prof. Mendel Rosenblum](http://web.stanford.edu/~mendel/).

We are fortunate to get the help from many researchers during the development of Nezha. Below we list and acknowledge them according to the timeline.

[Dr. Shiyu Liu](https://web.stanford.edu/~shiyuliu/) and [Dr. Feiran Wang](https://www.linkedin.com/in/feiran-wang/) joined the discussion during the early design of Nezha. Feiran explained the details of CRaft and the related correctness properties.  Shiyu explained the principles of Huygens and the other clock sync solutions.

[Prof. Dan Ports](https://drkp.net/), [Prof. Jialin Li](https://www.comp.nus.edu.sg/~lijl/) and [Dr. Ellis Michael](https://ellismichael.com/) provided helpful discussion related to Speculative Paxos and NOPaxos. Dan also gave us the pointer to crash vector and diskless recovery. 

[Prof. Jinyang Li](http://www.news.cs.nyu.edu/~jinyang/) listened to our early presentation of Nezha, and gave some useful feedback.

[Prof. Seo Jin Park](https://seojinpark.net/) discussed with us about the definition of linearizability and other correctness properties. Seo Jin also provided some explanation about CURP.

[Prof. Zhaoguo Wang](https://ipads.se.sjtu.edu.cn/pub/members/zhaoguo_wang) shared with us his experience in testing Raft.

The [Derecho team](https://derecho-project.github.io/) (Prof. Ken Birman, Dr. Weijia Song, Dr. Sagar Jha, Dr. Lorenzo Rosa, etc) offered technical support and discussion during our measurement of Derecho.

The [ClockWork](https://www.clockwork.io/) Staff (Dr. Yilong Geng and Dr. Deepak Merugu) offered technical support in deploying Huygens. Dr. Deepak Merugu also gave suggestions on the coding-styles of Nezha codebase. Katie Gioioso provided feedback on Nezha design. Bhagirath Mehta participated in the single-machine test of Nezha.

[Prof. Eugene Wu](http://www.cs.columbia.edu/~ewu/) provided suggestions on the revision of Nezha paper.

[Prof. Aurojit Panda](https://cs.nyu.edu/~apanda/) discussed with us about Nezha's correctness during leader change. Aurojit reviewed our draft and offered some constructive suggestions on the revision.

The [Raft community](https://groups.google.com/u/1/g/raft-dev/c/SmnAvZMufB0) offered much insightful discussion for us. Many community members discussed with us and helped to justify our design decisions about Nezha.


## License
Please refer to [license.md](license.md)

## Future Plan

(1) Conduct more functionality and performance tests to make Nezha more robust and optimized

(2) Replace [the etcd backend for Kubernetes](https://learnk8s.io/etcd-kubernetes) to boost the performance of Kubernetes.


================================================
FILE: WORKSPACE
================================================
load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository", "new_git_repository")

# Protocol-buffer build rules, pinned to release 4.0.0-3.20.0. The archive must
# be declared before its repositories.bzl can be loaded below.
http_archive(
    name = "rules_proto",
    sha256 = "e017528fd1c91c5a33f15493e3a398181a9e821a804eb7ff5acdd1d2d6c2b18d",
    strip_prefix = "rules_proto-4.0.0-3.20.0",
    urls = [
        "https://github.com/bazelbuild/rules_proto/archive/refs/tags/4.0.0-3.20.0.tar.gz",
    ],
)
load("@rules_proto//proto:repositories.bzl", "rules_proto_dependencies", "rules_proto_toolchains")
# Register the dependencies and toolchains that rules_proto itself needs.
rules_proto_dependencies()
rules_proto_toolchains()


# gRPC, pinned to v1.42.0, followed by the two macros that pull in its own
# transitive dependencies.
http_archive(
    name = "com_github_grpc_grpc",
    sha256 = "9f387689b7fdf6c003fd90ef55853107f89a2121792146770df5486f0199f400",
    urls = [
        "https://github.com/grpc/grpc/archive/refs/tags/v1.42.0.zip",
    ],
    strip_prefix = "grpc-1.42.0",
)
load("@com_github_grpc_grpc//bazel:grpc_deps.bzl", "grpc_deps")
grpc_deps()
load("@com_github_grpc_grpc//bazel:grpc_extra_deps.bzl", "grpc_extra_deps")
grpc_extra_deps()


# googleapis common protos (tag common-protos-1_3_1), built with the BUILD file
# kept in //external.
http_archive(
    name = "googleapi",
    build_file = "@//external:googleapi.BUILD",
    sha256 = "3ff2365822fb573cb1779ada5c2ac7899269cacd0836aef95ffe9d95779031f2",
    strip_prefix = "googleapis-common-protos-1_3_1/",
    url = "https://github.com/googleapis/googleapis/archive/refs/tags/common-protos-1_3_1.zip",
)

# etcd sources (v3.5.0).
# NOTE(review): the repository's directory listing shows no external/etcd.BUILD,
# so resolving @etcd would presumably fail; confirm whether this archive is
# still needed or whether the BUILD file is missing.
http_archive(
    name = "etcd",
    build_file = "@//external:etcd.BUILD",
    sha256 = "580ce584dc7628efebb57f8c8240674918d334ad21e33186bbc5f6348f465bc1",
    strip_prefix = "etcd-3.5.0/",
    url = "https://github.com/etcd-io/etcd/archive/refs/tags/v3.5.0.zip",
)

# gogo/protobuf sources (v1.3.2), built with the BUILD file kept in //external.
http_archive(
    name = "gogoprotobuf",
    build_file = "@//external:gogoprotobuf.BUILD",
    sha256 = "f89f8241af909ce3226562d135c25b28e656ae173337b3e58ede917aa26e1e3c",
    strip_prefix = "protobuf-1.3.2/",
    url = "https://github.com/gogo/protobuf/archive/refs/tags/v1.3.2.zip",
)

# yaml-cpp: YAML parser used to read the per-process configuration files.
git_repository(
    name = "com_github_jbeder_yaml_cpp",
    commit = "fcbb8193b94921e058be7b563aea053531e5b2d9",  # 19-Aug-2023
    remote = "https://github.com/jbeder/yaml-cpp.git",
    shallow_since = "1692473776 -0400",
)

# concurrentqueue, built with the BUILD file under //third_party.
new_git_repository(
    name = "com_github_cameron314_concurrentqueue",
    build_file = "//third_party/concurrentqueue:BUILD.bazel",
    commit = "6dd38b8a1dbaa7863aa907045f32308a56a6ff5d",
    remote = "https://github.com/cameron314/concurrentqueue.git",
    shallow_since = "1686439287 -0400",
)

# junction, with a local patch (applied with -p1) on top of the pinned commit.
new_git_repository(
    name = "com_github_preshing_junction",
    build_file = "//third_party/junction:BUILD.bazel",
    commit = "5ad3be7ce1d3f16b9f7ed6065bbfeacd2d629a08",
    patch_args = ["-p1"],
    patches = ["//third_party/junction:junction.patch"],
    remote = "https://github.com/preshing/junction",
    shallow_since = "1518982100 -0500",
)

# turf (presumably required by junction; confirm against the junction BUILD
# file).
new_git_repository(
    name = "com_github_preshing_turf",
    build_file = "//third_party/turf:BUILD.bazel",
    commit = "9ae0d4b984fa95ed5f823274b39c87ee742f6650",
    remote = "https://github.com/preshing/turf",
    shallow_since = "1484317994 -0500",
)

# libev event loop, built with the BUILD file under //third_party.
new_git_repository(
    name = "com_github_enki_libev",
    build_file = "//third_party/libev:BUILD.bazel",
    commit = "93823e6ca699df195a6c7b8bfa6006ec40ee0003",
    remote = "https://github.com/enki/libev.git",
    shallow_since = "1463172876 -0700",
)

# Google gflags, pinned to a fixed commit and built with its upstream Bazel
# files.
git_repository(
    name = "com_github_gflags_gflags",
    commit = "e171aa2d15ed9eb17054558e0b3a6a413bb01067",  # 11-Nov-2018
    remote = "https://github.com/gflags/gflags.git",
    shallow_since = "1541971260 +0000",
)

# Google glog, pinned to a fixed commit and built with the local
# //third_party/glog:BUILD.glog file instead of an upstream one.
new_git_repository(
    name = "com_github_google_glog",
    build_file = "//third_party/glog:BUILD.glog",
    commit = "ba8a9f6952d04d1403b97df24e6836227751454e",  # 7-May-2019
    remote = "https://github.com/google/glog.git",
    # Shallow since doesn't work here for some weird reason. See
    # https://github.com/bazelbuild/bazel/issues/10292
    # shallow_since = "1557212520 +0000",
)

# Google protobuf, pinned to the commit tagged v3.20.1.
git_repository(
    name = "com_google_protobuf",
    commit = "21027a27c4c2ec1000859ccbcfff46d83b16e1ed",  # 21-Apr-2022, v3.20.1
    remote = "https://github.com/protocolbuffers/protobuf",
    shallow_since = "1650589240 +0000",
)

# rules_foreign_cc, pinned to a fixed commit archive (presumably used by the
# //third_party/openssl build file; confirm there).
http_archive(
    name = "rules_foreign_cc",
    sha256 = "2a8000ce03dd9bb324bc9bb7f1f5d01debac406611f4d9fedd385192718804f0",
    strip_prefix = "rules_foreign_cc-60813d57a0e99be1a009c1a0e9627cdbe81fcd19",
    url = "https://github.com/bazelbuild/rules_foreign_cc/archive/60813d57a0e99be1a009c1a0e9627cdbe81fcd19.tar.gz",
)

load("@rules_foreign_cc//foreign_cc:repositories.bzl", "rules_foreign_cc_dependencies")

rules_foreign_cc_dependencies()

# OpenSSL 1.1.1d sources, built with the local //third_party/openssl BUILD file.
http_archive(
    name = "openssl",
    build_file = "//third_party/openssl:BUILD.bazel",
    sha256 = "23011a5cc78e53d0dc98dfa608c51e72bcd350aa57df74c5d5574ba4ffb62e74",
    strip_prefix = "openssl-OpenSSL_1_1_1d",
    urls = ["https://github.com/openssl/openssl/archive/OpenSSL_1_1_1d.tar.gz"],
)

# rules_boost, pinned to a fixed commit; boost_deps() declares the Boost
# repositories it provides.
http_archive(
    name = "com_github_nelhage_rules_boost",
    url = "https://github.com/nelhage/rules_boost/archive/96e9b631f104b43a53c21c87b01ac538ad6f3b48.tar.gz",
    strip_prefix = "rules_boost-96e9b631f104b43a53c21c87b01ac538ad6f3b48",
    sha256 = "5ea00abc70cdf396a23fb53201db19ebce2837d28887a08544429d27783309ed",
)
load("@com_github_nelhage_rules_boost//:boost/boost.bzl", "boost_deps")
boost_deps()


================================================
FILE: client/BUILD
================================================
load("@rules_proto//proto:defs.bzl", "proto_library")

# Header-only library exposing the client configuration parser; it depends on
# yaml-cpp.
cc_library(
    name = "client_config",
    hdrs = ["client_config.h"],
    deps = [
        "@com_github_jbeder_yaml_cpp//:yaml-cpp",
    ],
)

# The Client class itself (client.h / client.cc).
cc_library(
    name = "client_class",
    srcs = ["client.cc"],
    hdrs = ["client.h"],
    deps = [
        "//proto:nezha_cc_proto",
        "//lib:zipfian",
        "//lib:utils",
        ":client_config",
    ],
)


# Client executable; client_run.cc is the launching file for the Client class.
cc_binary(
    name = "nezha_client",
    srcs = ["client_run.cc"],
    deps = [
        ":client_class",
    ],
)


================================================
FILE: client/client.cc
================================================
#include "client/client.h"

namespace nezha {
/**
 * Construct a client from the YAML configuration file `configFile`.
 *
 * The constructor
 *  - loads the configuration and terminates the process (exit(1)) if it cannot
 *    be parsed or if it describes no proxy destination at all,
 *  - creates the request endpoint, the reply handler and the monitor timer,
 *  - builds the two-dimensional proxy address table
 *    (proxyAddrs_[proxy][shard]),
 *  - pre-generates the Poisson trace (open-loop clients only) and the zipfian
 *    key sequence, both seeded with the client id,
 *  - resets the request/commit counters.
 */
Client::Client(const std::string& configFile) {
  hop3s.reserve(500000);
  hop4s.reserve(500000);
  totals.reserve(500000);

  LOG(INFO) << "Loading config information from " << configFile;
  std::string error = clientConfig_.parseConfig(configFile);
  if (!error.empty()) {
    LOG(ERROR) << "Error loading client config: " << error << " Exiting.";
    exit(1);
  }
  // The submission threads index proxyAddrs_[0] and take a modulo by
  // (#proxies * #shards); an empty proxy list or a shard number below 1 would
  // make both operations undefined, so reject such a configuration up front.
  if (clientConfig_.proxyIps.size() == 0 || clientConfig_.proxyShardNum < 1) {
    LOG(ERROR) << "Error loading client config: at least one proxy ip and a "
                  "positive proxy shard number are required. Exiting.";
    exit(1);
  }

  clientId_ = clientConfig_.clientId;
  LOG(INFO) << "clientId=" << clientId_;
  std::string clientIP = clientConfig_.clientIp;
  LOG(INFO) << "clientIP=" << clientIP;
  int requestPort = clientConfig_.requestPort;
  LOG(INFO) << "requestPort=" << requestPort;
  LOG(INFO) << "endPointType=" << clientConfig_.endpointType;
  requestEP_ =
      CreateEndpoint(clientConfig_.endpointType, clientIP, requestPort, true);
  // Incoming messages are forwarded to ReceiveReply on this client instance.
  replyHandler_ = CreateMsgHandler(
      clientConfig_.endpointType,
      [](MessageHeader* msgHdr, char* msgBuffer, Address* sender, void* ctx) {
        static_cast<Client*>(ctx)->ReceiveReply(msgHdr, msgBuffer, sender);
      },
      this);

  // The monitor timer breaks the endpoint's event loop once running_ has been
  // cleared, which lets the reply-processing thread terminate.
  monitorTimer_ = new Timer(
      [](void* ctx, void* receiverEP) {
        if (static_cast<Client*>(ctx)->running_ == false) {
          static_cast<Endpoint*>(receiverEP)->LoopBreak();
        }
      },
      10 /*Checks the status every 10ms*/, this);

  /** Fetch the addresses of all proxies and organize them as a two-dimensional
   * vector: the first index is the proxy, the second index is the shard. Shard
   * j of every proxy uses port proxyRequestPortBase + j. */
  proxyAddrs_.resize(clientConfig_.proxyIps.size());
  for (uint32_t i = 0; i < proxyAddrs_.size(); i++) {
    proxyAddrs_[i].resize(clientConfig_.proxyShardNum);
    for (uint32_t j = 0; j < proxyAddrs_[i].size(); j++) {
      proxyAddrs_[i][j] = new Address(clientConfig_.proxyIps[i],
                                      clientConfig_.proxyRequestPortBase + j);
    }
  }

  /** If the client is an open-loop client, generate the Poisson trace for the
   * client: 1000 samples, each giving the number of requests to submit in one
   * time unit. */
  if (clientConfig_.isOpenLoop) {
    poissonRate_ = clientConfig_.poissonRate;
    LOG(INFO) << "OpenLoop Client rate=" << poissonRate_;
    poissonTrace_.resize(1000, 0);
    std::default_random_engine generator(clientId_);  // clientId as the seed
    std::poisson_distribution<int> distribution(poissonRate_);
    for (auto& slot : poissonTrace_) {
      const int reqNum = distribution(generator);
      slot = reqNum < 0 ? 0 : reqNum;  // never store a negative count
    }
  }

  /** Generate zipfian workload */
  // NOTE(review): writeRatio_ is read by the submission threads but is not
  // assigned anywhere in this constructor (only clientConfig_.writeRatio is
  // logged) — confirm that it is initialized elsewhere.
  LOG(INFO) << "keyNum=" << clientConfig_.keyNum
            << "\tskewFactor=" << clientConfig_.skewFactor
            << "\twriteRatio=" << clientConfig_.writeRatio;
  zipfianKeys_.resize(1000000, 0);
  retryTimeoutUs_ = clientConfig_.requestRetryTimeUs;
  // With a single key (or none) every entry simply stays 0.
  if (clientConfig_.keyNum > 1) {
    std::default_random_engine generator(clientId_);  // clientId as the seed
    zipfian_int_distribution<uint32_t> zipfianDistribution(
        0, clientConfig_.keyNum - 1, clientConfig_.skewFactor);
    for (auto& key : zipfianKeys_) {
      key = zipfianDistribution(generator);
    }
  }

  /** Initialize the request/commit bookkeeping; request ids start at 1. */
  committedReqId_ = 0;
  reclaimedReqId_ = 0;
  nextReqId_ = 1;
  retryNumber_ = 0;
  committedNum_ = 0;
  fastCommitNum_ = 0;
  fastWriteNum_ = 0;
}

/**
 * Mark the client as running, start all worker threads and block until every
 * one of them has finished.
 */
void Client::Run() {
  running_ = true;
  LaunchThreads();
  for (auto it = threadPool_.begin(); it != threadPool_.end(); ++it) {
    const auto& threadName = it->first;
    LOG(INFO) << "Join " << threadName;
    it->second->join();
    LOG(INFO) << "Join Complete " << threadName;
  }
  LOG(INFO) << "Run Terminated ";
}

/**
 * Start the worker threads and register them in threadPool_ by name: the
 * logging thread, the reply-processing thread, and exactly one submission
 * thread (open-loop or closed-loop, depending on the configuration).
 */
void Client::LaunchThreads() {
  threadPool_["LogTd"] = new std::thread(&Client::LogTd, this);
  threadPool_["ProcessReplyTd"] =
      new std::thread(&Client::ProcessReplyTd, this);

  if (!clientConfig_.isOpenLoop) {
    LOG(INFO) << "ClosedLoop Client";
    threadPool_["CloseLoopSubmissionTd"] =
        new std::thread(&Client::CloseLoopSubmissionTd, this);
    return;
  }

  LOG(INFO) << "OpenLoop Client";
  threadPool_["OpenLoopSubmissionTd"] =
      new std::thread(&Client::OpenLoopSubmissionTd, this);
}

/**
 * Body of the reply-processing thread.
 *
 * Attaches the reply handler and the monitor timer to the request endpoint and
 * then hands control to the endpoint's loop, so that replies are processed in
 * an event-driven fashion (the registered handler is called when a message
 * comes). The function returns once LoopRun() returns.
 */
void Client::ProcessReplyTd() {
  auto& endpoint = *requestEP_;
  endpoint.RegisterMsgHandler(replyHandler_);
  endpoint.RegisterTimer(monitorTimer_);
  LOG(INFO) << "Loop Run ";
  endpoint.LoopRun();
  LOG(INFO) << "Loop Run Exit ";
}

/**
 * Handle one message delivered to the request endpoint.
 *
 * Only well-formed COMMIT_REPLY messages are considered. Every such reply
 * updates the commit statistics and, if its request id is newer, the highest
 * committed request id. If the request is still tracked in
 * outstandingRequestSendTime_, a LogInfo record is produced for the logging
 * thread and the request is removed from that map.
 *
 * @param msgHdr    header describing the type and length of the message
 * @param msgBuffer serialized message payload
 * @param sender    address of the sender (not used here)
 */
void Client::ReceiveReply(MessageHeader* msgHdr, char* msgBuffer,
                          Address* sender) {
  if (msgHdr->msgLen < 0) {
    return;
  }
  if (msgHdr->msgType != MessageType::COMMIT_REPLY) {
    return;
  }
  Reply reply;
  if (!reply.ParseFromArray(msgBuffer, msgHdr->msgLen)) {
    return;
  }

  // Statistics: total commits, fast-path commits, fast-path write commits.
  committedNum_++;
  const bool isFastReply = reply.replytype() == MessageType::FAST_REPLY;
  if (isFastReply) {
    fastCommitNum_++;
  }
  if (isFastReply && reply.iswrite()) {
    fastWriteNum_++;
  }

  const auto reqId = reply.reqid();
  if (committedReqId_ < reqId) {
    committedReqId_ = reqId;
  }

  uint64_t sendTime = outstandingRequestSendTime_.get(reqId);
  if (sendTime == 0) {
    // No send time is recorded for this request id, so there is nothing to
    // log or to erase.
    return;
  }

  /**
   * The request is still in outstandingRequestSendTime_, i.e., it has not been
   * marked as committed yet. Build its log record, erase it from the map, and
   * pass the record to logQu_, which is consumed by LogTd.
   */
  uint64_t recvTime = GetMicrosecondTimestamp();
  LogInfo* record = new LogInfo();
  lastCommittedReqId_ = reqId;
  *record = {reqId, sendTime, recvTime, reply.replytype()};
  outstandingRequestSendTime_.erase(reqId);
  logQu_.enqueue(record);
}

void Client::OpenLoopSubmissionTd() {
  int roundRobinIdx = 0;
  uint64_t startTime = GetMicrosecondTimestamp();
  uint64_t endTime = startTime + clientConfig_.durationSec * 1000000;

  srandom(clientId_);
  endTime += 10 * 1000ul * 1000ul;
  LOG(INFO) << "Expected to end at " << endTime;
  // Poisson rate is ``10ms as one unit''
  for (uint32_t i = 0; i < clientConfig_.durationSec * 100; i++) {
    if (!running_) {
      return;
    }
    if (GetMicrosecondTimestamp() >= endTime) {
      // Client has executed long enough, should terminate
      LOG(INFO) << "Terminating soon...";
      running_ = false;
      return;
    }
    uint32_t reqNum = poissonTrace_[i % poissonTrace_.size()];
    if (reqNum <= 0) {
      usleep(10000);
      continue;
    }
    uint32_t intval = 10000 / reqNum;
    uint64_t startTime = GetMicrosecondTimestamp();
    for (uint32_t j = 0; j < reqNum; j++) {
      while (GetMicrosecondTimestamp() < startTime + j * intval) {
      }
      // Send the request
      uint32_t mapIdx =
          roundRobinIdx % (proxyAddrs_.size() * proxyAddrs_[0].size());
      Request* request = NULL;
      if (retryQu_.try_dequeue(request)) {
        // Retry this request
        Address* roundRobinAddr = proxyAddrs_[mapIdx % proxyAddrs_.size()]
                                             [mapIdx / proxyAddrs_.size()];
        // LOG(INFO) << "Resend " << request->reqid() << "to "
        //           << mapIdx % proxyAddrs_.size() << "\t"
        //           << mapIdx / proxyAddrs_.size();
        requestEP_->SendMsgTo(*roundRobinAddr, *request,
                              MessageType::CLIENT_REQUEST);
        outstandingRequestSendTime_.assign(request->reqid(),
                                           GetMicrosecondTimestamp());
        roundRobinIdx++;
      } else {
        // submit new requests
        request = new Request();
        request->set_clientid(clientId_);
        request->set_reqid(nextReqId_);
        if (random() % 100 < 100 * writeRatio_) {
          request->set_iswrite(true);
        } else {
          request->set_iswrite(false);
        }

        request->set_key(zipfianKeys_[nextReqId_ % zipfianKeys_.size()]);
        // // if (nextReqId_ % 10 == 1 && clientId_ <= 10) {
        // if (clientId_ <= 12) {
        //   if (nextReqId_ % 2 == 1)
        //     request->set_iswrite(true);
        //   else
        //     request->set_iswrite(false);

        //   // request->set_iswrite(true);
        //   // LOG(INFO) << "One Write " << request->key()
        //   //           << " reqId=" << request->reqid();
        // } else {
        //   exit(0);
        // }

        // request->set_key(nextReqId_ % 100000 + 100000 * (clientId_ - 1));
        Address* roundRobinAddr = proxyAddrs_[mapIdx % proxyAddrs_.size()]
                                             [mapIdx / proxyAddrs_.size()];
        // LOG(INFO) << "Sed " << request->reqid() << "to "
        //           << mapIdx % proxyAddrs_.size() << "\t"
        //           << mapIdx / proxyAddrs_.size();
        requestEP_->SendMsgTo(*roundRobinAddr, *request,
                              MessageType::CLIENT_REQUEST);
        outstandingRequests_.assign(request->reqid(), request);
        outstandingRequestSendTime_.assign(request->reqid(),
                                           GetMicrosecondTimestamp());
        nextReqId_++;
        roundRobinIdx++;
      }
    }
  }

  LOG(INFO) << "Terminating soon... after "
            << (endTime - GetMicrosecondTimestamp()) * 1e-6 << " seconds";
  while (GetMicrosecondTimestamp() < endTime) {
    // Client has executed long enough, should terminate
    usleep(1000);
  }
  running_ = false;
}

// Closed-loop submission thread. A new request is created and sent only when
// nextReqId_ == committedReqId_ + 1; otherwise the thread tries to dequeue a
// request from retryQu_ and re-sends it. The target proxy/shard is picked
// round-robin across proxyAddrs_. The loop ends when running_ is cleared or
// once durationSec plus a 10-second grace period has elapsed.
void Client::CloseLoopSubmissionTd() {
  // Unsigned so that wrap-around after many sends is well-defined.
  uint32_t roundRobinIdx = 0;
  uint64_t startTime = GetMicrosecondTimestamp();
  // Widen before multiplying: durationSec is a uint32_t, so the product would
  // otherwise be computed in 32 bits and wrap for runs longer than ~71 minutes.
  uint64_t endTime =
      startTime + static_cast<uint64_t>(clientConfig_.durationSec) * 1000000;
  endTime += 10 * 1000ul * 1000ul;
  LOG(INFO) << "Expected to end at " << endTime;
  // random() below draws from the generator seeded by srandom(), not srand().
  srandom(clientId_);
  while (running_) {
    if (GetMicrosecondTimestamp() >= endTime) {
      // Client has executed long enough, should terminate
      LOG(INFO) << "Terminating soon...";
      running_ = false;
      return;
    }
    Request* request = NULL;
    // Flatten (proxy, shard) into a single index and walk it round-robin.
    uint32_t mapIdx =
        roundRobinIdx % (proxyAddrs_.size() * proxyAddrs_[0].size());
    if (nextReqId_ == committedReqId_ + 1) {
      // submit new request
      request = new Request();
      request->set_clientid(clientId_);
      request->set_reqid(nextReqId_);
      if (random() % 100 < 100 * writeRatio_) {
        request->set_iswrite(true);
      } else {
        request->set_iswrite(false);
      }
      request->set_key(zipfianKeys_[nextReqId_ % zipfianKeys_.size()]);
      Address* roundRobinAddr =
          proxyAddrs_[mapIdx % proxyAddrs_.size()][mapIdx / proxyAddrs_.size()];
      requestEP_->SendMsgTo(*roundRobinAddr, *request,
                            MessageType::CLIENT_REQUEST);
      // Record the request and its send time under its request id.
      outstandingRequests_.assign(request->reqid(), request);
      outstandingRequestSendTime_.assign(request->reqid(),
                                         GetMicrosecondTimestamp());
      nextReqId_++;
      roundRobinIdx++;
    } else {
      if (retryQu_.try_dequeue(request)) {
        // have some requests to retry
        Address* roundRobinAddr = proxyAddrs_[mapIdx % proxyAddrs_.size()]
                                             [mapIdx / proxyAddrs_.size()];
        requestEP_->SendMsgTo(*roundRobinAddr, *request,
                              MessageType::CLIENT_REQUEST);
        outstandingRequestSendTime_.assign(request->reqid(),
                                           GetMicrosecondTimestamp());
        roundRobinIdx++;
      }
    }
  }
  // Reached only when running_ was cleared elsewhere (the in-loop deadline
  // path returns directly). Clamp the remaining time so the unsigned
  // subtraction cannot underflow if the deadline has already passed.
  const uint64_t now = GetMicrosecondTimestamp();
  const uint64_t remainingUs = (now < endTime) ? (endTime - now) : 0;
  LOG(INFO) << "Terminating soon... after " << remainingUs * 1e-6
            << " seconds";
  while (GetMicrosecondTimestamp() < endTime) {
    // Client has executed long enough, should terminate
    usleep(1000);
  }
  running_ = false;
}

void Client::LogTd() {
  LogInfo* log = NULL;
  uint64_t startTime, endTime;
  uint32_t lastSubmitteddReqId = 0;
  uint32_t lastCountCommitedReq = 0;
  uint32_t latencySample = 0;

  std::ofstream ofs("Client-Stats-" + std::to_string(clientId_));
  ofs << "ReqId,SendTime,CommitTime,CommitType" << std::endl;

  startTime = GetMicrosecondTimestamp();
  while (running_) {
    endTime = GetMicrosecondTimestamp();
    if (endTime - startTime >= 5000000) {
      float duration = (endTime - startTime) * 1e-6;
      uint32_t submittedReqNum = nextReqId_ - 1 - lastSubmitteddReqId;
      uint32_t committedReqNum = committedNum_ - lastCountCommitedReq;
      float submissionRate = submittedReqNum / duration;
      float commitRate = committedReqNum / duration;
      lastSubmitteddReqId = nextReqId_ - 1;
      lastCountCommitedReq = committedNum_;
      startTime = endTime;
      LOG(INFO) << "endTime=" << endTime << "\t"
                << "committedNum_ = " << committedNum_ << "\t"
                << "logQuLen =" << logQu_.size_approx() << "\t"
                << "committedReqId_=" << committedReqId_ << "\t"
                << "nextReqId_=" << nextReqId_ << "\t"
                << "lastCommittedReqId_=" << lastCommittedReqId_ << "\t"
                << "submissionRate=" << submissionRate << " req/sec\t"
                << "commitRate=" << commitRate << " req/sec"
                << "\t"
                << "FastCommitRatio=" << fastCommitNum_ * 100.0 / committedNum_
                << "\t"
                << "latency(Sample)=" << latencySample << " us"
                << "\t"
                << "retryNum=" << retryNumber_;

      ofs.flush();
    }
    if (logQu_.try_dequeue(log)) {
      // LOG(INFO) << "committedReqId_=" << committedReqId_ << "\t" << "reqId="
      // << log->reqId;
      while (committedReqId_ + 1 <= log->reqId) {
        if (outstandingRequestSendTime_.get(committedReqId_ + 1) == 0) {
          // this reqId has also been committed (i.e. cannot find its footprint)
          // advance committedReqId;
          committedReqId_++;
        } else {
          break;
        }
      }

      latencySample = log->commitTime - log->sendTime;

      // log stats
      ofs << log->toString() << std::endl;
      delete log;
    }

    // // Check whether any requests need retry
    // for (uint32_t reqId = committedReqId_ + 1; reqId < nextReqId_; reqId++) {
    //   uint64_t sendTime = outstandingRequestSendTime_.get(reqId);
    //   if (sendTime > 0) {
    //     // Find it
    //     if (GetMicrosecondTimestamp() - sendTime > retryTimeoutus_) {
    //       // timeout, should retry
    //       Request* request = outstandingRequests_.get(reqId);
    //       LOG(INFO) << "Timeout Retry " << request->reqid();
    //       outstandingRequestSendTime_.erase(reqId);
    //       retryQu_.enqueue(request);
    //       retryNumber_++;
    //     }
    //   }
    // }

    while (reclaimedReqId_ + 1000 < committedReqId_) {
      // do not reclaim request too aggressive
      // If we reclaim too aggressive, there can be some edge case of dangling
      // request pointer
      Request* request = outstandingRequests_.get(reclaimedReqId_);
      if (request) {
        outstandingRequests_.erase(request->reqid());
        delete request;
      }
      reclaimedReqId_++;
    }
  }
  LOG(INFO) << "The runtime have been terminated, we still need to dump "
            << logQu_.size_approx() << " Logs before exit";

  uint32_t cnt = 0;
  while (logQu_.try_dequeue(log)) {
    // log stats
    ofs << log->toString() << std::endl;
    delete log;
    cnt++;
    if (cnt % 10000 == 0) {
      LOG(INFO) << "Remaining Log Number " << logQu_.size_approx();
      ofs.flush();
    }
  }
  ofs.flush();
  LOG(INFO) << "Dump Finished";
}

// Requests shutdown by clearing the running_ flag, after logging a notice.
void Client::Terminate() {
  LOG(INFO) << "Terminating...";
  running_.store(false);
}

// Releases the std::thread objects stored in threadPool_ and every Request
// still held in outstandingRequests_ for ids in [reclaimedReqId_, nextReqId_].
Client::~Client() {
  for (auto it = threadPool_.begin(); it != threadPool_.end(); ++it) {
    delete it->second;
  }
  for (; reclaimedReqId_ <= nextReqId_; reclaimedReqId_++) {
    Request* leftover = outstandingRequests_.get(reclaimedReqId_);
    if (leftover == NULL) {
      // Nothing stored under this id; move on to the next one.
      continue;
    }
    outstandingRequests_.erase(leftover->reqid());
    delete leftover;
  }
}
}  // namespace nezha

================================================
FILE: client/client.h
================================================
#include <yaml-cpp/yaml.h>
#include <fstream>
#include <iostream>
#include "client_config.h"
#include "lib/utils.h"
#include "lib/zipfian.h"
#include "proto/nezha_proto.pb.h"

namespace nezha {
using namespace nezha::proto;
/** LogInfo is used to dump some performance stats, which can be extended to
 * include more metrics. One LogInfo describes one request. */
struct LogInfo {
  uint32_t reqId;
  uint64_t sendTime;
  uint64_t commitTime;
  uint32_t commitType;
  /** Formats the record as "reqId,sendTime,commitTime,commitType" (no
   * trailing newline). Const-qualified so it can be called on a const
   * LogInfo. */
  std::string toString() const {
    std::string ret =
        (std::to_string(reqId) + "," + std::to_string(sendTime) + "," +
         std::to_string(commitTime) + "," + std::to_string(commitType));
    return ret;
  }
};

/**
 * Refer to client_run.cc: the runnable program only needs to instantiate a
 * client object with a configuration file. Then it calls the Run() method to
 * run and the Terminate() method to stop.
 */
class Client {
 private:
  /** All the configuration parameters for client are included in
   * clientConfig_*/
  ClientConfig clientConfig_;
  /** Each thread is given a unique name (key) and stored in the pool */
  std::map<std::string, std::thread*> threadPool_;
  /** The endpoint used to submit requests to proxies */
  Endpoint* requestEP_;

  /** The message handler used to handle replies (from proxies) */
  struct MessageHandler* replyHandler_;
  /** The timer periodically monitors the status of the client, and breaks the
   * blocking endpoint when the client is about to terminate */
  struct Timer* monitorTimer_;

  /** Flag to Run/Terminate threads */
  std::atomic<bool> running_;

  /** Each client is assigned a unique id */
  int clientId_;

  /** Open-Loop submission related: the client's submission rate follows a
   * Poisson distribution. We use 10ms as the basic interval and generate random
   * numbers with reference to poissonRate_, stored in poissonTrace_. Then the
   * open-loop clients submit poissonTrace_[i] requests in the ith interval.
   *
   * Regarding the definition of open-loop and closed-loop submission, refer to
   * ``evaluation method`` para of Sec 7.1 in our paper
   * */
  int poissonRate_;

  /** The next requestId to be submitted */
  std::atomic<uint32_t> nextReqId_;

  /** Requests whose requestId is less than or equal to committedReqId_ have
   * been committed */
  std::atomic<uint32_t> committedReqId_;

  /** Requests whose requestId is less than or equal to reclaimedReqId_ have
   * been reclaimed (memory freed) */
  std::atomic<uint32_t> reclaimedReqId_;
  /** Per-interval request counts for open-loop submission (see poissonRate_
   * above): poissonTrace_[i] requests are submitted in the ith interval */
  std::vector<uint32_t> poissonTrace_;

  /** To communicate between OpenLoopSubmissionTd/CloseLoopSubmissionTd and
   * LogTd. The LogTd monitors the outstanding requests (i.e. which have been
   * submitted but have not been committed). If some request has not been
   * committed after a certain time, the LogTd will enqueue the request to
   * retryQu_, so that the OpenLoopSubmissionTd/CloseLoopSubmissionTd will
   * retry them.
   *
   * NOTE(review): the timeout scan in LogTd that would enqueue into retryQu_
   * is currently commented out in client.cc -- confirm whether any other
   * producer exists. */
  ConcurrentQueue<Request*> retryQu_;

  /** The addresses of proxies. Since we can have multiple proxies, and each
   * proxy can have multiple shards, we use a two-dimensional vector to store
   * the addresses, i.e., proxyAddrs_[i][j] indicates the address of the jth
   * shard of the ith proxy */
  std::vector<std::vector<Address*>> proxyAddrs_;

  /** To test commutativity, we generate different zipfian workloads and write
   * ratios, i.e., we generate random numbers following the zipfian
   * distribution. These random numbers are stored in zipfianKeys_ and serve as
   * the keys that will be written/read by requests */
  std::vector<uint32_t> zipfianKeys_;

  /** Fraction of requests submitted as writes; the submission threads compare
   * random() % 100 against 100 * writeRatio_ for every new request */
  float writeRatio_;

  /** Those requests which have been submitted but not yet committed (key is the
   * requestId)*/
  ConcurrentMap<uint32_t, Request*> outstandingRequests_;

  /** Record the send time of the requests, together with retryTimeoutUs_, to
   * decide whether the request needs to be retried */
  ConcurrentMap<uint32_t, uint64_t> outstandingRequestSendTime_;

  /** Used by LogTd to monitor outstanding requests. If they cannot be committed
   * within retryTimeoutUs_ (measured in microseconds), they should be retried
   * **/
  uint32_t retryTimeoutUs_;

  /** To communicate between ProcessReplyTd and LogTd */
  ConcurrentQueue<LogInfo*> logQu_;

  /** Performance counters, to show how many requests are retried/committed.
   *
   * NOTE(review): these are plain (non-atomic) integers that LogTd reads for
   * its periodic report; if they are updated from another thread this is a
   * data race -- confirm against the writer. */
  uint32_t retryNumber_;
  uint32_t committedNum_;
  uint32_t fastCommitNum_;
  uint32_t fastWriteNum_;

  /** Stats */
  std::vector<uint32_t> hop3s;
  std::vector<uint32_t> hop4s;
  std::vector<uint32_t> totals;

  /** Launch all the threads, only called once during the lifetime of the
   * client*/
  void LaunchThreads();

  /** Functions whose names end with ``Td`` will be used to instantiate
   * threads.
   *
   * For the client, there are mainly three worker threads running:
   *
   * (1) OpenLoopSubmissionTd/CloseLoopSubmissionTd submits requests. A client
   * can be either an open-loop client or a closed-loop client, but cannot be
   * both.
   *
   * (2) ProcessReplyTd receives and processes the reply messages, and hands
   * the log information to LogTd
   *
   * (3) LogTd dumps logs and also monitors the outstanding requests. If the
   * requests have not been committed after a certain time (retryTimeoutUs_),
   * then LogTd will ask OpenLoopSubmissionTd/CloseLoopSubmissionTd to resubmit
   * this request to proxies
   * */
  void ProcessReplyTd();
  void OpenLoopSubmissionTd();
  void CloseLoopSubmissionTd();
  void LogTd();

  /** The message handler to handle messages from proxies. The function is used
   * to instantiate a replyHandler_ and registered to requestEP_ */
  void ReceiveReply(MessageHeader* msgHdr, char* msgBuffer, Address* sender);

 public:
  /** Client accepts a config file, which contains all the necessary information
   * to instantiate the object, then it can call the Run method
   *  */
  Client(const std::string& configFile = "../configs/nezha-client-config.yaml");
  void Run();
  void Terminate();
  ~Client();

  /** For debug */
  uint64_t lastCommittedReqId_;
  std::vector<uint32_t> ls;
};

}  // namespace nezha

================================================
FILE: client/client_config.h
================================================
#include <glog/logging.h>
#include <stdint.h>
#include <yaml-cpp/yaml.h>
#include <string>
#include <vector>

// Plain holder for the client's settings, filled in from a YAML file by
// parseConfig().
struct ClientConfig {
  int clientId;
  std::string clientIp;
  int endpointType;
  int requestPort;
  // NOTE: proxyMaxOwd and proxyReplyPortBase are not populated by
  // parseConfig(); they keep whatever value the object was created with.
  uint32_t proxyMaxOwd;
  int proxyReplyPortBase;
  bool isOpenLoop;
  int poissonRate;
  uint32_t durationSec;
  int keyNum;
  double skewFactor;
  double writeRatio;
  int requestRetryTimeUs;

  int proxyRequestPortBase;
  std::vector<std::string> proxyIps;
  int proxyShardNum;

  // Parses yaml file configFilename and fills in fields of ClientConfig
  // accordingly. Returns an error message or "" if there are no errors.
  std::string parseConfig(std::string configFilename) {
    YAML::Node config;
    try {
      config = YAML::LoadFile(configFilename);
    } catch (const YAML::BadFile& e) {
      return "Error loading config file:" + e.msg + ".";
    } catch (const YAML::Exception& e) {
      // The file exists but is not well-formed YAML (e.g. ParserException).
      // Report it through the return value like every other failure instead
      // of letting the exception escape.
      return std::string("Error parsing config file: ") + e.what() + ".";
    }
    LOG(INFO) << "Using config:\n " << config;

    std::string key;  // Keep track of current key for better error messages
    try {
      key = "client-id";
      clientId = config[key].as<int>();
      key = "client-ip";
      clientIp = config[key].as<std::string>();
      key = "endpoint-type";
      endpointType = config[key].as<int>();
      key = "request-port";
      requestPort = config[key].as<int>();
      key = "is-openloop";
      isOpenLoop = config[key].as<bool>();
      key = "poisson-rate";
      poissonRate = config[key].as<int>();
      key = "duration-sec";
      durationSec = config[key].as<uint32_t>();
      key = "key-num";
      keyNum = config[key].as<int>();
      key = "skew-factor";
      skewFactor = config[key].as<double>();
      key = "write-ratio";
      writeRatio = config[key].as<double>();
      key = "request-retry-time-us";
      requestRetryTimeUs = config[key].as<int>();

      key = "proxy-ips";
      // Start from a clean list so that calling parseConfig() again does not
      // append duplicates.
      proxyIps.clear();
      for (uint32_t i = 0; i < config[key].size(); i++) {
        proxyIps.push_back(config[key][i].as<std::string>());
      }
      if (proxyIps.empty()) {
        // A missing or empty list would otherwise be accepted silently and
        // leave the client with no proxy to send requests to.
        return "Error parsing config field " + key +
               ": at least one proxy ip is required.";
      }
      key = "proxy-shards";
      proxyShardNum = config[key].as<int>();
      key = "proxy-request-port-base";
      proxyRequestPortBase = config[key].as<int>();

      return "";
    } catch (const YAML::BadConversion& e) {
      if (config[key]) {
        return "Error parsing config field " + key + ": " + e.msg + ".";
      } else {
        return "Error parsing config field " + key + ": key not found.";
      }
    } catch (const std::exception& e) {
      return "Error parsing config field " + key + ": " + e.what() + ".";
    }
  }
};

================================================
FILE: client/client_run.cc
================================================
#include "client/client.h"
DEFINE_string(config, "nezhav2/config/nezha-client-config-0.yaml", "The config file for the client");
// The single client instance; NULL whenever no live client exists.
nezha::Client* client = NULL;
// SIGINT handler: asks the client to stop. The NULL check covers a signal
// that arrives before the client has been constructed.
void Terminate(int para) {
    if (client != NULL) {
        client->Terminate();
    }
}
// Entry point: parses flags, sets up logging, then builds and runs the client
// described by --config until Run() returns.
int main(int argc, char* argv[]) {
    gflags::ParseCommandLineFlags(&argc, &argv, true);
    google::InitGoogleLogging(argv[0]);
    FLAGS_logtostderr = 1;
    signal(SIGINT, Terminate);
    client = new nezha::Client(FLAGS_config);
    client->Run();
    // Stop routing SIGINT to the client before it is destroyed, so a late
    // Ctrl-C cannot touch a dangling pointer.
    signal(SIGINT, SIG_DFL);
    delete client;
    client = NULL;
    return 0;
}

================================================
FILE: configs/dist/nezha-client-config.yaml
================================================
---
print-config: true
proxy-info:
  proxy-ips:
    - "10.128.2.13"
  proxy-shards: 1
  request-port-base: 32000
client-info:
  client-id: 1
  client-ip: "10.128.2.14"
  request-port: 32912
  is-openloop: true
  poisson-rate: 10 # it means the client sends x reqs/10ms on average
  duration-sec: 60 # it means the duration of the client runs (second)
  key-num: 1000000 # when key-num is 1, it means there is no commutativity optimization
  skew-factor: 0.5 # 0-0.99 The higher the zipfian factor is, the more skewed the workload is
  request-retry-time-us: 10000 # After the request is submitted, if we cannot get the response after such long time, then we will retry


================================================
FILE: configs/dist/nezha-proxy-config.yaml
================================================
---
print-config: true
# Replica Info
replica-info:
  replica-ips:
    - "10.128.2.10"
    - "10.128.2.11"
    - "10.128.2.12"
  receiver-shards: 1 # The number of threads to receive requests
  receiver-port: 33333 # The port is exposed to client/proxy and is used to receive client/proxy requests. When receiver-shards>1, the corresponding ports are receiver-ports + shard-index
  initial-owd: 80 # The initial one-way delay (us) between replicas and proxies
# Proxy Info
proxy-info:
  proxy-id: 1
  proxy-ip: "10.128.2.13"
  shard-num: 1
  request-port-base: 32000
  reply-port-base: 33000


================================================
FILE: configs/dist/nezha-replica-config-0.yaml
================================================
---
print-config: true
replica-ips:
  - "10.128.2.10"
  - "10.128.2.11"
  - "10.128.2.12"
replica-id: 0
receiver-shards: 1 # The number of threads to receive requests
process-shards: 1 # The number of threads to process requests. For now process-shards is fixed to 1, because the early-buffer enque/deque is hard to parallelize. Maybe later we can find a high-performant **concurrent priority queue** for early-buffer, then process-shards may be parallelized for higher performance
reply-shards: 1 # The number of threads to send replies (both fast/slow replies)
index-sync-shards: 1 # The number of threads used by the leader to broadcast index synchronization messages to followers. For followers, they only need one such thread to receive and handle the index sync msgs
receiver-port: 33333 # The port is exposed to client/proxy and is used to receive client/proxy requests. When receiver-shards>1, the corresponding ports are receiver-ports + shard-index
index-sync-port: 35333 # The port is used for replicas to communicate index synchronization messages. Typically, the leader sends index sync msgs and the followers receive them
request-ask-port: 36333 # When one follower is missing requests, they ask from the leader or other followers. The leader/followers send the missed requests to this port of this follower
index-ask-port: 37333 # When one follower is missing some index sync msgs, it asks the leader (or other followers) for the missing ones, and the leader (or other followers) send to the index-sync-port of the problematic follower
master-port: 34333 # This port is mainly used to send/receive other messages
monitor-period-ms: 50 # Each blocking thread has a monitor timer, which periodically checks the replica status and unblock the thread (and exit) when replica status is not Normal
main-loop-period-ms: 20
heartbeat-threshold-ms: 500
index-ask-period-ms: 10 # When missing indices, we launch a timer to periodically ask indices
request-ask-period-ms: 10 # When missing logs, we launch a timer to periodically ask logs(requests)
view-change-period-ms: 10
state-transfer-period-ms: 10
state-transfer-timeout-ms: 100000 # If the state-transfer has not been completed after such long time, then rollback to viewchange
index-transfer-max-batch: 30
request-transfer-max-batch: 5
crash-vector-request-period-ms: 50 # make it longer, because it will clear all previous replies
recovery-request-period-ms: 10
sync-report-period-ms: 10
key-num: 1000000 # when commutativity is enabled for test, we need the specified key number to detect commutativity
owd-estimation-window: 1000 # we use a sliding window to get moving median of one-way delays, here is the length of the window


================================================
FILE: configs/dist/nezha-replica-config-1.yaml
================================================
---
print-config: true
replica-ips:
  - "10.128.2.10"
  - "10.128.2.11"
  - "10.128.2.12"
replica-id: 1
receiver-shards: 1 # The number of threads to receive requests
process-shards: 1 # The number of threads to process requests. For now process-shards is fixed to 1, because the early-buffer enque/deque is hard to parallelize. Maybe later we can find a high-performant **concurrent priority queue** for early-buffer, then process-shards may be parallelized for higher performance
reply-shards: 1 # The number of threads to send replies (both fast/slow replies)
index-sync-shards: 1 # The number of threads used by the leader to broadcast index synchronization messages to followers. For followers, they only need one such thread to receive and handle the index sync msgs
receiver-port: 33333 # The port is exposed to client/proxy and is used to receive client/proxy requests. When receiver-shards>1, the corresponding ports are receiver-ports + shard-index
index-sync-port: 35333 # The port is used for replicas to communicate index synchronization messages. Typically, the leader sends index sync msgs and the followers receive them
request-ask-port: 36333 # When one follower is missing requests, they ask from the leader or other followers. The leader/followers send the missed requests to this port of this follower
index-ask-port: 37333 # When one follower is missing some index sync msgs, it asks the leader (or other followers) for the missing ones, and the leader (or other followers) send to the index-sync-port of the problematic follower
master-port: 34333 # This port is mainly used to send/receive other messages
monitor-period-ms: 50 # Each blocking thread has a monitor timer, which periodically checks the replica status and unblock the thread (and exit) when replica status is not Normal
main-loop-period-ms: 20
heartbeat-threshold-ms: 500
index-ask-period-ms: 10 # When missing indices, we launch a timer to periodically ask indices
request-ask-period-ms: 10 # When missing logs, we launch a timer to periodically ask logs(requests)
view-change-period-ms: 10
state-transfer-period-ms: 10
state-transfer-timeout-ms: 100000 # If the state-transfer has not been completed after such long time, then rollback to viewchange
index-transfer-max-batch: 30
request-transfer-max-batch: 5
crash-vector-request-period-ms: 50 # make it longer, because it will clear all previous replies
recovery-request-period-ms: 10
sync-report-period-ms: 10
key-num: 1000000 # when commutativity is enabled for test, we need the specified key number to detect commutativity
owd-estimation-window: 1000 # we use a sliding window to get moving median of one-way delays, here is the length of the window


================================================
FILE: configs/dist/nezha-replica-config-2.yaml
================================================
---
print-config: true
replica-ips:
  - "10.128.2.10"
  - "10.128.2.11"
  - "10.128.2.12"
replica-id: 2
receiver-shards: 1 # The number of threads to receive requests
process-shards: 1 # The number of threads to process requests. For now process-shards is fixed to 1, because the early-buffer enque/deque is hard to parallelize. Maybe later we can find a high-performant **concurrent priority queue** for early-buffer, then process-shards may be parallelized for higher performance
reply-shards: 1 # The number of threads to send replies (both fast/slow replies)
index-sync-shards: 1 # The number of threads used by the leader to broadcast index synchronization messages to followers. For followers, they only need one such thread to receive and handle the index sync msgs
receiver-port: 33333 # The port is exposed to client/proxy and is used to receive client/proxy requests. When receiver-shards>1, the corresponding ports are receiver-ports + shard-index
index-sync-port: 35333 # The port is used for replicas to communicate index synchronization messages. Typically, the leader sends index sync msgs and the followers receive them
request-ask-port: 36333 # When one follower is missing requests, they ask from the leader or other followers. The leader/followers send the missed requests to this port of this follower
index-ask-port: 37333 # When one follower is missing some index sync msgs, it asks the leader (or other followers) for the missing ones, and the leader (or other followers) send to the index-sync-port of the problematic follower
master-port: 34333 # This port is mainly used to send/receive other messages
monitor-period-ms: 50 # Each blocking thread has a monitor timer, which periodically checks the replica status and unblock the thread (and exit) when replica status is not Normal
main-loop-period-ms: 20
heartbeat-threshold-ms: 500
index-ask-period-ms: 10 # When missing indices, we launch a timer to periodically ask indices
request-ask-period-ms: 10 # When missing logs, we launch a timer to periodically ask logs(requests)
view-change-period-ms: 10
state-transfer-period-ms: 10
state-transfer-timeout-ms: 100000 # If the state-transfer has not been completed after such long time, then rollback to viewchange
index-transfer-max-batch: 30
request-transfer-max-batch: 5
crash-vector-request-period-ms: 50 # make it longer, because it will clear all previous replies
recovery-request-period-ms: 10
sync-report-period-ms: 10
key-num: 1000000 # when commutativity is enabled for test, we need the specified key number to detect commutativity
owd-estimation-window: 1000 # we use a sliding window to get moving median of one-way delays, here is the length of the window


================================================
FILE: configs/dist/nezha-replica-config.yaml
================================================
---
print-config: true
replica-ips:
  - "10.128.2.10"
  - "10.128.2.11"
  - "10.128.2.12"
replica-id: 0
receiver-shards: 1 # The number of threads to receive requests
process-shards: 1 # The number of threads to process requests. For now process-shards is fixed to 1, because the early-buffer enque/deque is hard to parallelize. Maybe later we can find a high-performant **concurrent priority queue** for early-buffer, then process-shards may be parallelized for higher performance
reply-shards: 1 # The number of threads to send replies (both fast/slow replies)
index-sync-shards: 1 # The number of threads used by the leader to broadcast index synchronization messages to followers. For followers, they only need one such thread to receive and handle the index sync msgs
receiver-port: 33333 # The port is exposed to client/proxy and is used to receive client/proxy requests. When receiver-shards>1, the corresponding ports are receiver-ports + shard-index
index-sync-port: 35333 # The port is used for replicas to communicate index synchronization messages. Typically, the leader sends index sync msgs and the followers receive them
request-ask-port: 36333 # When one follower is missing requests, they ask from the leader or other followers. The leader/followers send the missed requests to this port of this follower
index-ask-port: 37333 # When one follower is missing some index sync msgs, it asks the leader (or other followers) for the missing ones, and the leader (or other followers) send to the index-sync-port of the problematic follower
master-port: 34333 # This port is mainly used to send/receive other messages
monitor-period-ms: 50 # Each blocking thread has a monitor timer, which periodically checks the replica status and unblock the thread (and exit) when replica status is not Normal
main-loop-period-ms: 20
heartbeat-threshold-ms: 500
index-ask-period-ms: 10 # When missing indices, we launch a timer to periodically ask indices
request-ask-period-ms: 10 # When missing logs, we launch a timer to periodically ask logs(requests)
view-change-period-ms: 10
state-transfer-period-ms: 10
state-transfer-timeout-ms: 100000 # If the state-transfer has not been completed after such long time, then rollback to viewchange
index-transfer-max-batch: 30
request-transfer-max-batch: 5
crash-vector-request-period-ms: 50 # make it longer, because it will clear all previous replies
recovery-request-period-ms: 10
sync-report-period-ms: 10
key-num: 1000000 # when commutativity is enabled for test, we need the specified key number to detect commutativity
owd-estimation-window: 1000 # we use a sliding window to get moving median of one-way delays, here is the length of the window


================================================
FILE: configs/local/nezha-client-config.yaml
================================================
---
client-id: 1
client-ip: "127.0.0.5"
endpoint-type: 1 # 1 for UDP Endpoint, 2 for GRPC Endpoint [to be supported soon]
request-port: 32912
is-openloop: true
poisson-rate: 1 # it means the client sends x reqs/10ms on average (should be larger than 10, otherwise, the submission rate is not accurate)
duration-sec: 60 # it means the duration of the client runs (second)
key-num: 1000000 # when key-num is 1, it means there is no commutativity optimization
skew-factor: 0.5 # 0-0.99 The higher the zipfian factor is, the more skewed the workload is
write-ratio: 0.5 # 0-1, the ratio of write requests
request-retry-time-us: 100000 # After the request is submitted, if we cannot get the response after such long time, then we will retry

# proxy info
proxy-ips:
  - "127.0.0.4"
proxy-shards: 1
proxy-request-port-base: 32000


================================================
FILE: configs/local/nezha-proxy-config.yaml
================================================
---
# Proxy Info
proxy-endpoint-type: 1 # 1 for UDP Endpoint, 2 for GRPC Endpoint [to be supported soon]
proxy-id: 1
proxy-ip: "127.0.0.4"
proxy-shard-num: 1
proxy-max-owd: 200
proxy-request-port-base: 32000
proxy-reply-port-base: 33000

# Replica Info
replica-ips:
  - "127.0.0.1"
  - "127.0.0.2"
  - "127.0.0.3"
replica-receiver-shards: 1 # The number of threads to receive requests
replica-receiver-port: 33333 # The port is exposed to client/proxy and is used to receive client/proxy requests. When receiver-shards>1, the corresponding ports are receiver-ports + shard-index
replica-initial-owd: 80 # The initial one-way delay (us) between replicas and proxies


================================================
FILE: configs/local/nezha-replica-config-0.yaml
================================================
---
print-config: true
replica-ips:
  - "127.0.0.1"
  - "127.0.0.2"
  - "127.0.0.3"
endpoint-type: 1 # 1 for UDP Endpoint, 2 for GRPC Endpoint [to be supported soon]
replica-id: 0
receiver-shards: 1 # The number of threads to receive requests
record-shards: 1 # The number of threads to record requests in the global concurrent map
track-shards: 1 # The number of threads to record synced log entries
reply-shards: 2 # The number of threads to send replies (both fast/slow replies)
index-sync-shards: 1 # The number of threads used by the leader to broadcast index synchronization messages to followers. For followers, they only need one such thread to receive and handle the index sync msgs
receiver-port: 33333 # The port is exposed to client/proxy and is used to receive client/proxy requests. When receiver-shards>1, the corresponding ports are receiver-ports + shard-index
index-sync-port: 35333 # The port is used for replicas to communicate index synchronization messages. Typically, the leader sends index sync msgs and the followers receive them
index-sync-period-us: 50 # For every period, the leader replica will send a index sync message to followers, serves for slow path and heartbeat
request-ask-port: 36333 # When one follower is missing requests, they ask from the leader or other followers. The leader/followers send the missed requests to this port of this follower
index-ask-port: 37333 # When one follower is missing some index sync msgs, it asks the leader (or other followers) for the missing ones, and the leader (or other followers) send to the index-sync-port of the problematic follower
master-port: 34333 # This port is mainly used to send/receive other messages
monitor-period-ms: 50 # Each blocking thread has a monitor timer, which periodically checks the replica status and unblock the thread (and exit) when replica status is not Normal
main-loop-period-ms: 20
heartbeat-threshold-ms: 500
index-ask-period-ms: 10 # When missing indices, we launch a timer to periodically ask indices
request-ask-period-ms: 10 # When missing logs, we launch a timer to periodically ask logs(requests)
view-change-period-ms: 10
state-transfer-period-ms: 10
state-transfer-timeout-ms: 100000 # If the state-transfer has not been completed after such long time, then rollback to viewchange
index-transfer-batch: 30
request-key-transfer-batch: 60
request-transfer-batch: 5
crash-vector-request-period-ms: 50 # make it longer, becasue it will clear all previous replies
recovery-request-period-ms: 10
sync-report-period-ms: 10
key-num: 1000000 # When commutativity is enabled for test, we need the specified key number to detect commutativity
moving-percentile: 0.50 # the percentile used to estimate owd
owd-estimation-window: 1000 # We use a sliding window to get moving median of one-way delays, here is the length of the window
reclaim-timeout-ms: 5000 # To save memory, those requests who enter the late buffer will not stay there forever: if they have stayed for so long, they will be reclaimed. Similary, for unsynced log entries, if (1) they have been kept for more than reclaim-timeout-ms (2) they are not used by the worker threads; then they will be reclaimed


================================================
FILE: configs/local/nezha-replica-config-1.yaml
================================================
---
# Replica configuration for replica-id 1 (all replica IPs below are loopback addresses).
print-config: true
replica-ips:
  - "127.0.0.1"
  - "127.0.0.2"
  - "127.0.0.3"
endpoint-type: 1 # 1 for UDP Endpoint, 2 for GRPC Endpoint [to be supported soon]
replica-id: 1
receiver-shards: 1 # The number of threads to receive requests
record-shards: 1 # The number of threads to record requests in the global concurrent map
track-shards: 1 # The number of threads to record synced log entries
reply-shards: 2 # The number of threads to send replies (both fast/slow replies)
index-sync-shards: 1 # The number of threads used by the leader to broadcast index synchronization messages to followers. Followers only need one such thread to receive and handle the index sync msgs
receiver-port: 33333 # The port is exposed to client/proxy and is used to receive client/proxy requests. When receiver-shards>1, the corresponding ports are receiver-port + shard-index
index-sync-port: 35333 # The port is used for replicas to communicate index synchronization messages. Typically, the leader sends index sync msgs and the followers receive them
index-sync-period-us: 50 # Every period, the leader replica sends an index sync message to followers; it serves both the slow path and the heartbeat
request-ask-port: 36333 # When one follower is missing requests, it asks the leader or other followers for them. The leader/followers send the missed requests to this port of this follower
index-ask-port: 37333 # When one follower is missing some index sync msgs, it asks the leader (or other followers) for the missing ones, and the leader (or other followers) send to the index-sync-port of the problematic follower
master-port: 34333 # This port is mainly used to send/receive other messages
monitor-period-ms: 50 # Each blocking thread has a monitor timer, which periodically checks the replica status and unblocks the thread (and exits) when the replica status is not Normal
main-loop-period-ms: 20
heartbeat-threshold-ms: 500
index-ask-period-ms: 10 # When missing indices, we launch a timer to periodically ask for indices
request-ask-period-ms: 10 # When missing logs, we launch a timer to periodically ask for logs (requests)
view-change-period-ms: 10
state-transfer-period-ms: 10
state-transfer-timeout-ms: 100000 # If the state transfer has not completed after this long, roll back to view change
index-transfer-batch: 30
request-key-transfer-batch: 60
request-transfer-batch: 5
crash-vector-request-period-ms: 50 # make it longer, because it will clear all previous replies
recovery-request-period-ms: 10
sync-report-period-ms: 10
key-num: 1000000 # When commutativity is enabled for test, we need the specified key number to detect commutativity
moving-percentile: 0.50 # the percentile used to estimate owd
owd-estimation-window: 1000 # We use a sliding window to get the moving median of one-way delays; this is the length of the window
reclaim-timeout-ms: 5000 # To save memory, requests that enter the late buffer will not stay there forever: once they have stayed this long, they are reclaimed. Similarly, unsynced log entries are reclaimed if (1) they have been kept for more than reclaim-timeout-ms and (2) they are not used by the worker threads


================================================
FILE: configs/local/nezha-replica-config-2.yaml
================================================
---
# Replica configuration for replica-id 2 (all replica IPs below are loopback addresses).
print-config: true
replica-ips:
  - "127.0.0.1"
  - "127.0.0.2"
  - "127.0.0.3"
endpoint-type: 1 # 1 for UDP Endpoint, 2 for GRPC Endpoint [to be supported soon]
replica-id: 2
receiver-shards: 1 # The number of threads to receive requests
record-shards: 1 # The number of threads to record requests in the global concurrent map
track-shards: 1 # The number of threads to record synced log entries
reply-shards: 2 # The number of threads to send replies (both fast/slow replies)
index-sync-shards: 1 # The number of threads used by the leader to broadcast index synchronization messages to followers. Followers only need one such thread to receive and handle the index sync msgs
receiver-port: 33333 # The port is exposed to client/proxy and is used to receive client/proxy requests. When receiver-shards>1, the corresponding ports are receiver-port + shard-index
index-sync-port: 35333 # The port is used for replicas to communicate index synchronization messages. Typically, the leader sends index sync msgs and the followers receive them
index-sync-period-us: 50 # Every period, the leader replica sends an index sync message to followers; it serves both the slow path and the heartbeat
request-ask-port: 36333 # When one follower is missing requests, it asks the leader or other followers for them. The leader/followers send the missed requests to this port of this follower
index-ask-port: 37333 # When one follower is missing some index sync msgs, it asks the leader (or other followers) for the missing ones, and the leader (or other followers) send to the index-sync-port of the problematic follower
master-port: 34333 # This port is mainly used to send/receive other messages
monitor-period-ms: 50 # Each blocking thread has a monitor timer, which periodically checks the replica status and unblocks the thread (and exits) when the replica status is not Normal
main-loop-period-ms: 20
heartbeat-threshold-ms: 500
index-ask-period-ms: 10 # When missing indices, we launch a timer to periodically ask for indices
request-ask-period-ms: 10 # When missing logs, we launch a timer to periodically ask for logs (requests)
view-change-period-ms: 10
state-transfer-period-ms: 10
state-transfer-timeout-ms: 100000 # If the state transfer has not completed after this long, roll back to view change
index-transfer-batch: 30
request-key-transfer-batch: 60
request-transfer-batch: 5
crash-vector-request-period-ms: 50 # make it longer, because it will clear all previous replies
recovery-request-period-ms: 10
sync-report-period-ms: 10
key-num: 1000000 # When commutativity is enabled for test, we need the specified key number to detect commutativity
moving-percentile: 0.50 # the percentile used to estimate owd
owd-estimation-window: 1000 # We use a sliding window to get the moving median of one-way delays; this is the length of the window
reclaim-timeout-ms: 5000 # To save memory, requests that enter the late buffer will not stay there forever: once they have stayed this long, they are reclaimed. Similarly, unsynced log entries are reclaimed if (1) they have been kept for more than reclaim-timeout-ms and (2) they are not used by the worker threads


================================================
FILE: configs/nezha-client-config-template.yaml
================================================
---
# Template client configuration: proxy-info describes the proxies the client talks to,
# client-info describes the client itself and its workload.
print-config: true
proxy-info:
  proxy-ips:
    - "127.0.0.4"
  proxy-shards: 12
  request-port-base: 32000
client-info:
  endpoint-type: 1 # 1 for UDP Endpoint, 2 for GRPC Endpoint [to be supported soon]
  client-id: 1
  client-ip: "127.0.0.5"
  request-port: 32912
  is-openloop: true
  poisson-rate: 60 # the client sends x reqs/10ms on average (should be larger than 10; otherwise the submission rate is not accurate)
  duration-sec: 60 # how long the client runs (seconds)
  key-num: 1000000 # when key-num is 1, there is no commutativity optimization
  skew-factor: 0.5 # 0-0.99; the higher the zipfian factor, the more skewed the workload
  write-ratio: 0.5 # 0-1, the ratio of write requests
  request-retry-time-us: 100000 # After a request is submitted, if no response arrives within this time (us), the request is retried


================================================
FILE: configs/nezha-proxy-config-template.yaml
================================================
---
# Template proxy configuration: replica-info describes the replicas the proxy sends to,
# proxy-info describes the proxy itself.
print-config: true
# Replica Info
replica-info:
  replica-ips:
    - "127.0.0.1"
    - "127.0.0.2"
    - "127.0.0.3"
  receiver-shards: 2 # The number of threads to receive requests
  receiver-port: 33333 # The port is exposed to client/proxy and is used to receive client/proxy requests. When receiver-shards>1, the corresponding ports are receiver-port + shard-index
  initial-owd: 80 # The initial one-way delay (us) between replicas and proxies
# Proxy Info
proxy-info:
  endpoint-type: 1 # 1 for UDP Endpoint, 2 for GRPC Endpoint [to be supported soon]
  proxy-id: 1
  proxy-ip: "127.0.0.4"
  shard-num: 12
  max-owd: 200
  request-port-base: 32000
  reply-port-base: 33000


================================================
FILE: configs/nezha-replica-config-template.yaml
================================================
---
# Template replica configuration. Copy it and adjust replica-id / replica-ips per replica.
print-config: true
replica-ips:
  - "127.0.0.1"
  - "127.0.0.2"
  - "127.0.0.3"
endpoint-type: 1 # 1 for UDP Endpoint, 2 for GRPC Endpoint [to be supported soon]
replica-id: 0
receiver-shards: 2 # The number of threads to receive requests
record-shards: 1 # The number of threads to record requests in the global concurrent map
track-shards: 1 # The number of threads to record synced log entries
process-shards: 1 # The number of threads to process requests. For now process-shards is fixed to 1, because the early-buffer enqueue/dequeue is hard to parallelize. Maybe later we can find a high-performance **concurrent priority queue** for the early-buffer; then process-shards may be parallelized for higher performance
reply-shards: 3 # The number of threads to send replies (both fast/slow replies)
index-sync-shards: 1 # The number of threads used by the leader to broadcast index synchronization messages to followers. Followers only need one such thread to receive and handle the index sync msgs
receiver-port: 33333 # The port is exposed to client/proxy and is used to receive client/proxy requests. When receiver-shards>1, the corresponding ports are receiver-port + shard-index
index-sync-port: 35333 # The port is used for replicas to communicate index synchronization messages. Typically, the leader sends index sync msgs and the followers receive them
index-sync-period-us: 50 # Every period, the leader replica sends an index sync message to followers; it serves both the slow path and the heartbeat
request-ask-port: 36333 # When one follower is missing requests, it asks the leader or other followers for them. The leader/followers send the missed requests to this port of this follower
index-ask-port: 37333 # When one follower is missing some index sync msgs, it asks the leader (or other followers) for the missing ones, and the leader (or other followers) send to the index-sync-port of the problematic follower
master-port: 34333 # This port is mainly used to send/receive other messages
monitor-period-ms: 50 # Each blocking thread has a monitor timer, which periodically checks the replica status and unblocks the thread (and exits) when the replica status is not Normal
main-loop-period-ms: 20
heartbeat-threshold-ms: 500
index-ask-period-ms: 10 # When missing indices, we launch a timer to periodically ask for indices
request-ask-period-ms: 10 # When missing logs, we launch a timer to periodically ask for logs (requests)
view-change-period-ms: 10
state-transfer-period-ms: 10
state-transfer-timeout-ms: 100000 # If the state transfer has not completed after this long, roll back to view change
index-transfer-batch: 30
request-key-transfer-batch: 60
request-transfer-batch: 5
crash-vector-request-period-ms: 50 # make it longer, because it will clear all previous replies
recovery-request-period-ms: 10
sync-report-period-ms: 10
key-num: 1000000 # When commutativity is enabled for test, we need the specified key number to detect commutativity
# Key spelled with a hyphen, like every other key here and like the other replica configs.
moving-percentile: 0.90 # the percentile used to estimate owd
owd-estimation-window: 1000 # We use a sliding window to get the moving percentile of one-way delays; this is the length of the window
reclaim-timeout-ms: 5000 # To save memory, requests that enter the late buffer will not stay there forever: once they have stayed this long, they are reclaimed. Similarly, unsynced log entries are reclaimed if (1) they have been kept for more than reclaim-timeout-ms and (2) they are not used by the worker threads


================================================
FILE: docs/Nezha.tla
================================================

`^\textbf{\large Nezha TLA+ Specification}\\^' 
 
------------------------------ MODULE Nezha ----------------------------------

EXTENDS Naturals, TLC, FiniteSets, Sequences

--------------------------------------------------------------------------------
(* `^\textbf{\large Bounds for Model Check [Configurable]}^' *)

\* Time Range [Configurable]
\* Clients may only send while their clock is below MaxTime (see ClientSendRequest)
MaxTime == 3

\* Each client is only allowed to submit MaxReqNum requests [Configurable]
\* In the specification, we will only consider two roles, client and replicas
\* (i.e. it can be considered as co-locating one proxy with one client)
\* For the proxy-based design, we just need to replace client with proxy, 
\* and then the specification describes the interaction between proxy and replicas
MaxReqNum == 1 

\* The leader is only allowed to crash when the view < MaxViews [Configurable]
MaxViews == 3

\* These variables are used to implement at-most-once primitives
\* i.e. The variables record the messages processed by Replicas/Clients, so 
\* that the Replicas/Clients will not process the same message twice
VARIABLE  vReplicaProcessed, \* Messages that have been processed by replicas
          vClientProcessed \* Messages that have been processed by clients

\* Initialized to <<"Init", "">> in Init; presumably records the most recent
\* action for debugging -- TODO confirm against the actions that update it
VARIABLE DebugAction

(* `^\textbf{\large Constants}^' *)

\* The set of replicas and an ordering of them
\* Clients is the set of clients; LatencyBounds is indexed by client
\* (LatencyBounds[c] is used as the latency bound l of c's requests)
CONSTANTS Replicas, ReplicaOrder, Clients, LatencyBounds
ASSUME IsFiniteSet(Replicas) 
ASSUME ReplicaOrder \in Seq(Replicas)


\* F == floor((N - 1) / 2) where N is the number of replicas
\* (presumably the number of tolerated replica failures, i.e. N = 2F + 1)
F == (Cardinality(Replicas) - 1) \div 2
\* ceil(F / 2): F/2 when F is even, (F+1)/2 when F is odd
ceilHalfF == IF (F \div 2) * 2 = F THEN F \div 2 ELSE (F+1) \div 2
\* floor(F / 2)
floorHalfF == F \div 2
QuorumSize == F + 1
\* F + ceil(F/2) + 1
FastQuorumSize == F + ceilHalfF + 1
\* ceil(F/2) + 1
RecoveryQuorumSize == ceilHalfF + 1
\* All replica subsets with at least FastQuorumSize members
FastQuorums == {R \in SUBSET(Replicas) : Cardinality(R) >= FastQuorumSize }
\* All replica subsets that form a strict majority of Replicas
Quorums == {R \in SUBSET(Replicas) : Cardinality(R) * 2 > Cardinality(Replicas)}   

\* Replica Statuses (values stored in vReplicaStatus)
StNormal == 1
StViewChange == 2
StRecovering == 3

\* Message Types (values of the mtype field of every message/log record)
MClientRequest == 1 \* Sent by client to replicas
MFastReply == 2 \* Fast Reply Message
MSlowReply == 3 \* Slow Reply Message
MLogIndex == 4  \* LogIndex
MLogEntry == 5  \* Log entry; unlike an index, it includes the command field, which can be large in practice
MIndexSync == 6 \* Sync message during the index sync process
MMissEntryRequest == 7 \* Sent by a follower when it fails to find an entry locally
MMissEntryReply == 8  \* Response to MMissEntryRequest, providing the missing entries

MViewChangeReq == 9       \* Sent when leader/sequencer failure detected
MViewChange == 10        \* Sent to ACK view change
MStartView == 11           \* Sent by new leader to start view

\* The following messages are mainly used for periodic sync
\* Just as described in NOPaxos, it is an optional optimization to enable fast recovery after failure
MSyncPrepare == 12         \* Sent by the leader to ensure log durability
MSyncRep == 13             \* Sent by followers as ACK
MSyncCommit == 14           \* Sent by leaders to indicate stable log

\* The following messages are mainly used for replica recovery
\* (request/reply pairs; their fields are listed in the Message Schemas comment)
MCrashVectorReq == 15
MCrashVectorRep == 16
MRecoveryReq == 17
MRecoveryRep == 18
MStateTransferReq == 19
MStateTransferRep == 20
      

(*
  `^\textbf{Message Schemas}^'

  ViewIDs == [ leaderNum |-> n \in (1..) ]

  \* <clientID, requestID> uniquely identifies one request on one replica
  \* But across replicas, the same <clientID, requestID> may have different deadlines
  \* (the leader may modify the deadline to make the request eligible to enter the early-buffer)
  \* so <deadline, clientID, reqID> uniquely identifies one request across replicas 

  ClientRequest
      [ mtype       |-> MClientRequest,
        sender      |-> c \in Clients,
        dest        |-> r \in Replicas,
        requestID   |-> i \in (1..), 
        command     |-> "", 
        s           |-> t \in (1..MaxTime), 
        l           |-> l \in (1..MaxBound)
      ]
  
  \* logSlotNum is not necessary and it is not described in the paper
  \* Here we include logSlotNum in FastReply and SlowReply messages
  \* to facilitate the check of Linearizability invariant
  FastReply
      [ mtype      |-> MFastReply,
        sender     |-> r \in Replicas,
        dest       |-> c \in Clients,
        viewID     |-> v \in ViewIDs,
        requestID  |-> i \in (1..vClientReqNum),
        hash       |-> [
                        log |-> vLogs[1..n], 
                        cv |-> crashVector
                       ],
        deadline   |-> i \in (1..MaxTime+MaxBound),
        logSlotNum |-> n \in (1..)
      ]

  SlowReply
      [ mtype      |-> MSlowReply,
        sender     |-> r \in Replicas,
        dest       |-> c \in Clients,
        viewID     |-> v \in ViewIDs,
        requestID  |-> i \in (1..vClientReqNum),
        logSlotNum |-> n \in (1..)
      ]
      
  LogIndex
      [ mtype      |-> MLogIndex,
        clientID   |-> c \in Clients,
        requestID  |-> i \in (1..vClientReqNum),
        deadline   |-> i \in (1..MaxTime+MaxBound),
      ]
      
  LogEntry
      [ mtype      |-> MLogEntry,
        clientID   |-> c \in Clients,
        requestID  |-> i \in (1..vClientReqNum),
        deadline   |-> i \in (1..MaxTime+MaxBound),
        command    |-> ""
      ]
      
  IndexSync
      [ mtype      |-> MIndexSync,
        sender     |-> r \in Replicas,
        dest       |-> c \in Clients,
        viewID     |-> v \in ViewIDs,
        logindcies |-> index \in vLogs[leaderIdx]
      ]

   MMissEntryRequest
      [ mtype      |-> MMissEntryRequest,
        sender     |-> r \in Replicas,
        dest       |-> d \in Replicas,
        viewID     |-> v \in ViewIDs,
        miss       |-> {log indices}
      ]

   MMissEntryReply
      [ mtype      |-> MMissEntryReply,
        sender     |-> r \in Replicas,
        dest       |-> d \in Replicas,
        viewID     |-> v \in ViewIDs,
        entries    |-> {log entries}
      ]
      
  ViewChangeReq
      [ mtype  |-> MViewChangeReq,
        sender |-> r \in Replicas,
        dest   |-> r \in Replicas,
        viewID |-> v \in ViewIDs,
        cv     |-> crash vector 
      ]

  ViewChange
      [ mtype      |-> MViewChange,
        sender     |-> r \in Replicas,
        dest       |-> r \in Replicas,
        viewID     |-> v \in ViewIDs,
        lastNormal |-> v \in ViewIDs,
        log        |-> l \in vLogs[1..n],
        cv         |-> crash vector  
      ]

  StartView
      [ mtype      |-> MStartView,
        dest       |-> r \in Replicas,
        viewID     |-> v \in ViewIDs,
        log        |-> l \in vLogs[1..n],
        cv         |-> crash vector 
      ]


  SyncPrepare
      [ mtype      |-> MSyncPrepare,
        dest       |-> r \in Replicas,
        sender     |-> r \in Replicas,
        viewID     |-> v \in ViewIDs,
        log        |-> l \in vLogs[1..n] ]

  SyncRep
      [ mtype         |-> MSyncRep,
        dest          |-> r \in Replicas,
        sender        |-> r \in Replicas,
        viewID        |-> v \in ViewIDs,
        logSlotNumber |-> n \in (1..) ]

  SyncCommit
      [ mtype         |-> MSyncCommit,
        dest          |-> r \in Replicas,
        sender        |-> r \in Replicas,
        viewID        |-> v \in ViewIDs,
        log           |-> l \in vLogs[1..n] ]
        
  CrashVectorReq
      [ mtype         |-> MCrashVectorReq,
        sender        |-> r \in Replicas,
        dest          |-> r \in Replicas,
        nonce         |-> nonce
      ] 
  CrashVectorRep
      [ mtype         |-> MCrashVectorRep,
        sender        |-> r \in Replicas,
        dest          |-> r \in Replicas,
        nonce         |-> nonce,
        cv            |-> vector of counters
      ] 
      
  RecoveryReq
      [ mtype         |-> MRecoveryReq,
        sender        |-> r \in Replicas,
        dest          |-> r \in Replicas,
        cv            |-> vector of counters
      ]  
      
  RecoveryRep
      [ mtype         |-> MRecoveryRep,
        sender        |-> r \in Replicas,
        dest          |-> r \in Replicas,
        viewID        |-> v \in ViewIDs,
        cv            |-> vector of counters
      ]           

  StateTransferReq
      [ mtype         |-> MStateTransferReq,
        sender        |-> r \in Replicas,
        dest          |-> r \in Replicas,
        cv            |-> vector of counters
      ]  
  StateTransferRep
      [ mtype         |-> MStateTransferRep,
        sender        |-> r \in Replicas,
        dest          |-> r \in Replicas,
        viewID        |-> v \in ViewIDs,
        log           |-> l \in vLogs[1..n],
        cv            |-> vector of counters
      ]  
*)

--------------------------------------------------------------------------------
(* `^\textbf{\large Variables}^' *)

\* `^\textbf{Network State}^'
\* The network is modelled as one set of messages; Send only ever adds to it
VARIABLE messages \* Set of all messages sent

networkVars      == << messages >>
\* Initially no message has been sent
InitNetworkState == messages = {}

\* Used as a dummy value (e.g. LastLog returns it for an empty log).
\* All three ordering fields are 0.
NULLLog == [    deadline        |-> 0, 
                clientID    |-> 0,  
                requestID   |-> 0
           ]


\* `^\textbf{Replica State}^'
\* Every variable below is a function indexed by replica (see InitReplicaState)
VARIABLES vLog,            \* Log of values
          vEarlyBuffer,    \* The early buffer to hold requests,
                           \* and release them after the clock passes their deadline (s+l)
          vReplicaStatus,  \* One of StNormal, StViewChange, StRecovering
          vViewID,         \* Current viewID replicas recognize
          vReplicaClock,   \* Current Time of the replica
          vLastNormView,   \* Last views in which replicas had status StNormal
          vViewChanges,    \* Used for logging view change votes
          vSyncPoint,      \* Latest synchronization point, 
                           \* up to which the replica state (vLog) is consistent with the leader.
          vLateBuffer,     \* The late buffer, used to store the requests 
                           \* which are not eligible to enter vEarlyBuffer 
          
          vTentativeSync,  \* Used by leader to mark current syncPrepare point (during periodic sync process)
                           \* (Actually, vSyncPoint and vTentativeSync can be merged into one Var
                           \* However, we decouple them to make the spec easy to understand)
          vSyncReps,       \* Used for logging sync reps at leader 
          vCommitPoint,    \* Different from vSyncPoint, 
                           \* vCommitPoint indicates that the logs before this point have been replicated to a majority
                           \* So followers can safely execute requests (log entries) up to vCommitPoint
                           \* Refer to ``Acceleration of Recovery" para in Sec 6
                           
          vUUIDCounter,    \* Locally unique string (for CrashVectorReq)
          vCrashVector,    \* CrashVector, initialized as all-zero vector
          vCrashVectorReps,\* CrashVectorRep Set
          vRecoveryReps    \* RecoveryRep Set
          
\* Tuple of all replica-state variables (used e.g. in UNCHANGED clauses)
replicaVars      == << vLog, vEarlyBuffer, 
                       vViewID, vReplicaClock,
                       vLastNormView, vViewChanges,vReplicaStatus,
                       vSyncPoint, vLateBuffer,
                       vTentativeSync, vSyncReps, vCommitPoint, 
                       vUUIDCounter, vCrashVector, 
                       vCrashVectorReps, vRecoveryReps>>

\* Initial replica state: every per-replica variable maps each replica to the
\* same starting value (empty log/buffers/vote sets, view 1, clock 1, status
\* StNormal, sync/commit points 0, all-zero crash vector, UUID counter 0).
InitReplicaState ==
  LET
    \* The function that maps every replica to the value v
    perReplica(v) == [ r \in Replicas |-> v ]
  IN
    /\ vLog             = perReplica(<< >>)
    /\ vEarlyBuffer     = perReplica({})
    /\ vViewID          = perReplica(1)  \* 0 should also be okay
    /\ vReplicaClock    = perReplica(1)
    /\ vLastNormView    = perReplica(1)
    /\ vViewChanges     = perReplica({})
    /\ vReplicaStatus   = perReplica(StNormal)
    /\ vSyncPoint       = perReplica(0)
    /\ vLateBuffer      = perReplica({})
    /\ vTentativeSync   = perReplica(0)
    /\ vSyncReps        = perReplica({})
    /\ vCommitPoint     = perReplica(0)
    /\ vCrashVector     = perReplica(perReplica(0))
    /\ vCrashVectorReps = perReplica({})
    /\ vRecoveryReps    = perReplica({})
    /\ vUUIDCounter     = perReplica(0)

\* `^\textbf{Client State}^'
VARIABLES   vClientClock,   \* Current clock time of each client
            vClientReqNum   \* Number of requests each client has sent so far

\* Every client starts with clock 1 and no requests sent
InitClientState ==
  LET
    perClient(v) == [ c \in Clients |-> v ]
  IN
    /\ vClientClock  = perClient(1)
    /\ vClientReqNum = perClient(0)

\* Tuple of all client-state variables
clientVars == << vClientClock, vClientReqNum >>

\* `^\textbf{Set of all vars}^'
\* NOTE(review): vars does not include vReplicaProcessed, vClientProcessed or
\* DebugAction, although Init constrains them -- confirm this is intended
\* wherever vars is used.
vars == << networkVars, replicaVars, clientVars >>

\* `^\textbf{Initial state}^'
\* Empty network, initial replica and client state, nothing processed yet,
\* and DebugAction set to the "Init" marker.
Init == /\ InitNetworkState
        /\ InitReplicaState
        /\ InitClientState
        /\ vReplicaProcessed = [ r \in Replicas |-> {} ]
        /\ vClientProcessed = [c \in Clients |-> {}]
        /\ DebugAction = <<"Init", "">>

--------------------------------------------------------------------------------
(* `^\textbf{\large Helpers}^' *)

\* Number of replicas whose current status (vReplicaStatus) equals `status`
NumofReplicas(status) ==
    LET inStatus == { rep \in Replicas : vReplicaStatus[rep] = status }
    IN  Cardinality(inStatus)

\* TRUE iff ReplySet already contains a reply from the sender of m
DuplicateRep(ReplySet,m) == \E prior \in ReplySet : prior.sender = m.sender

\* An arbitrary (but fixed) element of the non-empty set S
Pick(S) == CHOOSE member \in S : TRUE

\* Convert a finite set to a sequence that contains each element exactly once.
\* The element order is whatever order Pick happens to choose.
RECURSIVE Set2Seq(_)
Set2Seq(S) ==
    IF Cardinality(S) = 0
    THEN << >>
    ELSE LET head == Pick(S)
         IN  << head >> \o Set2Seq(S \ { head })

\* Convert a sequence to the set of its elements (duplicates collapse)
Seq2Set(seq) == { seq[idx] : idx \in 1..Len(seq) }
       
\* Largest element of the non-empty set S
Max(S) == CHOOSE top \in S : \A other \in S : other <= top

\* Smallest element of the non-empty set S
Min(S) == CHOOSE bottom \in S : \A other \in S : other >= bottom

\* `^\textbf{View ID Helpers}^'
\* Index (into ReplicaOrder) of the leader of view viewID.
\* With n == Len(ReplicaOrder): the result is viewID itself while
\* 0 <= viewID < n, and (viewID % n) + 1 once viewID >= n.
\* Consequently viewID = 0 yields 0, which is not a valid 1-based index.
LeaderID(viewID) ==
    LET n    == Len(ReplicaOrder)
        bump == IF viewID >= n THEN 1 ELSE 0
    IN  (viewID % n) + bump

\* The replica that leads view viewID (remember <<>> are 1-indexed)
Leader(viewID) ==
    LET idx == LeaderID(viewID)
    IN  ReplicaOrder[idx]


\* `^\textbf{Log Manipulation Helpers}^'

\* Log entries are ordered lexicographically by the tuple
\* <deadline, clientID, requestID>:
\*   - usually the deadline alone makes two entries comparable;
\*   - when 2 different entries have the same deadline, the tie is broken by clientID;
\*   - a remaining tie is broken by requestID
\*     (unnecessary if we only allow a client to submit one request per tick).
\* The comparison must be lexicographic rather than a conjunction of per-field
\* <= tests: the conjunction is only a partial order (an entry with a smaller
\* deadline but a larger clientID would be incomparable), which breaks both
\* sorting and the "after the last synced log" filter built on EntryLessThan.
EntryLeq(l1, l2)      == \/ l1.deadline < l2.deadline
                         \/ /\ l1.deadline = l2.deadline
                            /\ l1.clientID < l2.clientID
                         \/ /\ l1.deadline = l2.deadline
                            /\ l1.clientID = l2.clientID
                            /\ l1.requestID <= l2.requestID

\* TRUE iff the two entries agree on all three ordering fields
EntryEq(l1, l2)       == /\ l1.deadline = l2.deadline  
                         /\ l1.clientID = l2.clientID
                         /\ l1.requestID = l2.requestID

\* Strict version of EntryLeq
EntryLessThan(l1, l2) == /\ EntryLeq(l1, l2)
                         /\ ~(EntryEq(l1, l2))
                            
\* Find entry in one replica's log (<clientID, reqID> can uniquely identify the log entry)
\* We do not check deadline, because the leader may have modified the request's deadline
\* Returns the index of a matching entry, or 0 when there is none
\* (remember Sequence is 1-indexed in TLA+, so 0 can serve as a dummy value)
\* Log entries store the request id in the field `requestID` (see Req2Log),
\* so that is the field compared against the reqID argument.
FindEntry(clientID, reqID, log) == 
    LET 
        entryIndexSet == { i \in 1..Len(log): /\ log[i].clientID = clientID
                                              /\ log[i].requestID = reqID }
    IN
        IF Cardinality(entryIndexSet) = 0 THEN 
            0
        ELSE
            Pick(entryIndexSet)
                                

\* Sort a sequence of log entries using EntryLessThan as the comparison
SortLogSeq(seq) ==
    SortSeq(seq, LAMBDA earlier, later : EntryLessThan(earlier, later))

\* Given a set of logs, return the sorted log list
GetSortLogSeq(S) == SortLogSeq(Set2Seq(S))
                        
                            
(* Merge logs: take the union of all log items (which deduplicates them),
   then keep only the items that come after lastSyncedLog and that appear in
   at least `^\left \lceil{f/2}\right \rceil +1^' (RecoveryQuorumSize) of the
   given log sets, and return them as a sorted sequence. *)

\* Number of log sets in logll that contain x.
\* NOTE(review): logll is itself a set, so two identical log sets collapse
\* into one element and count once -- confirm callers account for this.
CountVotes(logll, x) == Cardinality({ voter \in logll : x \in voter })

MergeUnSyncLogs(unSyncedLogs, lastSyncedLog) ==
    LET
        \* e is newer than the last synced entry and has enough votes
        Qualifies(e) == /\ EntryLessThan(lastSyncedLog, e)
                        /\ CountVotes(unSyncedLogs, e) >= RecoveryQuorumSize
    IN
        GetSortLogSeq({ e \in UNION unSyncedLogs : Qualifies(e) })
            
\* `^\textbf{Network Helpers}^'
\* Add the set of messages ms to the network
Send(ms) == messages' = (ms \cup messages)

\* Convert a client request to a log entry; the deadline is s + l
Req2Log(req) ==
    [ mtype     |-> MLogEntry,
      clientID  |-> req.sender,
      requestID |-> req.requestID,
      deadline  |-> req.s + req.l,
      command   |-> req.command ]

\* Index of a log entry. It omits the command field, which is the body of the
\* request/log and can be very large
GetLogIndex(entry) ==
    [ mtype     |-> MLogIndex,
      clientID  |-> entry.clientID,
      requestID |-> entry.requestID,
      deadline  |-> entry.deadline ]

\* Index of the log entry a reply refers to; the reply's dest is the client
GetLogIndexFromReply(reply) ==
    [ mtype     |-> MLogIndex,
      clientID  |-> reply.dest,
      requestID |-> reply.requestID,
      deadline  |-> reply.deadline ]


\* TRUE iff index and msg agree on <deadline, clientID, requestID>
IndexEq(index, msg) ==
    << index.deadline, index.clientID, index.requestID >>
        = << msg.deadline, msg.clientID, msg.requestID >>

\* Add local time to the message (for easy debug); fields already present in
\* msg take precedence over the added tl field
Msg2RLog(msg, r) ==
    LET stamp == [ tl |-> vReplicaClock[r] ]
    IN  msg @@ stamp

\* Last entry of logList, or NULLLog when logList is empty
LastLog(logList) ==
    LET n == Len(logList)
    IN  IF n = 0 THEN NULLLog ELSE logList[n]

\* Element-wise maximum of two crash vectors
MergeCrashVector(cv1, cv2) ==
    [ r \in Replicas |-> IF cv1[r] >= cv2[r] THEN cv1[r] ELSE cv2[r] ]

\* Compare the sender's own entry in m.cv with what replica r has recorded for that sender.
\* If the message's value is not smaller, r's crash vector is merged with m.cv;
\* otherwise the formula is FALSE (potential stray message).
CheckCrashVector(m, r) ==
    IF m.cv[m.sender] >= vCrashVector[r][m.sender] THEN
        vCrashVector' = [ vCrashVector EXCEPT ![r] = MergeCrashVector(m.cv, @) ]
    ELSE
        FALSE \* Potential stray message
                               
\* Keep the messages of MSet whose sender's own crash-vector entry is at least
\* the entry that cv holds for that sender
FilterStrayMessage(MSet, cv) ==
    LET IsFresh(msg) == msg.cv[msg.sender] >= cv[msg.sender]
    IN  { msg \in MSet : IsFresh(msg) }
    
    
--------------------------------------------------------------------------------
(* `^\textbf{\large Message Handlers and Actions }^' *)

\* `^\textbf{Client action}^'
\* Client c sends its next request, one copy addressed to every replica.
\* The client clock advances by one in the same step, so a client sends
\* at most one request per tick of time.
\* Once the clock reaches MaxTime (or MaxReqNum requests have been sent),
\* this client cannot send requests any more.
ClientSendRequest(c) ==
    LET
        nextReqID == vClientReqNum[c] + 1
        RequestTo(r) == [ mtype     |-> MClientRequest,
                          sender    |-> c,                \* clientID
                          requestID |-> nextReqID,        \* requestID
                          command   |-> "",
                          s         |-> vClientClock[c],  \* submission time
                          l         |-> LatencyBounds[c], \* latency bound
                          dest      |-> r ]
    IN
        /\ vClientClock[c] < MaxTime
        /\ vClientReqNum[c] < MaxReqNum
        /\ Send({ RequestTo(r) : r \in Replicas })
        /\ vClientClock' = [ vClientClock EXCEPT ![c] = @ + 1 ]
        /\ vClientReqNum' = [ vClientReqNum EXCEPT ![c] = nextReqID ]
        /\ UNCHANGED << replicaVars >>
                                                    

\* TRUE iff logSet already holds an entry with the same clientID and requestID as entry
Duplicate(entry, logSet) ==
    \E x \in logSet : /\ x.clientID = entry.clientID
                      /\ x.requestID = entry.requestID
    
\* Replica r receives MClientRequest, m
\* The request is converted to a log entry (mlog) and placed in exactly one buffer of r:
\*   - the early buffer, when mlog is ordered after r's last log entry;
\*   - otherwise, on the leader, the early buffer with a rewritten deadline;
\*   - otherwise, on a follower, the late buffer.
\* Only the chosen buffer changes; every other variable is left unchanged.
HandleClientRequest(r, m) ==
  LET
    mlog == Req2Log(m)
  IN
  \* If the request is duplicate, it will no longer be appended to the log
  \* Replicas simply reply the previous execution result of this request 
  \* (we do not model execution in this spec)
  /\ ~Duplicate(mlog, Seq2Set(vLog[r]) \union vEarlyBuffer[r] )
  \* Only a replica in normal status handles client requests
  /\ vReplicaStatus[r] = StNormal
     \* The request can enter the early buffer
  /\ \/ /\ EntryLessThan(LastLog(vLog[r]), mlog)
        /\ vEarlyBuffer' = [ 
                vEarlyBuffer EXCEPT ![r] =vEarlyBuffer[r] \cup { mlog } 
           ]
        /\ UNCHANGED   << networkVars, clientVars, 
                       vLog, vViewID, vReplicaClock,
                       vLastNormView, vViewChanges,vReplicaStatus,
                       vSyncPoint,  vLateBuffer, 
                       vTentativeSync, vSyncReps, vCommitPoint,
                       vUUIDCounter, vCrashVector, 
                       vCrashVectorReps, vRecoveryReps >> 
     \* (1) Followers' early buffers do not accept the request 
     \*     if its deadline is smaller than previously appended (last released) entry,
     \*     so followers directly put the request into the late buffer
     \* (2) Leader modifies its deadline to be larger than the last released entry
     \*     so as to make it eligible for entering the early buffer
     \/ /\ EntryLessThan(mlog, LastLog(vLog[r]))
        /\ IF   r = Leader(vViewID[r])  THEN \* this replica is the leader in the current view
                \* Same entry as mlog except that the deadline becomes
                \* (deadline of the last log entry) + 1
                /\ vEarlyBuffer' = [ 
                        vEarlyBuffer EXCEPT ![r] =vEarlyBuffer[r] \cup {[
                            mtype      |-> MLogEntry,
                            clientID   |-> mlog.clientID,
                            requestID  |-> mlog.requestID,
                            deadline   |-> LastLog(vLog[r]).deadline + 1,
                            command    |-> mlog.command
                        ]} 
                   ]
                /\ UNCHANGED   << networkVars, clientVars, 
                                vLog, vViewID, vReplicaClock,
                                vLastNormView, vViewChanges,vReplicaStatus,
                                vSyncPoint, vLateBuffer, 
                                vTentativeSync, vSyncReps, vCommitPoint, 
                                vUUIDCounter, vCrashVector, 
                                vCrashVectorReps, vRecoveryReps >> 
           ELSE \* this replica is a follower in the current view
                \* The original entry (deadline untouched) goes to the late buffer
                /\ vLateBuffer' = [ 
                        vLateBuffer EXCEPT ![r] =vLateBuffer[r] \cup { mlog } 
                   ]

                /\ UNCHANGED   << networkVars, clientVars, 
                               vLog, vEarlyBuffer, vViewID, vReplicaClock,
                               vLastNormView, vViewChanges,vReplicaStatus,
                               vSyncPoint, vTentativeSync, 
                               vSyncReps, vCommitPoint,  
                               vUUIDCounter, vCrashVector, 
                               vCrashVectorReps, vRecoveryReps >> 

                                              
\* Release relevant requests from vEarlyBuffer and append to vLog, 
\* and then send a fast reply
\*   validLogSet : buffered entries whose deadline is strictly below r's clock and
\*                 that are ordered after r's last log entry
\*   validLogs   : those entries as a sorted sequence, appended to vLog[r]
\*   newLogStart : slot number of the first newly appended entry
FlushEarlyBuffer(r) ==
    LET 
       validLogSet == {x \in vEarlyBuffer[r]: 
                         /\ x.deadline < vReplicaClock[r] \* < rather than <= 
                         /\ EntryLessThan(LastLog(vLog[r]), x) }
       validLogs == GetSortLogSeq(validLogSet)
       newLogStart == Len(vLog[r]) + 1
    IN
    /\  vLog' = [vLog EXCEPT ![r] = vLog[r] \o validLogs ]
    \* The buffer keeps only entries whose deadline has not passed yet. Entries with a
    \* passed deadline leave the buffer even when they were not appended above.
    /\  vEarlyBuffer' = [ vEarlyBuffer EXCEPT ![r] 
                        = {x \in vEarlyBuffer[r]: x.deadline >= vReplicaClock[r] } ] \* >= rather than >
    \* One fast reply per newly appended slot i; the hash field carries the log prefix up to i
    \* NOTE(review): cv is the whole vCrashVector function rather than vCrashVector[r] -- confirm intent
    /\  Send({[ mtype       |-> MFastReply,
                sender      |-> r,
                dest        |-> vLog'[r][i].clientID,
                viewID      |-> vViewID[r],
                requestID   |-> vLog'[r][i].requestID,
                hash        |-> [
                                 log |-> SubSeq(vLog'[r], 1, i),
                                 cv  |-> vCrashVector
                                 ],
                deadline    |-> vLog'[r][i].deadline,
                logSlotNum  |-> i
               ] : i \in newLogStart..Len(vLog'[r])})
    \* Only the leader moves its sync point, to the end of its new log
    /\  IF r = Leader(vViewID[r])  THEN 
            /\ vSyncPoint' =  [ vSyncPoint EXCEPT ![r] = Len(vLog'[r]) ]
            /\ UNCHANGED   <<  clientVars, vViewID, vLastNormView, vViewChanges,
                               vReplicaStatus, vReplicaClock, vLateBuffer,
                               vTentativeSync, vSyncReps, vCommitPoint,
                               vUUIDCounter, vCrashVector, 
                               vCrashVectorReps, vRecoveryReps >> 
        ELSE
            UNCHANGED   << clientVars, vViewID, vLastNormView, vViewChanges,
                           vReplicaStatus, vReplicaClock, 
                           vSyncPoint, vLateBuffer, 
                           vTentativeSync, vSyncReps, vCommitPoint,
                           vUUIDCounter, vCrashVector, 
                           vCrashVectorReps, vRecoveryReps  >> 

\* The clock can take a random value (RandomElement(1..MaxTime)),
\* because a clock sync algorithm can give a negative offset, or even fail.
\* Nezha depends on clocks for performance but not for correctness.
\* Once the replica clock has reached MaxTime, it is no longer moved.
\* After the clock has moved, the replica can release relevant requests and append them to its log.
ReplicaClockMove(r) ==
    /\ vReplicaClock' = [ vReplicaClock EXCEPT ![r] =
                            IF @ < MaxTime THEN RandomElement(1..MaxTime) ELSE @ ]
    /\ UNCHANGED << networkVars, clientVars,
                    vLog, vEarlyBuffer, vViewID,
                    vLastNormView, vViewChanges, vReplicaStatus,
                    vSyncPoint, vLateBuffer, vTentativeSync,
                    vSyncReps, vCommitPoint,
                    vUUIDCounter, vCrashVector,
                    vCrashVectorReps, vRecoveryReps >>
\* Client c's clock jumps to a random value in 1..MaxTime unless it has already
\* reached MaxTime; nothing else changes
ClientClockMove(c) ==
    /\ vClientClock' = [ vClientClock EXCEPT ![c] =
                            IF @ < MaxTime THEN RandomElement(1..MaxTime) ELSE @ ]
    /\ UNCHANGED << networkVars, replicaVars, vClientReqNum >>

                      
--------------------------------------------------------------------------------
\* `^\textbf{\large Index Synchronization to Fix Set Inequality}^'

\* Leader replica r starts index synchronization:
\* it sends the indices of all of its log entries to every replica
StartIndexSync(r) ==
  LET
    indices == { GetLogIndex(vLog[r][i]) : i \in 1..Len(vLog[r]) }
    SyncMsg(d) == [ mtype      |-> MIndexSync,
                    sender     |-> r,
                    dest       |-> d,
                    viewID     |-> vViewID[r],
                    logindcies |-> indices ]
  IN
  /\ vReplicaStatus[r] = StNormal
  /\ r = Leader(vViewID[r])
  /\ indices /= {}  \* leader has log entries to sync
  /\ Send({ SyncMsg(d) : d \in Replicas })
  /\ UNCHANGED << clientVars, replicaVars >>

                       
\* The entries of logSeq that match (per EntryEq) some element of indices,
\* returned as a sorted sequence
GetSyncLogs(logSeq, indices) ==
    LET
        IsIndexed(entry) == \E index \in indices : EntryEq(index, entry)
    IN
        GetSortLogSeq({ entry \in Seq2Set(logSeq) : IsIndexed(entry) })

\* The entries of logSeq ordered after lastSyncedLog (per EntryLessThan),
\* returned as a sorted sequence
GetUnSyncLogs(logSeq, lastSyncedLog) ==
    LET
        IsLater(entry) == EntryLessThan(lastSyncedLog, entry)
    IN
        GetSortLogSeq({ entry \in Seq2Set(logSeq) : IsLater(entry) })
        
\* Replica r receives IndexSync message, m
\* Enabled only for a follower in normal status, for a message sent by the leader of
\* r's current view that covers more entries than r has synced so far.
\*   - If r lacks entries named in the message, it asks the other replicas for them
\*     and leaves its log and sync point untouched.
\*   - Otherwise r rebuilds its log as (entries named by the leader) followed by
\*     (its own later entries), moves its sync point to the end of the first part,
\*     and sends a slow reply for every synced slot.
HandleIndexSync(r, m) ==
  /\ r /= Leader(vViewID[r])
  /\ vReplicaStatus[r] = StNormal
  /\ m.viewID = vViewID[r]
  /\ m.sender = Leader(vViewID[r])
  \* The message field is named logindcies (see StartIndexSync) and holds a SET of
  \* indices, so its size is measured with Cardinality rather than Len
  /\ vSyncPoint[r] < Cardinality(m.logindcies)
  /\ LET 
        indices == { GetLogIndex(vLog[r][i]) : i \in 1..Len(vLog[r]) }
        \* Indices the leader has but this replica does not
        missedEntries == m.logindcies \ indices
     IN
        \* Missing some log entries -> Send MMissEntryRequest
        IF Cardinality(missedEntries) > 0 THEN
            /\ Send({[  mtype      |-> MMissEntryRequest,
                        sender     |-> r,
                        dest       |-> d,
                        viewID     |-> vViewID[r],
                        miss       |-> missedEntries ] : d \in (Replicas \ {r} ) })
            /\ UNCHANGED << vLog, vSyncPoint >>
        \* No missing entries, update vLog and vSyncPoint, and send relevant slow replies
        ELSE
            LET 
                \* Synced entries are selected by the LEADER's indices; selecting by the
                \* replica's own indices would mark its whole log as synced
                syncLogs ==  GetSyncLogs(vLog[r], m.logindcies)
                unsyncLogs ==  GetUnSyncLogs(vLog[r], LastLog(syncLogs))
            IN
            /\ vLog' = [ vLog EXCEPT ![r] = syncLogs \o unsyncLogs ]
            /\ vSyncPoint' = [ vSyncPoint EXCEPT ![r] = Len(syncLogs) ]
            /\ Send({[   mtype      |-> MSlowReply,
                         sender     |-> r,
                         dest       |-> vLog'[r][i].clientID,
                         viewID     |-> vViewID[r],
                         requestID  |-> vLog'[r][i].requestID,
                         logSlotNum |-> i ] : i \in (1..Len(syncLogs))})
            
  /\ UNCHANGED << clientVars, vEarlyBuffer, vViewID, vReplicaClock, 
                 vLastNormView, vViewChanges,  vReplicaStatus, 
                 vLateBuffer, vTentativeSync, vSyncReps, vCommitPoint,
                 vUUIDCounter, vCrashVector, 
                 vCrashVectorReps, vRecoveryReps>>


\* The set of entries of the sequence log that match (per IndexEq) some element of indices
FindEntries(log, indices) ==
    LET Wanted(entry) == \E x \in indices : IndexEq(entry, x)
    IN  { entry \in Seq2Set(log) : Wanted(entry) }

\* Replica r receives a request m from another replica asking for missing log entries.
\* If r is in the same view and holds at least one of the requested entries,
\* it sends the entries it found back to the requester.
HandleMissEntryRequest(r, m) ==
  LET
    found == FindEntries(vLog[r], m.miss)
    reply == [ mtype   |-> MMissEntryReply,
               sender  |-> r,
               dest    |-> m.sender,
               viewID  |-> vViewID[r],
               entries |-> found ]
  IN
  /\ m.viewID = vViewID[r]
  /\ found /= {}
  /\ Send({ reply })
  /\ UNCHANGED << clientVars, replicaVars >>
       

\* Replica r receives a reply m from another replica that carries missing entries.
\* If the view matches, the entries are merged into r's log and the log is re-sorted.
HandleMissEntryReply(r, m) ==
    LET
        combined == m.entries \union Seq2Set(vLog[r])
    IN
        /\ m.viewID = vViewID[r]
        /\ vLog' = [ vLog EXCEPT ![r] = GetSortLogSeq(combined) ]
        /\ UNCHANGED << networkVars, clientVars,
                        vEarlyBuffer, vViewID, vReplicaClock,
                        vLastNormView, vViewChanges, vReplicaStatus,
                        vSyncPoint, vLateBuffer,
                        vTentativeSync, vSyncReps, vCommitPoint,
                        vUUIDCounter, vCrashVector,
                        vCrashVectorReps, vRecoveryReps >>

                       
--------------------------------------------------------------------------------
\* `^\textbf{\large Replica Rejoin}^'
\* Replica r fails and loses its state: status becomes StRecovering and every
\* per-replica variable except the clock and the UUID counter is reset for r
StartReplicaFail(r) ==
    LET
        \* f with r's entry replaced by v
        Reset(f, v) == [ f EXCEPT ![r] = v ]
        zeroVector  == [ rr \in Replicas |-> 0 ]
    IN
        \* We assume at most F replicas can fail at the same time
        /\ NumofReplicas(StRecovering) < F
        /\ vReplicaStatus'   = Reset(vReplicaStatus, StRecovering)
        \* views restart from 1
        /\ vViewID'          = Reset(vViewID, 1)
        /\ vLastNormView'    = Reset(vLastNormView, 1)
        \* log and progress points
        /\ vLog'             = Reset(vLog, <<>>)
        /\ vSyncPoint'       = Reset(vSyncPoint, 0)
        /\ vTentativeSync'   = Reset(vTentativeSync, 0)
        /\ vCommitPoint'     = Reset(vCommitPoint, 0)
        \* buffers and collected messages
        /\ vEarlyBuffer'     = Reset(vEarlyBuffer, {})
        /\ vLateBuffer'      = Reset(vLateBuffer, {})
        /\ vViewChanges'     = Reset(vViewChanges, {})
        /\ vSyncReps'        = Reset(vSyncReps, {})
        /\ vCrashVectorReps' = Reset(vCrashVectorReps, {})
        /\ vRecoveryReps'    = Reset(vRecoveryReps, {})
        /\ vCrashVector'     = Reset(vCrashVector, zeroVector)
        /\ UNCHANGED << vReplicaClock, vUUIDCounter, clientVars, networkVars >>


\* A recovering replica r starts recovery: it increments its UUID counter and sends
\* an MCrashVectorReq carrying the new counter value as nonce to every replica
StartReplicaRecovery(r) ==
    LET
        freshNonce == vUUIDCounter[r] + 1
        Request(d) == [ mtype  |-> MCrashVectorReq,
                        sender |-> r,
                        dest   |-> d,
                        nonce  |-> freshNonce ]
    IN
        /\ vReplicaStatus[r] = StRecovering
        /\ vUUIDCounter' = [ vUUIDCounter EXCEPT ![r] = freshNonce ]
        /\ Send({ Request(d) : d \in Replicas })
        /\ UNCHANGED << vLog, vEarlyBuffer, vViewID, vReplicaClock,
                        vLastNormView, vViewChanges, vReplicaStatus,
                        vSyncPoint, vLateBuffer,
                        vTentativeSync, vSyncReps, vCommitPoint,
                        vCrashVector, vCrashVectorReps, vRecoveryReps,
                        clientVars >>
                       
                       
\* A normal replica r answers an MCrashVectorReq m with its own crash vector,
\* echoing the request's nonce
HandleCrashVectorReq(r, m) ==
    LET
        reply == [ mtype  |-> MCrashVectorRep,
                   sender |-> r,
                   dest   |-> m.sender,
                   nonce  |-> m.nonce,
                   cv     |-> vCrashVector[r] ]
    IN
        /\ vReplicaStatus[r] = StNormal
        /\ Send({ reply })
        /\ UNCHANGED << replicaVars, clientVars >>
    

\* Recovering replica r receives an MCrashVectorRep m.
\* The reply is accepted only if its nonce matches r's current UUID counter, r holds
\* at most F replies so far, and DuplicateRep does not flag it. r records the reply and
\* merges m.cv into its crash vector; on reaching F + 1 replies it sends an
\* MRecoveryReq carrying the merged crash vector to every replica.
HandleCrashVectorRep(r, m) ==
    /\ vReplicaStatus[r] = StRecovering
    /\ vUUIDCounter[r] = m.nonce
    /\ Cardinality(vCrashVectorReps[r]) <= F
    \* NOTE(review): HandleRecoveryRep passes m.sender to DuplicateRep while this action
    \* passes m -- confirm which form DuplicateRep expects
    /\ ~DuplicateRep(vCrashVectorReps[r],m)
    /\ vCrashVectorReps' = [ vCrashVectorReps EXCEPT ![r] = vCrashVectorReps[r] \cup {m} ]
    /\ vCrashVector' = [ vCrashVector EXCEPT ![r] = MergeCrashVector(vCrashVector[r], m.cv) ] 
    \* Count r's own reply set: vCrashVectorReps' is a function over replicas,
    \* so the set to measure is vCrashVectorReps'[r]
    /\ IF Cardinality(vCrashVectorReps'[r]) = F + 1 THEN  \* got enough replies and can settle down cv
        Send({[ mtype  |-> MRecoveryReq,
                sender |-> r,
                dest   |-> d,
                nonce  |-> m.nonce,
                cv     |-> vCrashVector'[r] ]: d \in Replicas })
       ELSE
        UNCHANGED << networkVars >>

    /\ UNCHANGED <<vLog, vEarlyBuffer, vViewID, vReplicaClock,
                    vLastNormView, vViewChanges,vReplicaStatus,
                    vSyncPoint, vLateBuffer,
                    vTentativeSync, vSyncReps, vCommitPoint, 
                    vUUIDCounter, vRecoveryReps,
                    clientVars >>


\* A normal replica r receives an MRecoveryReq m.
\* It merges m.cv into its crash vector and answers the requester with its current
\* view ID and the merged crash vector.
HandleRecoveryReq(r, m) == 
    /\ vReplicaStatus[r] = StNormal
    /\ vCrashVector' = [ vCrashVector EXCEPT ![r] = MergeCrashVector(vCrashVector[r], m.cv) ] 
    \* A single reply addressed to the requester; no field depends on a bound
    \* variable, so the set is written as a plain singleton
    /\ Send({[  mtype  |-> MRecoveryRep,
                sender |-> r,
                dest   |-> m.sender,
                viewID |-> vViewID[r],
                cv     |-> vCrashVector'[r] ]})

    /\ UNCHANGED  << vLog, vEarlyBuffer, vViewID, vReplicaClock,
                    vLastNormView, vViewChanges,vReplicaStatus,
                    vSyncPoint, vLateBuffer,
                    vTentativeSync, vSyncReps, vCommitPoint, 
                    vUUIDCounter, vCrashVectorReps, vRecoveryReps,
                    clientVars   >>


\* Recovering replica r receives an MRecoveryRep m.
\* The reply must pass the crash-vector check (which also merges m.cv into r's crash
\* vector). r records it, and once F + 1 replies are held it asks the leader of the
\* highest reported view for a state transfer.
HandleRecoveryRep(r, m) ==
    /\ vReplicaStatus[r] = StRecovering
    /\ Cardinality(vRecoveryReps[r]) <= F
    \* NOTE(review): HandleCrashVectorRep passes m to DuplicateRep while this action
    \* passes m.sender -- confirm which form DuplicateRep expects
    /\ ~DuplicateRep(vRecoveryReps[r], m.sender)
    /\ CheckCrashVector(m, r)
    \* Note: After the crash vector is updated, previously accepted messages may also
    \* become stray messages, so the whole reply set (not only m) is re-filtered
    \* against the updated crash vector.
    /\ vRecoveryReps' = [ vRecoveryReps EXCEPT 
                          ![r] = FilterStrayMessage(vRecoveryReps[r] \cup {m}, vCrashVector'[r] )  ]
           
    \* Count r's own reply set: vRecoveryReps' is a function over replicas,
    \* so the set to measure is vRecoveryReps'[r]
    /\ IF Cardinality(vRecoveryReps'[r]) = F + 1 THEN  \* got enough replies
        LET 
            newView == Max({ mm.viewID : mm \in vRecoveryReps'[r] })
        IN 
            \* Address the request with Leader(...), the same view-to-leader mapping the
            \* other actions use for dest; a single message, hence a plain singleton
            Send({[ mtype  |-> MStateTransferReq,
                    sender |-> r,
                    dest   |-> Leader(newView),
                    cv     |-> vCrashVector'[r] ]})
       ELSE
        UNCHANGED << networkVars >>

    /\ UNCHANGED <<vLog, vEarlyBuffer, vViewID, vReplicaClock,
                    vLastNormView, vViewChanges,vReplicaStatus,
                    vSyncPoint, vLateBuffer,
                    vTentativeSync, vSyncReps, vCommitPoint, 
                    vUUIDCounter, vCrashVectorReps,
                    clientVars >>


\* A normal replica r receives an MStateTransferReq m.
\* After the crash-vector check (which merges m.cv into r's crash vector), r sends the
\* requester its log, sync point, commit point, current view ID and crash vector.
HandleStateTransferReq(r, m) == 
    /\ vReplicaStatus[r] = StNormal
    /\ CheckCrashVector(m, r)
    /\ Send({[  mtype  |-> MStateTransferRep,
                sender |-> r,
                dest   |-> m.sender,
                \* HandleStateTransferRep reads m.viewID, so the reply must carry the view
                viewID |-> vViewID[r],
                log    |-> vLog[r],
                sp     |-> vSyncPoint[r],
                cp     |-> vCommitPoint[r],
                cv     |-> vCrashVector'[r] ]})
    /\ UNCHANGED  << vLog, vEarlyBuffer, vViewID, vReplicaClock,
                    vLastNormView, vViewChanges,vReplicaStatus,
                    vSyncPoint, vLateBuffer,
                    vTentativeSync, vSyncReps, vCommitPoint, 
                    vUUIDCounter, vCrashVectorReps, vRecoveryReps,
                    clientVars >>


\* Recovering replica r receives an MStateTransferRep m.
\* After the crash-vector check, r adopts the log, sync point, commit point and view
\* carried by m, clears its buffers and collected replies, and returns to normal status.
\* No message is sent, so the network is left unchanged.
HandleStateTransferRep(r, m) == 
    /\ vReplicaStatus[r] = StRecovering
    /\ CheckCrashVector(m, r)
    /\ vLog' = [ vLog EXCEPT ![r] = m.log ]
    /\ vSyncPoint' = [ vSyncPoint EXCEPT ![r] = m.sp ]
    /\ vCommitPoint' = [ vCommitPoint EXCEPT ![r] = m.cp ]
    /\ vViewID' = [ vViewID EXCEPT  ![r] = m.viewID ]
    /\ vEarlyBuffer' = [ vEarlyBuffer EXCEPT ![r] = {} ] 
    /\ vLastNormView' = [ vLastNormView EXCEPT ![r] = m.viewID ]
    /\ vViewChanges' = [vViewChanges EXCEPT ![r] = {} ]
    /\ vReplicaStatus' = [ vReplicaStatus EXCEPT ![r] = StNormal ]
    /\ vLateBuffer' = [ vLateBuffer EXCEPT ![r] = {} ]
    /\ vTentativeSync' = [ vTentativeSync EXCEPT ![r] = m.sp ]
    /\ vSyncReps' = [ vSyncReps EXCEPT ![r] = {} ]
    /\ vCrashVectorReps' = [ vCrashVectorReps EXCEPT ![r] = {} ]
    /\ vRecoveryReps' = [ vRecoveryReps EXCEPT ![r]= {} ]
    \* networkVars must be listed too; otherwise the next-state value of the
    \* network is left unspecified by this action
    /\ UNCHANGED  << vReplicaClock, vUUIDCounter, clientVars, networkVars >>


--------------------------------------------------------------------------------
\* `^\textbf{\large Leader Change}^'

\* Replica r starts a leader change: it sends every replica an MViewChangeReq
\* for the view that follows its current one
StartLeaderChange(r) ==
  LET
    proposedView == vViewID[r] + 1
    Request(d)   == [ mtype  |-> MViewChangeReq,
                      sender |-> r,
                      dest   |-> d,
                      viewID |-> proposedView,
                      cv     |-> vCrashVector[r] ]
  IN
  /\ Send({ Request(d) : d \in Replicas })
  /\ UNCHANGED << replicaVars, clientVars >>
  
  
\* `^\textbf{View Change Handlers}^'
\* Replica r gets MViewChangeReq, m
\* If m names a view newer than r's current one and passes the crash-vector check,
\* r enters view-change status in that view, discards the view-change messages it had
\* collected, sends an MViewChange (split into synced and unsynced log parts) to the
\* new leader, and forwards the MViewChangeReq to every replica.
HandleViewChangeReq(r, m) ==
  LET
    currentViewID == vViewID[r]
    newViewID     == Max({currentViewID, m.viewID})
  IN
  \* Recovering replica does not participate in view change
  /\ vReplicaStatus[r] /= StRecovering
  \* newViewID is the larger of the two views, so this holds only when m's view is newer
  /\ currentViewID   /= newViewID
  /\ CheckCrashVector(m, r)
  /\ vReplicaStatus' = [ vReplicaStatus EXCEPT ![r] = StViewChange ]
  /\ vViewID'        = [ vViewID EXCEPT ![r] = newViewID ]
  /\ vViewChanges'   = [ vViewChanges EXCEPT ![r] = {} ]
  /\ Send({[ mtype      |-> MViewChange,
             dest       |-> Leader(newViewID),
             sender     |-> r,
             viewID     |-> newViewID,
             lastNormal |-> vLastNormView[r],
             syncedLog  |-> SubSeq(vLog[r], 1, vSyncPoint[r]),
             unsyncedLog|-> SubSeq(vLog[r], vSyncPoint[r]+1, Len(vLog[r])),
             cv         |-> vCrashVector[r] ]} \cup
           \* Send the MViewChangeReqs in case this is an entirely new view
           {[ mtype  |-> MViewChangeReq,
              sender |-> r,
              dest   |-> d,
              viewID |-> newViewID,
              cv     |-> vCrashVector[r] ] : d \in Replicas})
  /\ UNCHANGED << clientVars, vLog, vEarlyBuffer, vReplicaClock,
                  vLastNormView, vSyncPoint, vLateBuffer, 
                  vTentativeSync, vSyncReps, vCommitPoint,
                  vUUIDCounter, vCrashVectorReps, vRecoveryReps >>

                       
\* Replica r receives MViewChange, m
\* Only the leader of the view, while in view-change status, collects these messages.
\* When the collected messages come from a quorum that includes r itself, r builds the
\* log of the new view and sends MStartView to every replica.
HandleViewChange(r, m) ==
  \* Recovering replica does not participate in view change
  /\ vReplicaStatus[r] /= StRecovering
  /\ vViewID[r]         = m.viewID
  /\ vReplicaStatus[r]  = StViewChange
  \* This replica is the leader
  /\ Leader(vViewID[r]) = r
  /\ CheckCrashVector(m, r)
  \* Record the message. Similar to vRecoveryReps, (potential) stray messages are
  \* filtered out against the crash vector just updated by CheckCrashVector.
  /\ vViewChanges' = [ vViewChanges EXCEPT 
                       ![r] = FilterStrayMessage(vViewChanges[r] \cup {m}, vCrashVector'[r]) ]
  \* If there's enough replies, start the new view
  /\ LET
       isViewPromise(M) == /\ { n.sender : n \in M } \in Quorums
                           /\ \E n \in M : n.sender = r
       vCMs             == { n \in vViewChanges'[r] :
                               /\ n.mtype  = MViewChange
                               /\ n.viewID = vViewID[r] }
       \* Create the state for the new view
       normalViews  == { n.lastNormal : n \in vCMs }
       \* Choose the largest normal view (i.e. the newest)
       lastNormal     == (CHOOSE v \in normalViews : \A v2 \in normalViews : v2 <= v)
       \* For logs before vSyncPoint (i.e. syncedLog), we directly copy from the bestCandidate
       \* For unsyncedLog, we do quorum check to decide which ones should be added to recovery Log
       goodCandidates ==  { o \in vCMs : o.lastNormal = lastNormal }
       \* bestCandidate can only be picked from goodCandidates, 
       \* because previous views may include invalid logs
       bestCandidate  == CHOOSE n \in goodCandidates: 
                            \A y \in goodCandidates: Len(n.syncedLog) >= Len(y.syncedLog)
       unSyncedLogs   == { Seq2Set(n.unsyncedLog) : n \in goodCandidates }

     IN
       IF isViewPromise(vCMs) THEN
         \* sender and cv are included because HandleStartView runs CheckCrashVector
         \* on this message, which reads m.cv[m.sender]
         Send({[ mtype      |-> MStartView,
                 sender     |-> r,
                 dest       |-> d,
                 viewID     |-> vViewID[r],
                 log        |-> bestCandidate.syncedLog 
                                \o MergeUnSyncLogs(unSyncedLogs, LastLog(bestCandidate.syncedLog)),
                 cv         |-> vCrashVector'[r]
               ] : d \in Replicas })
       ELSE
         UNCHANGED networkVars
  /\ UNCHANGED << clientVars,  vLog,  vEarlyBuffer, vViewID, vReplicaClock, 
                  vLastNormView, vReplicaStatus, vSyncPoint, vLateBuffer,
                  vTentativeSync, vSyncReps,vCommitPoint, 
                  vUUIDCounter, vCrashVectorReps, vRecoveryReps >>


\* Replica r receives a MStartView, m
\* Accepted when m's view is newer than r's, or equal while r is still in view change.
\* r adopts m.log as its whole log, returns to normal status in the new view with empty
\* buffers, marks the whole log as synced, and sends a reply for every log slot.
HandleStartView(r, m) ==
  /\ vReplicaStatus[r] /= StRecovering
  /\ \/ vViewID[r]   < m.viewID
     \/ vViewID[r]   = m.viewID /\ vReplicaStatus[r] = StViewChange
  \* CheckCrashVector determines vCrashVector', so vCrashVector must NOT also be
  \* listed under UNCHANGED below
  /\ CheckCrashVector(m, r)
  /\ vLog'           = [ vLog EXCEPT ![r] = m.log ]
  /\ vReplicaStatus' = [ vReplicaStatus EXCEPT ![r] = StNormal ]
  /\ vViewID'        = [ vViewID EXCEPT ![r] = m.viewID ]
  /\ vLastNormView'  = [ vLastNormView EXCEPT ![r] = m.viewID ]
  /\ vEarlyBuffer' = [ vEarlyBuffer EXCEPT ![r] = {} ] \* clear Early Buffer for the new view
  /\ vLateBuffer' = [ vLateBuffer EXCEPT ![r] = {}] \* clear Late Buffer for the new view
  /\ vSyncPoint' = [ vSyncPoint EXCEPT![r] = Len(m.log) ]  
  /\ vTentativeSync' = [ vTentativeSync EXCEPT![r] = Len(m.log) ]
  \* Send replies (in the new view) for all log items
  /\ IF r = Leader(m.viewID) THEN   \* Leader only sends fast reply
        Send({[  mtype      |-> MFastReply,
                 sender     |-> r,
                 dest       |-> m.log[i].clientID,
                 viewID     |-> m.viewID,
                 requestID  |-> m.log[i].requestID,
                 hash       |-> [
                                    log |-> SubSeq(m.log, 1, i),
                                    cv  |-> vCrashVector
                                ],
                 deadline   |-> m.log[i].deadline,
                 logSlotNum |-> i ] : i \in (1..Len(m.log))})
     ELSE \* While starting the view, followers know the log is synced with the leader, so they send slow replies
        Send({[  mtype      |-> MSlowReply,
                 sender     |-> r,
                 dest       |-> m.log[i].clientID,
                 viewID     |-> m.viewID,
                 requestID  |-> m.log[i].requestID,
                 logSlotNum |-> i ] : i \in (1..Len(m.log))})
  /\ UNCHANGED << clientVars, vReplicaClock, vViewChanges, 
                  vSyncReps, vCommitPoint,
                  vUUIDCounter, vCrashVectorReps, vRecoveryReps >> 
                       
--------------------------------------------------------------------------------
\* `^\textbf{\large Periodic Synchronization}^'
\* Leader replica r conducts synchronization periodically.
\* This periodic sync process is different from the index sync process:
\* it ensures that all replicas' logs are stable up to their CommitPoint (for fast recovery).
\* Our CommitPoint is essentially the `^\emph{sync-point}^' defined in the NOPaxos paper.
\* Just as mentioned in the NOPaxos paper, it is an optional optimization for fast recovery;
\* Nezha still works even without this part.
StartSync(r) ==
  LET
    logLen     == Len(vLog[r])
    Prepare(d) == [ mtype  |-> MSyncPrepare,
                    sender |-> r,
                    dest   |-> d,
                    viewID |-> vViewID[r],
                    log    |-> vLog[r] ]
  IN
  /\ vReplicaStatus[r]  = StNormal
  /\ Leader(vViewID[r]) = r
  /\ vTentativeSync[r] < logLen  \* If >= then no need to sync
  \* Start a fresh round: drop earlier replies and target the current end of the log
  /\ vSyncReps'      = [ vSyncReps EXCEPT ![r] = {} ]
  /\ vTentativeSync' = [ vTentativeSync EXCEPT ![r] = logLen ]
  /\ Send({ Prepare(d) : d \in Replicas })
  /\ UNCHANGED << clientVars, vLog, vEarlyBuffer, vViewID, vReplicaClock,
                  vLastNormView, vViewChanges, vReplicaStatus,
                  vSyncPoint, vLateBuffer, vCommitPoint,
                  vUUIDCounter, vCrashVector,
                  vCrashVectorReps, vRecoveryReps >>

                       
\* Replica r receives MSyncPrepare, m
\* If m.log is longer than what r has synced so far, r replaces its synced prefix with
\* m.log (keeping its own later entries), advances its sync point and sends a slow
\* reply for every slot of m.log. In every case it acknowledges with an MSyncRep.
\* All messages of one step are passed to a single Send: two separate Send conjuncts
\* would each constrain the network's next state and contradict one another.
HandleSyncPrepare(r, m) ==
  LET
    newLog == m.log \o GetUnSyncLogs(vLog[r], LastLog(m.log) )
    \* Acknowledgement returned to the leader
    syncRep == [ mtype         |-> MSyncRep,
                 sender        |-> r,
                 dest          |-> m.sender,
                 viewID        |-> vViewID[r],
                 logSlotNumber |-> Len(m.log) ]
    \* One slow reply per slot of the leader's log
    slowReplies == {[   mtype      |-> MSlowReply,
                        sender     |-> r,
                        dest       |-> m.log[i].clientID,
                        viewID     |-> m.viewID,
                        requestID  |-> m.log[i].requestID,
                        logSlotNum |-> i ] : i \in (1..Len(m.log))}
  IN
  /\ vReplicaStatus[r] = StNormal
  /\ m.viewID          = vViewID[r]
  /\ m.sender          = Leader(vViewID[r])
  /\ IF     vSyncPoint[r]  < Len(m.log) THEN
            /\ vSyncPoint' = [vSyncPoint EXCEPT ![r] = Len(m.log)]
            /\ vLog'       = [ vLog EXCEPT ![r] = newLog ]
            /\ Send(slowReplies \cup { syncRep })
     ELSE
            /\ UNCHANGED <<vLog, vSyncPoint >>
            /\ Send({ syncRep })
  /\ UNCHANGED <<clientVars, vEarlyBuffer, vViewID,  vReplicaClock,
                 vLastNormView, vViewChanges, vReplicaStatus, 
                 vLateBuffer, vTentativeSync, vSyncReps, vCommitPoint,
                 vUUIDCounter, vCrashVector, 
                 vCrashVectorReps, vRecoveryReps>>

                       
\* Replica r receives MSyncRep, m
\* r records the reply. Once the replies for the current view and the current tentative
\* sync point come from a quorum that includes r itself, r sends MSyncCommit with the
\* log prefix up to that point to every replica and advances its commit point.
HandleSyncRep(r, m) ==
  LET
    target == vTentativeSync[r]
    \* The senders of M form a quorum and r is one of them
    QuorumWithSelf(M) == /\ { n.sender : n \in M } \in Quorums
                         /\ \E n \in M : n.sender = r
    \* Recorded replies (including m) that match the current view and target
    matching == { n \in vSyncReps'[r] :
                    /\ n.mtype         = MSyncRep
                    /\ n.viewID        = vViewID[r]
                    /\ n.logSlotNumber = target }
    prefix == IF target >= 1 THEN SubSeq(vLog[r], 1, target) ELSE << >>
    Commit(d) == [ mtype  |-> MSyncCommit,
                   sender |-> r,
                   dest   |-> d,
                   viewID |-> vViewID[r],
                   log    |-> prefix ]
  IN
  /\ m.viewID          = vViewID[r]
  /\ vReplicaStatus[r] = StNormal
  /\ vSyncReps'        = [ vSyncReps EXCEPT ![r] = @ \cup { m } ]
  /\ IF QuorumWithSelf(matching) THEN
        /\ Send({ Commit(d) : d \in Replicas })
        /\ vCommitPoint' = [ vCommitPoint EXCEPT ![r] = target ]
     ELSE
        UNCHANGED << networkVars, vCommitPoint >>
  /\ UNCHANGED << clientVars, vLog, vEarlyBuffer, vViewID,
                  vReplicaClock, vLastNormView, vViewChanges,
                  vReplicaStatus, vSyncPoint, vLateBuffer,
                  vTentativeSync, vUUIDCounter, vCrashVector,
                  vCrashVectorReps, vRecoveryReps >>


\* Replica r receives MSyncCommit, m
\* If m.log reaches beyond r's commit point, r replaces its log prefix with m.log
\* (keeping its own later entries), advances its commit point and sends a slow reply
\* for every slot of m.log; otherwise nothing changes.
HandleSyncCommit(r, m) ==
  LET
    newLog == m.log \o GetUnSyncLogs(vLog[r], LastLog(m.log) )
  IN
  /\ vReplicaStatus[r] = StNormal
  /\ m.viewID          = vViewID[r]
  /\ m.sender          = Leader(vViewID[r])
  /\ IF  Len(m.log) <=  vCommitPoint[r] THEN
        \* Nothing to commit: the network is unchanged only in this branch. Listing
        \* networkVars in the common UNCHANGED below would contradict the Send of the
        \* other branch.
        UNCHANGED <<networkVars, vCommitPoint, vLog>>
     ELSE
        /\ vLog'        = [ vLog EXCEPT ![r] = newLog ]
        /\ vCommitPoint'  = [ vCommitPoint  EXCEPT ![r] = Len(m.log) ]
        /\ Send({[ mtype      |-> MSlowReply,
                   sender     |-> r,
                   dest       |-> m.log[i].clientID,
                   viewID     |-> m.viewID,
                   requestID  |-> m.log[i].requestID,
                   logSlotNum |-> i ] : i \in (1..Len(m.log))})
  /\ UNCHANGED << clientVars,  vEarlyBuffer, 
                  vViewID,  vReplicaClock,  vLastNormView, vViewChanges, 
                  vReplicaStatus, vSyncPoint, vLateBuffer, 
                  vTentativeSync, vSyncReps, 
                  vUUIDCounter, vCrashVector, 
                  vCrashVectorReps, vRecoveryReps >>

--------------------------------------------------------------------------------
(* `^\textbf{\large Invariants and Helper Functions}^' *)
    
(*
  A request/log is committed in two possible cases:
  (1) A fast quorum has sent either slow-reply messages, or fast-reply messages with consistent hashes [Fast Path]
  (2) A simple quorum has sent slow-reply messages [Slow Path]
  Both quorums should include the leader
*)

\* Check whether log <clientID, requestID> is committed at position logSlotNum
\* The predicate inspects the reply messages currently in `messages' and is
\* TRUE when some subset M of the replies for <clientID, requestID> at slot
\* logSlotNum forms a valid fast-path or slow-path quorum (see the two
\* disjuncts below). In both cases all replies in M must carry the same
\* view-id and one of them must come from the leader of that view.
Committed(clientID, requestID, logSlotNum) ==
    \* Fast path
    \* Candidate replies: fast-replies or slow-replies for this request/slot
    \/ \E M \in SUBSET ({m \in messages : /\ \/ m.mtype = MFastReply
                                             \/ m.mtype = MSlowReply 
                                          /\ m.logSlotNum = logSlotNum
                                          /\ m.dest = clientID 
                                          /\ m.requestID = requestID }) :
        \* Sent from a fast quorum
        /\ { m.sender : m \in M } \in FastQuorums
        \* Matching view-id
        /\ \E m1 \in M : \A m2 \in M : m1.viewID = m2.viewID
        \* One from the leader
        /\ \E m \in M : m.sender = Leader(m.viewID)
        \* Hash values are consistent
        \* (every fast-reply in M is compared against the leader's reply)
        /\  LET 
                leaderReply == CHOOSE m \in M : m.sender = Leader(m.viewID)
            IN
            \A m1 \in M : IF m1.mtype = MFastReply THEN 
                             m1.hash = leaderReply.hash 
                          ELSE 
                             TRUE  \* SlowReply has consistent hash for sure
    \* Slow path
    \* Candidate replies: slow-replies from any replica, plus the leader's
    \* fast-reply; no hash comparison is performed on this path
    \/ \E M \in SUBSET ({m \in messages : /\ \/ m.mtype = MSlowReply
                                             \/ /\ m.mtype = MFastReply  \* Leader only sends fast-reply
                                                /\ m.sender =Leader(m.viewID)
                                          /\ m.logSlotNum = logSlotNum
                                          /\ m.dest = clientID
                                          /\ m.requestID = requestID }) : 
        /\ { m.sender : m \in M } \in Quorums
        \* Matching view-id
        /\ \E m1 \in M : \A m2 \in M : m1.viewID = m2.viewID
        \* One from the leader
        /\ \E m \in M : m.sender = Leader(m.viewID)
 
 
 \* Check whether log <clientID, requestID> is committed in view viewID
 \* Unlike Committed, the log position is not fixed here: the replies are
 \* filtered by view-id instead of by logSlotNum.
 CommittedInView(clientID, requestID, viewID) ==
    \* Fast path
    \* Candidate replies: fast-replies or slow-replies for this request
    \* that were sent in view viewID
    \/ \E M \in SUBSET ({m \in messages : /\ \/ m.mtype = MFastReply
                                             \/ m.mtype = MSlowReply 
                                          /\ m.dest = clientID
                                          /\ m.requestID = requestID
                                          /\ m.viewID = viewID}) :
        \* Sent from a fast quorum
        /\ { m.sender : m \in M } \in FastQuorums
        \* One from the leader
        /\ \E m \in M : m.sender = Leader(m.viewID)
        \* Hash values are the same
        \* (every fast-reply in M is compared against the leader's reply)
        /\  LET 
                leaderReply == CHOOSE m \in M : m.sender = Leader(m.viewID)
            IN
            \A m1 \in M : IF m1.mtype = MFastReply THEN 
                             m1.hash = leaderReply.hash 
                          ELSE 
                             TRUE  \* SlowReply has consistent hash for sure
    \* Slow path
    \* Candidate replies: slow-replies from any replica, plus the leader's
    \* fast-reply, all sent in view viewID
    \/ \E M \in SUBSET ({m \in messages : /\ \/ m.mtype = MSlowReply
                                             \/ /\ m.mtype = MFastReply  \* Leader only sends fast-reply
                                                /\ m.sender = Leader(m.viewID)
                                          /\ m.dest = clientID
                                          /\ m.requestID = requestID
                                          /\ m.viewID = viewID}) : 
        /\ { m.sender : m \in M } \in Quorums
        \* Hash values are the same
        \* NOTE(review): this reads the hash field of every message in M,
        \* including slow-replies. The MSlowReply records built in
        \* HandleSyncCommit have no hash field, and the slow path of Committed
        \* performs no hash comparison -- confirm this check is intended.
        /\ \E m1 \in M : \A m2 \in M : m1.hash = m2.hash
        \* One from the leader
        /\ \E m \in M : m.sender = Leader(m.viewID)
              
                
\* The system counts as recovered for view viewID when
\*  (1) at least QuorumSize replicas are in normal status and have all
\*      reached viewID (or a later view) as their last normal view, and
\*  (2) the leader of viewID has itself reached viewID or gone beyond it.
SystemRecovered(viewID) ==
    /\ \E RM \in SUBSET Replicas :
          /\ Cardinality(RM) >= QuorumSize
          /\ \A r \in RM :
                /\ vLastNormView[r] >= viewID
                /\ vReplicaStatus[r] = StNormal  \* These replicas must be normal
    /\ vLastNormView[Leader(viewID)] >= viewID

(* `^\textbf{Invariants}^' *)
\* Durability: Committed Requests always survive failure
\* i.e. If a request is committed in one view, then it will remain committed in the higher views
\* One thing to note, the check of "committed" only happens when the system is still "normal"
\* While the system is under recovery (i.e. less than f+1 replicas are normal), 
\* the check of committed does not make sense
Durability ==
    \A v1, v2 \in 1..MaxViews :
        \* The property is only checked for a lower view v1 and a higher view
        \* v2 that the system has actually entered (i.e. recovered into).
        (v1 < v2 /\ SystemRecovered(v2)) =>
            \* Whatever was committed in the lower view v1 must not be
            \* uncommitted in the higher view v2.
            \A c \in Clients :
            \A r \in 1..MaxReqNum :
                CommittedInView(c, r, v1) => CommittedInView(c, r, v2)

\* Consistency: Committed requests have the same history even after view changes
\* i.e. If a request is committed in a lower view (v1), then (based on Durability Property)
\* it remains committed in higher view (v2)
\* Consistency requires the history of the request (i.e. all the request before this request) remain the same                         
\* The invariant is violated when, for some views v1 < v2 (with v2 recovered),
\* some request <c, r> is committed in both views but the leader's fast-reply
\* for it (with deadline t) carries a different hash in v1 than in v2.
Consistency == 
     \A v1, v2 \in 1..MaxViews:   
              ~(/\ v1 < v2
                \* To check Consistency of request in higher views, 
                \* the system should have entered the higher views
                /\ SystemRecovered(v2) 
                /\ \E c \in Clients :
                   \E r \in 1..MaxReqNum:
                   \E t \in 1..MaxTime:
                     \* Durability has been checked in another invariant
                     /\ CommittedInView(c,r, v1)
                     /\ CommittedInView(c,r, v2)
                     \* NOTE(review): each CHOOSE below picks the leader's
                     \* fast-reply with deadline t; for a value of t that no
                     \* such message carries, the CHOOSE has no witness --
                     \* confirm that this cannot occur in the checked model.
                     /\ LET 
                            v1LeaderReply == CHOOSE m \in messages: 
                                                /\ m.mtype = MFastReply
                                                /\ m.deadline = t
                                                /\ m.dest = c 
                                                /\ m.requestID = r
                                                /\ m.viewID = v1
                                                /\ m.sender = Leader(v1) 
                            v2LeaderReply == CHOOSE m \in messages: 
                                                /\ m.mtype = MFastReply
                                                /\ m.deadline = t
                                                /\ m.dest = c 
                                                /\ m.requestID = r
                                                /\ m.viewID = v2
                                                /\ m.sender = Leader(v2)                                                                    
                        IN
                           \* Different hashes mean the request's history changed
                           v1LeaderReply.hash /= v2LeaderReply.hash)  
                            
\* Linearizability: Only one request can be committed for a given position
\* i.e. If one request has committed at position i, then no contrary observation can be made
\* i.e. there cannot be a second request committed at the same position
Linearizability ==
  LET
    \* All fast-replies and slow-replies currently in the network
    replies == { m \in messages : m.mtype \in {MFastReply, MSlowReply} }
    \* Highest log position mentioned by any reply (at least 1)
    maxLogPosition == Max({1} \cup { m.logSlotNum : m \in replies })
  IN
    \* For every pair of distinct requests and every log position that has
    \* been replied for, the two requests are never both committed there.
    \A c1, c2 \in Clients :
    \A r1, r2 \in 1..MaxReqNum :
    \A i \in 1..maxLogPosition :
        (<< c1, r1 >> /= << c2, r2 >>) =>
            ~(Committed(c1, r1, i) /\ Committed(c2, r2, i))

(* `~
SyncSafety == \A r \in Replicas :
              \A i \in 1..vSyncPoint[r] :
              IF SystemRecovered(vViewID[r]) THEN
                \* Committed can only be checked when the system is recovered 
                \* (i.e. when there are f+1 replicas alive)
                Committed(vLog[r][i].ta,vLog[r][i].clientID, vLog[r][i].reqID, i)
              ELSE
                TRUE
 ~'
 *)               
--------------------------------------------------------------------------------
(* `^\textbf{\large Main Transition Function}^' *)

\* Next-state relation: a disjunction with one disjunct per action.
\*  - Message-handling disjuncts pick a message m of the given type that is
\*    not in vReplicaProcessed[m.dest], run the matching handler on
\*    (m.dest, m), add Msg2RLog(m, m.dest) to vReplicaProcessed[m.dest] and
\*    leave vClientProcessed unchanged.
\*  - All other disjuncts (client, failure/recovery, sync, clock actions)
\*    leave both vReplicaProcessed and vClientProcessed unchanged.
\*  - Every disjunct records the action taken in DebugAction.
\* NOTE(review): the guard tests membership of m itself while the set is
\* extended with Msg2RLog(m, m.dest); confirm the two representations agree.
Next == \* Handle Messages
    \/ \E m \in messages : 
                        /\ m.mtype = MClientRequest
                        /\ m \notin vReplicaProcessed[m.dest]
                        /\ HandleClientRequest(m.dest, m)
                        /\ vReplicaProcessed' = 
                            [vReplicaProcessed EXCEPT ![m.dest] = 
                            vReplicaProcessed[m.dest] \cup { Msg2RLog(m, m.dest) } ]
                        /\ UNCHANGED vClientProcessed
                        /\ DebugAction' = << "HandleClientRequest", m >>
                            
    \/ \E m \in messages : 
                        /\ m.mtype = MViewChangeReq
                        /\ m \notin vReplicaProcessed[m.dest]
                        /\ HandleViewChangeReq(m.dest, m)  
                        /\ vReplicaProcessed' = 
                            [vReplicaProcessed EXCEPT ![m.dest] = 
                            vReplicaProcessed[m.dest] \cup { Msg2RLog(m, m.dest) } ]
                        /\ UNCHANGED vClientProcessed
                        /\ DebugAction' = << "HandleViewChangeReq", m >>
                                                                
    \/ \E m \in messages : 
                        /\ m.mtype = MViewChange
                        /\ m \notin vReplicaProcessed[m.dest]
                        /\ HandleViewChange(m.dest, m)
                        /\ vReplicaProcessed' = 
                            [vReplicaProcessed EXCEPT ![m.dest] = 
                            vReplicaProcessed[m.dest] \cup { Msg2RLog(m, m.dest) } ]
                        /\ UNCHANGED vClientProcessed
                        /\ DebugAction' = << "HandleViewChange", m >>
                            
    \/ \E m \in messages : 
                        /\ m.mtype = MStartView
                        /\ m \notin vReplicaProcessed[m.dest]
                        /\ HandleStartView(m.dest, m)
                        /\ vReplicaProcessed' = 
                            [vReplicaProcessed EXCEPT ![m.dest] = 
                            vReplicaProcessed[m.dest] \cup { Msg2RLog(m, m.dest) } ]
                        /\ UNCHANGED vClientProcessed
                        /\ DebugAction' = << "HandleStartView", m >>
    
    \/ \E m \in messages : 
                        /\ m.mtype = MSyncPrepare
                        /\ m \notin vReplicaProcessed[m.dest]
                        /\ HandleSyncPrepare(m.dest, m)
                        /\ vReplicaProcessed' = 
                            [vReplicaProcessed EXCEPT ![m.dest] =
                                vReplicaProcessed[m.dest] \cup { Msg2RLog(m, m.dest) } ]
                        /\ UNCHANGED vClientProcessed
                        /\ DebugAction' = << "HandleSyncPrepare", m >>
                            
    \/ \E m \in messages :
                        /\ m.mtype = MSyncRep
                        /\ m \notin vReplicaProcessed[m.dest]
                        /\ HandleSyncRep(m.dest, m)
                        /\ vReplicaProcessed' = 
                            [vReplicaProcessed EXCEPT ![m.dest] = 
                            vReplicaProcessed[m.dest] \cup { Msg2RLog(m, m.dest) } ]
                        /\ UNCHANGED vClientProcessed
                        /\ DebugAction' = << "HandleSyncRep", m >>
    \/ \E m \in messages :
                        /\ m.mtype = MSyncCommit
                        /\ m \notin vReplicaProcessed[m.dest]
                        /\ HandleSyncCommit(m.dest, m)
                        /\ vReplicaProcessed' = 
                            [vReplicaProcessed EXCEPT ![m.dest] = 
                            vReplicaProcessed[m.dest] \cup { Msg2RLog(m, m.dest) } ]
                        /\ UNCHANGED vClientProcessed
                        /\ DebugAction' = << "HandleSyncCommit", m >>
                            
    \/ \E m \in messages:
                        /\ m.mtype = MMissEntryRequest
                        /\ m \notin vReplicaProcessed[m.dest]
                        /\ HandleMissEntryRequest(m.dest, m)
                        /\ vReplicaProcessed' = 
                            [vReplicaProcessed EXCEPT ![m.dest] = 
                             vReplicaProcessed[m.dest] \cup { Msg2RLog(m, m.dest) } ]
                        /\ UNCHANGED vClientProcessed
                        /\ DebugAction' = << "HandleMissEntryRequest", m >>
                
    \/ \E m \in messages:
                        /\ m.mtype = MMissEntryReply
                        /\ m \notin vReplicaProcessed[m.dest]
                        /\ HandleMissEntryReply(m.dest, m)
                        /\ vReplicaProcessed' = 
                            [vReplicaProcessed EXCEPT ![m.dest] = 
                             vReplicaProcessed[m.dest] \cup { Msg2RLog(m, m.dest) } ]
                        /\ UNCHANGED vClientProcessed
                        /\ DebugAction' = << "HandleMissEntryReply", m >>
                          
    \* Client Actions
    \* (a client may only send while it has issued fewer than MaxReqNum requests)
    \/ \E c \in Clients :  
                        /\ vClientReqNum[c] < MaxReqNum
                        /\ ClientSendRequest(c)
                        /\ UNCHANGED << vReplicaProcessed, vClientProcessed >>
                        /\ DebugAction' = << "ClientSendRequest", "" >>                              
                                                                
    \* Start Synchronization
    \/ \E r \in Replicas :  
                        /\ StartSync(r)
                        /\ UNCHANGED << vReplicaProcessed, vClientProcessed >>
                        /\ DebugAction' = << "StartSync", "" >>                
    
    \* Replica Fail
    \* (only a replica in normal status can fail)
    \/ \E r \in Replicas :
                        /\ vReplicaStatus[r] = StNormal
                        /\ StartReplicaFail(r)
                        /\ UNCHANGED << vReplicaProcessed, vClientProcessed >>
                        /\ DebugAction' = << "StartReplicaFail", "" >>
    
    
    \* Leader Change
    \* (bounded: only while the replica's view is below MaxViews)
    \/ \E r \in Replicas : 
                        /\ vViewID[r] < MaxViews
                        /\ StartLeaderChange(r)
                        /\ UNCHANGED << vReplicaProcessed, vClientProcessed >>
                        /\ DebugAction' = << "StartLeaderChange", "" >>
                        
    \* Replica Rejoin                    
    \/ \E r \in Replicas :
                        /\ vReplicaStatus[r] = StRecovering
                        /\ StartReplicaRecovery(r)
                        /\ UNCHANGED << vReplicaProcessed, vClientProcessed >>
                        /\ DebugAction' = << "StartReplicaRecovery", "" >>
    
    \* Replica Actions:
    \/ \E r \in Replicas:
                        /\ StartIndexSync(r)
                        /\ UNCHANGED << vReplicaProcessed, vClientProcessed >>
                        /\ DebugAction' = << "StartIndexSync", "" >>
                        
    \* Note: the debug label below reads "FlushReplicaBuffer" although the
    \* action being taken is FlushEarlyBuffer.
    \/ \E r \in Replicas:
                    /\ FlushEarlyBuffer(r)
                    /\ UNCHANGED << vReplicaProcessed, vClientProcessed >>
                    /\ DebugAction' = << "FlushReplicaBuffer", "" >>
    \* Clock Move
    \/ \E r \in Replicas : 
                        /\ ReplicaClockMove(r)                             
                        /\ UNCHANGED << vReplicaProcessed, vClientProcessed >>
                        /\ DebugAction' = << "ReplicaClockMove", "" >>
    
    \/ \E c \in Clients  : 
                        /\ ClientClockMove(c)
                        /\ UNCHANGED << vReplicaProcessed, vClientProcessed >>                          
                        /\ DebugAction' = << "ClientClockMove", "" >>

                    
================================================================================


================================================
FILE: docs/demo.md
================================================

## One-Box Demo
We have prepared the configuration files in the ```configs``` folder. These configuration files are used to launch 3 replicas, 1 proxy and 1 client. Under the ```configs``` folder, the ```local``` folder (for the single-machine test) contains: 

- nezha-replica-config-0.yaml 
- nezha-replica-config-1.yaml
- nezha-replica-config-2.yaml
- nezha-proxy-config.yaml
- nezha-client-config.yaml

When running distributed tests, the user can refer to the template files (e.g., ```configs/nezha-replica-config-template.yaml```) to generate their customized config files (such as configuring the IP addresses in the config files). 

Before running the experiment, we assume the user has generated and copied their configuration files into the ```$HOME/Nezha/configs``` folder.

### View Change Test
**Step 1**: Launch 3 replicas (i.e. replica-0, replica-1, replica-2). Open 3 terminals and launch one replica in each terminal.

```
# In the first terminal (replica-0)
$HOME/Nezha/bazel-bin/replica/nezha_replica --config $HOME/Nezha/configs/nezha-replica-config-0.yaml

# In the second terminal (replica-1)
$HOME/Nezha/bazel-bin/replica/nezha_replica --config $HOME/Nezha/configs/nezha-replica-config-1.yaml

# In the third terminal (replica-2)
$HOME/Nezha/bazel-bin/replica/nezha_replica --config $HOME/Nezha/configs/nezha-replica-config-2.yaml

```

![Step-1](figs/nezha-vr-test-figs/step-1.png)

**Step 2**: After the three replicas are launched, we can see important information displayed in the console logs, e.g. the current view, the replica id of this replica, the number of replicas, and the number of keys maintained by each replica's state machine (for commutativity optimization).
![Step-2](figs/nezha-vr-test-figs/step-2.png)

**Step 3**: In view 0, the leader replica is ```viewId%replicaNum=0```, i.e. replica-0. Therefore, killing replica-0 triggers a view change, so we use Ctrl+C to kill replica-0.
![Step-3](figs/nezha-vr-test-figs/step-3.png)


**Step 4**: After the leader is killed, the remaining 2 replicas start a view change to enter a new view, i.e., view 1. In this new view, the leader becomes ```viewId%replicaNum=1```, i.e., replica-1. Since a majority of replicas (i.e., 2 replicas) is still alive, the system can resume service.
![Step-4](figs/nezha-vr-test-figs/step-4.png)

**Step 5**: We want the failed replica to rejoin the system. Therefore, we launch replica-0. This time, we set the flag ```isRecovering``` as true, so that it goes through the recovery procedure and retrieves the state from the other healthy replicas.
```
# In the first terminal 
$HOME/Nezha/bazel-bin/replica/nezha_replica --config $HOME/Nezha/configs/nezha-replica-config-0.yaml --isRecovering true
```

![Step-5](figs/nezha-vr-test-figs/step-5.png)

**Step 6**: We can see that replica-0 rejoins the system as a follower, and the current view is 1.
![Step-6](figs/nezha-vr-test-figs/step-6.png)


The test process can be repeated. As long as a majority of replicas (f+1) remains alive, the system is able to serve clients and failed replicas can rejoin. 

### Test with Client

**Step 0**: Kill all the processes launched in the previous section.

**Step 1**: Similar to the previous section, we launch 3 replicas. More than that, this time we also launch 1 proxy and 1 client. In the client configuration file (i.e. [nezha-client-config.yaml](configs/nezha-client-config.yaml) ), we have specified the client as an open-loop client, and it will submit at about 1000 requests/second. This time we need to open 5 terminals in total.
```
is-openloop: true
poisson-rate: 10
```

```
# In the first terminal (replica-0)
$HOME/Nezha/bazel-bin/replica/nezha_replica --config $HOME/Nezha/configs/nezha-replica-config-0.yaml

# In the second terminal (replica-1)
$HOME/Nezha/bazel-bin/replica/nezha_replica --config $HOME/Nezha/configs/nezha-replica-config-1.yaml

# In the third terminal (replica-2)
$HOME/Nezha/bazel-bin/replica/nezha_replica --config $HOME/Nezha/configs/nezha-replica-config-2.yaml

# In the fourth terminal (proxy)
$HOME/Nezha/bazel-bin/proxy/nezha_proxy --config $HOME/Nezha/configs/local/nezha-proxy-config.yaml

# In the fifth terminal (client-1)
$HOME/Nezha/bazel-bin/client/nezha_client  --config $HOME/Nezha/configs/nezha-client-config.yaml

```


![Step-1](figs/nezha-test-with-client/step-1.png)


**Step 2**: After the client is launched, we can see that it keeps submitting requests and the proxy keeps forwarding requests for the client. Every 5 seconds, the client terminal prints a log line showing the stats.
![Step-2](figs/nezha-test-with-client/step-2.png)

**Step 3**: While the client is submitting requests, we kill the leader (i.e. replica-0). We can see that the remaining 2 replicas rapidly complete the view change and elect the new leader, which takes about ```1657418951138477-1657418950947251=191226us=191ms```. The view change completes this quickly because of the periodic synchronization optimization (explained in our paper): the new leader replica does not need to do state transfer from scratch; it only needs to do state transfer and log merge from the last commit point. 
![Step-3](figs/nezha-test-with-client/step-3.png)


**Step 4**: We want the crashed replica (i.e. replica-0) to rejoin the system. So we set ```isRecovering``` flag as true. 

```
# In the first terminal
$HOME/Nezha/bazel-bin/replica/nezha_replica --config $HOME/Nezha/configs/nezha-replica-config-0.yaml --isRecovering true
```

![Step-4](figs/nezha-test-with-client/step-4.png)


**Step 5**: The crashed replica starts from an empty state, so it needs to retrieve all the log entries in order to recover. Since we are using UDP and by default only fetch 5 entries during each round, the state transfer can take some time if clients have submitted many entries. As shown in the terminal of replica-0, we also print the progress of the recovery. Note that the follower's recovery does not block the other healthy replicas from serving the client. An optional optimization under consideration is to periodically generate a snapshot and dump it to stable storage. Then, when a crashed replica wants to recover, it first loads the state from local storage and only afterwards does state transfer, which shortens the recovery time. 
![Step-5](figs/nezha-test-with-client/step-5.png)


**Step 6**: After replica-0 retrieves all the state, we can see that it successfully recovers and works as a follower.
![Step-6](figs/nezha-test-with-client/step-6.png)


================================================
FILE: docs/tla-intro.md
================================================
# Nezha TLA+

This repository includes a model-checked TLA+ specification (both the source file and the pdf version) for the Nezha protocol. In addition, we include a document that explains Nezha's recovery in pseudo-code. 


================================================
FILE: external/gogoprotobuf.BUILD
================================================
# Every target declared in this package is publicly visible.
package(default_visibility = ["//visibility:public"])

# Proto library for gogoproto/gogo.proto; it depends on protobuf's
# descriptor proto.
proto_library(
    name = "gogo_proto",
    srcs = ["gogoproto/gogo.proto"],
    deps = ["@com_google_protobuf//:descriptor_proto"],
)

================================================
FILE: external/googleapi.BUILD
================================================
# Every target declared in this package is publicly visible.
package(default_visibility = ["//visibility:public"])

# Proto library for google/api/annotations.proto; it depends on the local
# http_proto target and on protobuf's descriptor proto.
proto_library(
    name = "annotations_proto",
    srcs = ["google/api/annotations.proto"],
    deps = [
        ":http_proto",
        "@com_google_protobuf//:descriptor_proto",
    ],
)

# Proto library for google/api/http.proto (no further dependencies).
proto_library(
    name = "http_proto",
    srcs = ["google/api/http.proto"],
)

================================================
FILE: lib/BUILD
================================================
# Bazel targets for the shared library code under lib/.
# All targets are publicly visible.
# NOTE(review): proto_library is loaded here but no target in this file uses it.
load("@rules_proto//proto:defs.bzl", "proto_library")

# Header-only target: zipfian.h is listed as both source and public header.
cc_library(
    name = "zipfian",
    srcs = ["zipfian.h"],
    hdrs = ["zipfian.h"],
    visibility = ["//visibility:public"],
)


# Header-only target exposing common_type.h.
cc_library(
    name = "common_type",
    srcs = ["common_type.h"],
    hdrs = ["common_type.h"],
    visibility = ["//visibility:public"],
)


# Header-only target exposing common_struct.h; builds on :common_type.
cc_library(
    name = "common_struct",
    srcs = ["common_struct.h"],
    hdrs = ["common_struct.h"],
    visibility = ["//visibility:public"],
    deps = [
        ":common_type",
    ],
)


# Address class (address.h / address.cc); no dependencies on other targets.
cc_library(
    name = "address",
    srcs = ["address.cc"],
    hdrs = ["address.h"],
    visibility = ["//visibility:public"],
)

# Header-only target exposing message_handler.h.
cc_library(
    name = "message_handler",
    srcs = ["message_handler.h"],
    hdrs = ["message_handler.h"],
    visibility = ["//visibility:public"],
    deps = [
        ":address",
        ":common_type",
    ],
)

# Header-only target exposing timer.h.
cc_library(
    name = "timer",
    srcs = ["timer.h"],
    hdrs = ["timer.h"],
    visibility = ["//visibility:public"],
    deps = [
        ":address",
        ":common_type",
    ],
)

# Endpoint (endpoint.h / endpoint.cc); pulls in libev, glog, protobuf and
# openssl in addition to the local address/struct/handler/timer targets.
cc_library(
    name = "endpoint",
    srcs = ["endpoint.cc"],
    hdrs = ["endpoint.h"],
    visibility = ["//visibility:public"],
    deps = [
        ":address",
        ":common_struct",
        ":message_handler",
        ":timer",
        "@com_github_enki_libev//:libev",
        "@com_github_google_glog//:glog",
        "@com_google_protobuf//:protobuf",
        "@openssl//:openssl",
    ],
)


# UDP socket endpoint (udp_socket_endpoint.h / .cc); depends on :endpoint.
cc_library(
    name = "udp_socket_endpoint",
    srcs = ["udp_socket_endpoint.cc"],
    hdrs = ["udp_socket_endpoint.h"],
    visibility = ["//visibility:public"],
    deps = [
        ":address",
        ":endpoint",
        "@com_github_enki_libev//:libev",
        "@com_google_protobuf//:protobuf",
        "@openssl//:openssl",
    ],
)


# Utilities (utils.h / utils.cc); depends on :udp_socket_endpoint plus the
# concurrentqueue, junction, gflags, glog and openssl external targets.
cc_library(
    name = "utils",
    srcs = ["utils.cc"],
    hdrs = ["utils.h"],
    deps = [
        ":udp_socket_endpoint",
        "@com_github_cameron314_concurrentqueue//:concurrentqueue",
        "@com_github_preshing_junction//:libjunction",
        "@com_github_gflags_gflags//:gflags",
        "@com_github_google_glog//:glog",
        "@openssl//:openssl",
    ],
    visibility = ["//visibility:public"],
)


================================================
FILE: lib/Rules.mk
================================================
# Makefile fragment for lib/.
# d is the directory of the makefile currently being read (the last entry of
# MAKEFILE_LIST), i.e. the directory containing this fragment.
d := $(dir $(lastword $(MAKEFILE_LIST)))

# Register this directory's sources, each prefixed with the directory path.
SRCS += $(addprefix $(d), \
	address.cc utils.cc udp_socket_endpoint.cc)

# Object lists per library. $(o) is defined outside this fragment --
# presumably the object output prefix; confirm against the including makefile.
LIB-address :=  $(o)address.o

LIB-utils := $(o)utils.o

# The UDP socket library bundles its own object with the address and utils objects.
LIB-udp-socket := $(o)udp_socket_endpoint.o $(LIB-address) $(LIB-utils)


# Prints the resolved object list every time this fragment is parsed.
$(info LIB-udp-socket is $(LIB-udp-socket)) 

# include $(d)tests/Rules.mk

================================================
FILE: lib/address.cc
================================================
#include "lib/address.h"

/**
 * Builds an empty address: empty ip and mac strings, port -1, and a fully
 * zeroed sockaddr_in.
 */
Address::Address() : ip_(""), port_(-1), mac_("") {
  // memset is the standard replacement for the legacy bzero().
  std::memset(&addr_, 0, sizeof(addr_));
}
/**
 * Builds an IPv4 address from a dotted-quad string and a host-order port.
 *
 * @param ip   IPv4 address in text form, parsed with inet_addr()
 * @param port port number in host byte order
 * @param mac  MAC address text, stored as-is
 */
Address::Address(const std::string& ip, const int port, const std::string& mac)
    : ip_(ip), port_(port), mac_(mac) {
  // memset is the standard replacement for the legacy bzero().
  std::memset(&addr_, 0, sizeof(addr_));
  addr_.sin_family = AF_INET;
  // The socket address stores the port in network byte order.
  addr_.sin_port = htons(port);
  // NOTE(review): the result of inet_addr() is stored unchecked; a malformed
  // ip string is not reported to the caller.
  addr_.sin_addr.s_addr = inet_addr(ip.c_str());
}
// Nothing to release explicitly; members clean up after themselves.
Address::~Address() = default;

/**
 * Refreshes ip_ from the binary address stored in addr_ and returns it.
 *
 * @return the IPv4 address in dotted-quad text form
 */
std::string Address::GetIPAsString() {
  // inet_ntoa() returns a pointer to a shared static buffer, so concurrent
  // calls from different threads can overwrite each other's result.
  // inet_ntop() writes into a caller-provided buffer instead.
  char text[INET_ADDRSTRLEN] = {0};
  const char* converted =
      inet_ntop(AF_INET, &addr_.sin_addr, text, sizeof(text));
  ip_ = (converted != nullptr) ? converted : "";
  return ip_;
}

/**
 * Refreshes port_ from the port stored in addr_ and returns it.
 *
 * @return the port number in host byte order
 */
int Address::GetPortAsInt() {
  // sin_port is kept in network byte order, so the conversion needed here is
  // network-to-host (ntohs), not host-to-network.
  port_ = ntohs(addr_.sin_port);
  return port_;
}

================================================
FILE: lib/address.h
================================================
#ifndef NEZHA_ADDRESS
#define NEZHA_ADDRESS
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <unistd.h>
#include <cstring>
#include <string>

#define UDP_BUFFER_SIZE (512)

/**
 * The address of an endpoint is encapsulated in the Address class.
 * For now it mainly holds socket-based information, such as ip and port, but
 * we reserve the possibility of extending it to support other communication
 * primitives, such as DPDK.
 */

class Address {
 public:
  std::string ip_;           // IP address in text form
  int port_;                 // Port number
  std::string mac_;          // For future extension (DPDK)
  struct sockaddr_in addr_;  // Socket-level representation of ip/port

  /** Default constructor (defined in address.cc). */
  Address();

  /** Copy constructor: duplicates every field, including the raw sockaddr. */
  Address(const Address& addr)
      : ip_(addr.ip_),
        port_(addr.port_),
        mac_(addr.mac_),
        addr_(addr.addr_) {}

  /** Builds an address from ip text, port and an optional mac string. */
  Address(const std::string& ip, const int port, const std::string& mac = "");

  ~Address();

  /** Returns the IP address as a string. */
  std::string GetIPAsString();

  /** Returns the port as an int. */
  int GetPortAsInt();
};

#endif

================================================
FILE: lib/common_struct.h
================================================


#ifndef NEZHA_COMMON_STRUCT_H
#define NEZHA_COMMON_STRUCT_H
#include <openssl/sha.h>
#include <stdio.h>
#include <stdlib.h>
#include <cstring>
#include <string>
#include <vector>
#include "lib/common_type.h"

/**
 * Nezha relies on proto messages to communicate.
 * When the proto message has been serialized and is about to be sent by the
 * endpoint, MessageHeader is prepended to the head of the proto message (refer
 * to SendMsgTo in udp_socket_endpoint.h), which describes the type of proto
 * message and its length. In this way, when the receiver endpoint receives the
 * message, it can know the type and length of the proto message, then it can
 * choose the proper way to deserialize it.
 */
struct MessageHeader {
  char msgType;     // Type tag of the proto message that follows the header
  uint32_t msgLen;  // Length of the proto message that follows the header

  /** Builds a header from a message type tag and a message length. */
  MessageHeader(const char type, const uint32_t length)
      : msgType(type), msgLen(length) {}
};

/**
 * SHA_HASH is included in the FastReply message to represent the replica state
 * of replica. More details at Sec 5.2 of our paper
 * https://arxiv.org/pdf/2206.03285.pdf
 */
union SHA_HASH {
  uint32_t item[5];                       // The digest viewed as five words
  unsigned char hash[SHA_DIGEST_LENGTH];  // The digest viewed as raw bytes

  /** Builds an all-zero hash. */
  SHA_HASH() { memset(item, 0, sizeof(item)); }

  /**
   * Builds a hash from the first bytes of str.
   *
   * At most SHA_DIGEST_LENGTH bytes are copied. When len is shorter than
   * that, the remaining bytes are zero, so the value is deterministic.
   *
   * @param str source buffer holding at least len bytes
   * @param len number of valid bytes in str
   */
  SHA_HASH(const char* str, const uint32_t len) {
    // Zero everything first: a short input would otherwise leave the tail
    // of the digest uninitialised.
    memset(item, 0, sizeof(item));
    const uint32_t copyLen =
        (len >= SHA_DIGEST_LENGTH) ? SHA_DIGEST_LENGTH : len;
    if (str != nullptr && copyLen > 0) {
      memcpy(hash, str, copyLen);
    }
  }

  /** Copies all five words from h. */
  SHA_HASH(const SHA_HASH& h) { memcpy(item, h.item, sizeof(item)); }

  /** Assigns all five words from sh. */
  SHA_HASH& operator=(const SHA_HASH& sh) {
    // memcpy must not be given overlapping ranges, so skip self-assignment.
    if (this != &sh) {
      memcpy(item, sh.item, sizeof(item));
    }
    return *this;
  }

  /** XORs h into this hash, word by word. */
  void XOR(const SHA_HASH& h) {
    for (int i = 0; i < 5; i++) {
      item[i] ^= h.item[i];
    }
  }

  /** Renders the five words as decimal numbers joined by '-'. */
  std::string toString() const {
    std::string text = std::to_string(item[0]);
    for (int i = 1; i < 5; i++) {
      text += "-" + std::to_string(item[i]);
    }
    return text;
  }
};

/** When a request is received by the replica, it is first converted to a
 * RequestBody, which includes all the useful information of the request */
struct RequestBody {
  uint64_t deadline;
  uint64_t reqKey;  // reqKey uniquely identifies the request on this replica,
                    // it is the concatenation of the clientId and reqId. With
                    // reqKey, the replica can easily check whether this
                    // request has been previously received or not.
  uint32_t opKey;   // opKey indicates which key the request is operating on (
                    // imagine we are working on a database system and different
                    // requests will read/write different keys). opKey is
                    // important for commutativity optimization.
  uint64_t proxyId;     // proxyId indicates which proxy delivers the request to
                        // the replica, and later replicas will send the
                        // corresponding reply to the proxy.
  std::string command;  // command is the content to execute
  bool isWrite;

  // Give every scalar a defined value; a default-constructed body used to
  // carry indeterminate fields that were UB to read or compare.
  RequestBody()
      : deadline(0), reqKey(0), opKey(0), proxyId(0), isWrite(false) {}
  RequestBody(const uint64_t d, const uint64_t r, const uint32_t ok,
              const uint64_t p, const std::string& cmd, const bool isw)
      : deadline(d),
        reqKey(r),
        opKey(ok),
        proxyId(p),
        command(cmd),
        isWrite(isw) {}

  /** The following methods compare requests to decide their order: requests
   * are ordered by deadline first and by reqKey on equal deadlines. They do
   * not modify the object, hence they are const. */
  bool LessThan(const RequestBody& bigger) const {
    return (deadline < bigger.deadline ||
            (deadline == bigger.deadline && reqKey < bigger.reqKey));
  }
  /** Same as above, with bigger given as a <deadline, reqKey> pair. */
  bool LessThan(const std::pair<uint64_t, uint64_t>& bigger) const {
    return (deadline < bigger.first ||
            (deadline == bigger.first && reqKey < bigger.second));
  }
  bool LessOrEqual(const RequestBody& bigger) const {
    return (deadline < bigger.deadline ||
            (deadline == bigger.deadline && reqKey <= bigger.reqKey));
  }
  /** Same as above, with bigger given as a <deadline, reqKey> pair. */
  bool LessOrEqual(const std::pair<uint64_t, uint64_t>& bigger) const {
    return (deadline < bigger.first ||
            (deadline == bigger.first && reqKey <= bigger.second));
  }
};

/**
 * After RequestBody is processed and eventually replied, it will be converted
 * into a LogEntry, and stored in the replica.
 * LogEntry, compares with RequestBody, includes more information
 */
struct LogEntry {
  // Request Body
  RequestBody body;
  SHA_HASH entryHash;  // The hash value of this **single** entry
  SHA_HASH logHash;  // The accumulative hash, which is calculated based on all
                     // the log entries from the beginning to this entry

  /** prevNonCommutative and nextNonCommutative organize the LogEntries as a
   * skiplist, and easier and more efficient to traverse/modify/delete */
  LogEntry* prevNonCommutative;  // The previous non-commutative entry
  LogEntry* nextNonCommutative;  // The next non-commutative entry

  LogEntry* prevNonCommutativeWrite;  // The entry's prevNonCommutative may be a
                                      // write, or may be a read
  // But only the prevNonCommutativeWrite is used to calculate the incremental
  // hash, see Sec 8.2 of Nezha's Technical Report
  LogEntry* nextNonCommutativeWrite;

  /** prev and next organizes the LogEntries as a link list, and easier to
   * traverse/modify/delete */

  LogEntry* prev;  // The previous LogEntry pointer
  LogEntry* next;  // The next LogEntry pointer

  std::string result;  // The execution result of the LogEntry
  char status;         //
  uint32_t logId;  // The logId (the position of the LogEntry in the list) of
                   // the entry

  LogEntry()
      : prevNonCommutative(NULL),
        nextNonCommutative(NULL),
        prevNonCommutativeWrite(NULL),
        nextNonCommutativeWrite(NULL),
        prev(NULL),
        next(NULL),
        result(""),
        status(EntryStatus::INITIAL),
        logId(0) {}
  LogEntry(const RequestBody& rb, const SHA_HASH& eh, const SHA_HASH& h,
           LogEntry* prevNonComm = NULL, LogEntry* nextNonComm = NULL,
           LogEntry* preNonCOmmW = NULL, LogEntry* nextNonCommW = NULL,
           LogEntry* pre = NULL, LogEntry* nxt = NULL,
           const std::string& re = "", const char sts = EntryStatus::INITIAL,
           const uint32_t lid = 0)
      : body(rb),
        entryHash(eh),
        logHash(h),
        prevNonCommutative(prevNonComm),
        nextNonCommutative(nextNonComm),
        prevNonCommutativeWrite(preNonCOmmW),
        nextNonCommutativeWrite(nextNonCommW),
        prev(pre),
        next(nxt),
        result(re),
        status(sts),
        logId(lid) {}
  LogEntry(const uint64_t d, const uint64_t r, const uint32_t ok,
           const uint64_t p, const std::string& cmd, const bool& isw,
           const SHA_HASH& eh, const SHA_HASH& h, LogEntry* prevNonComm = NULL,
           LogEntry* nextNonComm = NULL, LogEntry* preNonCOmmW = NULL,
           LogEntry* nextNonCommW = NULL, LogEntry* pre = NULL,
           LogEntry* nxt = NULL, const std::string& re = "",
           const char sts = EntryStatus::INITIAL, const uint32_t lid = 0)
      : body(d, r, ok, p, cmd, isw),
        entryHash(eh),
        logHash(h),
        prevNonCommutative(prevNonComm),
        nextNonCommutative(nextNonComm),
        prevNonCommutativeWrite(preNonCOmmW),
        nextNonCommutativeWrite(nextNonCommW),
        prev(pre),
        next(nxt),
        result(re),
        status(sts),
        logId(lid) {}

  bool LessThan(const LogEntry& bigger) { return body.LessThan(bigger.body); }
  bool LessThan(const std::pair<uint64_t, uint64_t>& bigger) {
    return body.LessThan(bigger);
  }
  bool LessOrEqual(const LogEntry& bigger) {
    return body.LessOrEqual(bigger.body);
  }
  bool LessOrEqual(const std::pair<uint64_t, uint64_t>& bigger) {
    return body.LessOrEqual(bigger);
  }
};

/**
 * CrashVectorStruct is necessary for Nezha to avoid stray messages, details in
 * Appendix A.1 and Appendix J of our paper.
 *
 * It bundles the crash vector with its version and with the SHA-1 digest of
 * the vector's raw bytes.
 */
struct CrashVectorStruct {
  std::vector<uint32_t> cv_;
  uint32_t version_;  // Newer crash vector will have a larger version_
  SHA_HASH cvHash_;   // SHA-1 over the raw bytes of the vector passed in

  /** Copies c and v, then hashes the contiguous storage of c. */
  CrashVectorStruct(const std::vector<uint32_t>& c, const uint32_t v)
      : cv_(c), version_(v) {
    const unsigned char* rawBytes =
        reinterpret_cast<const unsigned char*>(c.data());
    const uint32_t rawLen = c.size() * sizeof(uint32_t);
    SHA1(rawBytes, rawLen, cvHash_.hash);
  }

  /** Member-wise copy; the digest is copied, not recomputed. */
  CrashVectorStruct(const CrashVectorStruct& other)
      : cv_(other.cv_), version_(other.version_), cvHash_(other.cvHash_) {}
};

#endif

================================================
FILE: lib/common_type.h
================================================
#ifndef NEZHA_COMMON_TYPE_H
#define NEZHA_COMMON_TYPE_H

/** Only the UDP endpoint is supported for now; the GRPC endpoint is planned
 * for the near future. Values start at 1. */
enum EndpointType {
  UDP_ENDPOINT = 1,
  GRPC_ENDPOINT = 2  // To be supported
};

/** Statuses a replica can be in; refer to Sec 5 of our paper for the detailed
 * explanation of each of them. Values start at 1. */
enum ReplicaStatus {
  NORMAL = 1,
  VIEWCHANGE = 2,
  RECOVERING = 3,
  TERMINATED = 4
};

/** A LogEntry starts as INITIAL; from there it may either go through
 * IN_PROCESS->PROCESSED->REPLIED or move directly to IN_LATEBUFFER.
 * Values start at 1. */
enum EntryStatus {
  INITIAL = 1,
  IN_PROCESS = 2,
  IN_LATEBUFFER = 3,
  PROCESSED = 4,
  TO_SLOW_REPLY = 5,
  REPLIED = 6
};

/**
 * The message types mirror the messages defined in the proto files. The type
 * is included in each message so that the receiver knows which proto message
 * to serialize/deserialize. Values are consecutive and start at 1.
 */
enum MessageType {
  CLIENT_REQUEST = 1,
  LEADER_REQUEST = 2,
  SYNC_INDEX = 3,
  MISSED_INDEX_ASK = 4,
  MISSED_REQ_ASK = 5,
  FAST_REPLY = 6,
  SLOW_REPLY = 7,
  COMMIT_REPLY = 8,
  MISSED_REQ = 9,
  VIEWCHANGE_REQ = 10,
  VIEWCHANGE_MSG = 11,
  START_VIEW = 12,
  STATE_TRANSFER_REQUEST = 13,
  STATE_TRANSFER_REPLY = 14,
  CRASH_VECTOR_REQUEST = 15,
  CRASH_VECTOR_REPLY = 16,
  RECOVERY_REQUEST = 17,
  RECOVERY_REPLY = 18,
  SYNC_STATUS_REPORT = 19,
  COMMIT_INSTRUCTION = 20,
  SUSPEND_REPLY = 21,
  ERROR_MSG = 22
};

#endif

================================================
FILE: lib/endpoint.cc
================================================
#include "lib/endpoint.h"

/**
 * Records ip:port in addr_ and creates the libev loop: the default loop when
 * isMasterReceiver is true, otherwise a fresh loop. On failure evLoop_ stays
 * NULL and an error is logged; the other methods check for that.
 *
 * fd_ and epId_ start at -1 ("not set yet") instead of being left
 * indeterminate; the socket itself is created by derived classes.
 */
Endpoint::Endpoint(const std::string& sip, const int sport,
                   const bool isMasterReceiver)
    : addr_(sip, sport), fd_(-1), evLoop_(NULL), epId_(-1) {
  evLoop_ = isMasterReceiver ? ev_default_loop() : ev_loop_new();
  if (!evLoop_) {
    LOG(ERROR) << "Event Loop error";
  }
}

/**
 * Stops all timers, breaks the loop and destroys it.
 * If the loop could not be created in the constructor there is nothing to
 * tear down, and handing a NULL loop to libev would crash.
 */
Endpoint::~Endpoint() {
  if (evLoop_ == NULL) {
    return;
  }
  LoopBreak();
  ev_loop_destroy(evLoop_);
  evLoop_ = NULL;
}

/**
 * Attaches timer to this endpoint and (re)starts it on the endpoint's loop.
 * Returns false (and logs) when there is no loop, when timer is NULL, or when
 * the timer is already registered; returns true otherwise.
 */
bool Endpoint::RegisterTimer(Timer* timer) {
  if (evLoop_ == NULL) {
    LOG(ERROR) << "No evLoop!";
    return false;
  }

  // A NULL timer is never "registered", so without this guard it would be
  // dereferenced below.
  if (timer == NULL) {
    LOG(ERROR) << "Cannot register a NULL timer";
    return false;
  }

  if (isTimerRegistered(timer)) {
    LOG(ERROR) << "This timer has already been registered";
    return false;
  }

  // The timer callback receives the endpoint it is attached to.
  timer->attachedEndpoint_ = this;
  eventTimers_.insert(timer);
  ev_timer_again(evLoop_, timer->evTimer_);
  return true;
}

/**
 * Stops timer on this endpoint's loop and forgets it.
 * Returns false (and logs) when there is no loop or the timer was never
 * registered here; returns true otherwise.
 */
bool Endpoint::UnRegisterTimer(Timer* timer) {
  if (!evLoop_) {
    LOG(ERROR) << "No evLoop!";
    return false;
  }
  auto pos = eventTimers_.find(timer);
  if (pos == eventTimers_.end()) {
    LOG(ERROR) << "The timer has not been registered ";
    return false;
  }
  ev_timer_stop(evLoop_, timer->evTimer_);
  eventTimers_.erase(pos);
  return true;
}

void Endpoint::UnRegisterAllTimers() {
  for (auto& t : eventTimers_) {
    ev_timer_stop(evLoop_, t->evTimer_);
  }
  eventTimers_.clear();
}

/** Returns true iff timer is currently in this endpoint's timer set. */
bool Endpoint::isTimerRegistered(Timer* timer) {
  const bool found = eventTimers_.count(timer) > 0;
  return found;
}

void Endpoint::LoopRun() { ev_run(evLoop_, 0); }

void Endpoint::LoopBreak() {
  UnRegisterAllTimers();
  ev_break(evLoop_, EVBREAK_ALL);
}


================================================
FILE: lib/endpoint.h
================================================
#ifndef NEZHA_ENDPOINT_H
#define NEZHA_ENDPOINT_H

#include <arpa/inet.h>
#include <ev.h>
#include <fcntl.h>
#include <glog/logging.h>
#include <google/protobuf/message.h>
#include <netinet/in.h>
#include <functional>
#include <set>
#include <string>
#include "lib/address.h"
#include "lib/common_struct.h"
#include "lib/message_handler.h"
#include "lib/timer.h"

/**
 * Endpoint is the basic abstraction, and it can be derived into more specific
 * endpoints, based on the communication primitive (e.g., UDPSocketEndpoint)
 *
 * An Endpoint supports three major functionalities:
 * (1) Send/Receive messages;
 * (2) Process the received messages according to (pre-registered) customized
 * message handlers;
 * (3) Conduct periodical actions according to (pre-registered)
 * customized timer functions.
 */
class Endpoint {
 protected:
  /** The address of this endpoint */
  Address addr_;
  /** The socket fd it uses to send/recv messages */
  int fd_;
  /** The ev_loop struct from libev, which is used to handle io/timer events.
   * It is NULL when the loop could not be created. */
  struct ev_loop* evLoop_;
  /** One endpoint can have multiple timers registered. We maintain a set to
   * avoid duplicate registration and check whether a specific timer has been
   * registered or not.*/
  std::set<struct Timer*> eventTimers_;

 public:
  int epId_;  // The id of the endpoint, mainly for debug
  /** The endpoint accepts an ip and port and stores them in addr_. Creating
   * the socket and binding it to ip:port is done by the derived class (e.g.
   * UDPSocketEndpoint). If isMasterReceiver is true, it creates the
   * default loop with libev, otherwise, it creates a new loop (refer to libev
   * documentation for detailed explanation at
   * https://metacpan.org/dist/EV/view/libev/ev.pod) */
  Endpoint(const std::string& ip = "", const int port = -1,
           const bool isMasterReceiver = false);
  virtual ~Endpoint();

  /** Send the message to the specific destination. The method needs to know the
   * message type (3rd parameter) and include such information in the buffer */
  virtual int SendMsgTo(const Address& dstAddr,
                        const google::protobuf::Message& msg,
                        const char msgType) = 0;

  /** An endpoint potentially can have multiple message handlers registered, but
   * our UDPSocketEndpoint implementation only supports at most one
   * message handler for one endpoint. So we make them virtual functions and
   * different derived classes have their own implementation of the methods */
  virtual bool RegisterMsgHandler(MessageHandler* msgHdl) = 0;
  virtual bool UnRegisterMsgHandler(MessageHandler* msgHdl) = 0;
  virtual bool isMsgHandlerRegistered(MessageHandler* msgHdl) = 0;
  virtual void UnRegisterAllMsgHandlers() = 0;

  /** Return true if the timer is successfully registered, otherwise (e.g. it
   * has been registered before and has not been unregistered), return false */
  bool RegisterTimer(Timer* timer);
  /** Return true if the timer is successfully unregistered, otherwise (e.g.
   * the timer has not been registered before), return false */
  bool UnRegisterTimer(Timer* timer);
  /** Check whether the timer has been registered */
  bool isTimerRegistered(Timer* timer);
  /** Stop and forget every timer registered on this endpoint */
  void UnRegisterAllTimers();

  /** Run the event loop */
  void LoopRun();
  /** Unregister all timers and break out of the event loop */
  void LoopBreak();
};

#endif

================================================
FILE: lib/message_handler.h
================================================

#ifndef NEZHA_MESSAGE_HANDLER_H
#define NEZHA_MESSAGE_HANDLER_H

#include <arpa/inet.h>
#include <ev.h>
#include <fcntl.h>
#include <glog/logging.h>
#include <google/protobuf/message.h>
#include <netinet/in.h>
#include <functional>
#include <set>
#include <string>
#include "lib/address.h"
#include "lib/common_type.h"

/**
 * MessageHandler is an encapsulation of libev-based message handler (i.e.
 * ev_io).
 *
 * After the message handler is created, it will be registered to a
 * specific endpoint. Then, the callback func (i.e., MessageHandlerFunc) will be
 * called every time this endpoint receives some messages.
 *
 * Currently, we only support UDP communication. Therefore, we only have one
 * derived struct (UDPMsgHandler) from MessageHandler
 *
 * We will continue to support other types of endpoints. Correspondingly, there
 * will be more derived struct added later
 * **/

/**
 * Para-1: MessageHeader* describes the type and length of the received message
 * Para-2: char* is the payload of the message
 * Para-3: Address* is the address of the sender
 * Para-4: void* points to the (optional) context that is needed by the callback
 * function(i.e., MessageHandlerFunc)
 */
typedef std::function<void(MessageHeader*, char*, Address*, void*)>
    MessageHandlerFunc;

/**
 * Base of all message handlers: it owns the libev io watcher and keeps the
 * callback, its context and the address of the most recent sender.
 */
struct MessageHandler {
  MessageHandlerFunc msgHandler_;  // Callback invoked for a received message
  void* context_;                  // Passed to the callback as its 4th argument
  Address sender_;                 // Filled in with the sender of the message
  struct ev_io* evWatcher_;        // Owned; freed in the destructor
  MessageHandler(MessageHandlerFunc msghdl, void* ctx = NULL)
      : msgHandler_(msghdl), context_(ctx) {
    evWatcher_ = new ev_io();
    // Lets the libev callback find its handler again.
    evWatcher_->data = (void*)this;
  }
  // Virtual because derived handlers (e.g. UDPMsgHandler) are handed out as
  // MessageHandler* by CreateMsgHandler; deleting them through the base
  // pointer is undefined behavior without a virtual destructor.
  // NOTE(review): the watcher is a raw owning pointer, so copying a handler
  // would free it twice; handlers should not be copied.
  virtual ~MessageHandler() { delete evWatcher_; }
};

/**
 * Message handler for UDP endpoints. Whenever the watched fd becomes
 * readable, one datagram is read into buffer_, its MessageHeader is
 * validated and the user callback is invoked with the header, the payload
 * right after it, the sender address and the context.
 */
struct UDPMsgHandler : MessageHandler {
  char buffer_[UDP_BUFFER_SIZE];
  UDPMsgHandler(MessageHandlerFunc msghdl, void* ctx = NULL)
      : MessageHandler(msghdl, ctx) {
    ev_init(evWatcher_, [](struct ev_loop* loop, struct ev_io* w, int revents) {
      UDPMsgHandler* m = (UDPMsgHandler*)(w->data);
      socklen_t sockLen = sizeof(struct sockaddr_in);
      int msgLen = recvfrom(w->fd, m->buffer_, UDP_BUFFER_SIZE, 0,
                            (struct sockaddr*)(&(m->sender_.addr_)), &sockLen);
      // Need at least a full header plus one payload byte.
      if (msgLen <= 0 || (size_t)msgLen <= sizeof(MessageHeader)) {
        return;
      }
      MessageHeader* msgHeader = (MessageHeader*)(void*)(m->buffer_);
      const size_t payloadLen = (size_t)msgLen - sizeof(MessageHeader);
      // The sender transmits exactly header + msgLen bytes. The previous
      // check (declared + header >= received) let through datagrams whose
      // header claims MORE payload than actually arrived, so the callback
      // could read stale or out-of-bounds bytes. Only dispatch when the
      // declared length matches what was received.
      if (msgHeader->msgLen == payloadLen) {
        m->msgHandler_(msgHeader, m->buffer_ + sizeof(MessageHeader),
                       &(m->sender_), m->context_);
      }
    });
  }
  ~UDPMsgHandler() {}
};

#endif

================================================
FILE: lib/message_type.cc
================================================
#include "lib/message_type.h"


// Definitions of the message-type tags declared in lib/message_type.h.
// The tags are consecutive chars starting at 1.
namespace MessageType {
char CLIENT_REQUEST = 1;
char LEADER_REQUEST = 2;
char SYNC_INDEX = 3;
char MISSED_INDEX_ASK = 4;
char MISSED_REQ_ASK = 5;
char FAST_REPLY = 6;
char SLOW_REPLY = 7;
char COMMIT_REPLY = 8;
char MISSED_REQ = 9;
char VIEWCHANGE_REQ = 10;
char VIEWCHANGE = 11;
char START_VIEW = 12;
char STATE_TRANSFER_REQUEST = 13;
char STATE_TRANSFER_REPLY = 14;
char CRASH_VECTOR_REQUEST = 15;
char CRASH_VECTOR_REPLY = 16;
char RECOVERY_REQUEST = 17;
char RECOVERY_REPLY = 18;
char SYNC_STATUS_REPORT = 19;
char COMMIT_INSTRUCTION = 20;
char SUSPEND_REPLY = 21;
char ERROR_MSG = 22;
}  // namespace MessageType

================================================
FILE: lib/message_type.h
================================================
#include <stdint.h>
#ifndef NEZHA_MESSAGE_TYPE_H
#define NEZHA_MESSAGE_TYPE_H

// First index usable with the concurrent map.
#define CONCURRENT_MAP_START_INDEX (2u)
// Packs two 32-bit values into one uint64: a in the high half, b in the low
// half. Every macro argument is parenthesized so that expression arguments
// (e.g. `x | y`, `x + y`) are evaluated as a whole before the cast/shift.
#define CONCAT_UINT32(a, b) ((((uint64_t)(a)) << 32u) | (uint32_t)(b))
// High / low 32 bits of a uint64.
#define HIGH_32BIT(a) ((uint32_t)((a) >> 32))
#define LOW_32BIT(a) ((uint32_t)(a))

// Header describing a message: its type tag and the length of what follows.
struct MessageHeader {
  char msgType;     // Type tag of the message
  uint32_t msgLen;  // Length of the message
  MessageHeader(const char type, const uint32_t len) {
    msgType = type;
    msgLen = len;
  }
};


// Message-type tags; the values are defined in lib/message_type.cc.
namespace MessageType {
extern char CLIENT_REQUEST;
extern char LEADER_REQUEST;
extern char SYNC_INDEX;
extern char MISSED_INDEX_ASK;
extern char MISSED_REQ_ASK;
extern char FAST_REPLY;
extern char SLOW_REPLY;
extern char COMMIT_REPLY;
extern char MISSED_REQ;
extern char VIEWCHANGE_REQ;
extern char VIEWCHANGE;
extern char START_VIEW;
extern char STATE_TRANSFER_REQUEST;
extern char STATE_TRANSFER_REPLY;
extern char CRASH_VECTOR_REQUEST;
extern char CRASH_VECTOR_REPLY;
extern char RECOVERY_REQUEST;
extern char RECOVERY_REPLY;
extern char SYNC_STATUS_REPORT;
extern char COMMIT_INSTRUCTION;
extern char SUSPEND_REPLY;
extern char ERROR_MSG;
}  // namespace MessageType

#endif

================================================
FILE: lib/timer.h
================================================
#ifndef NEZHA_TIMER_
#define NEZHA_TIMER_

#include <arpa/inet.h>
#include <ev.h>
#include <fcntl.h>
#include <glog/logging.h>
#include <google/protobuf/message.h>
#include <netinet/in.h>
#include <functional>
#include <set>
#include <string>
#include "lib/address.h"
#include "lib/common_type.h"

/**
 * Timer is an encapsulation of the libev-based timer watcher (i.e.
 * ev_timer).
 *
 * After the timer is created, it will be registered to a
 * specific endpoint, together with a period (measured in milliseconds). Then,
 * the callback func (i.e., TimerFunc) will be called periodically until the
 * timer is unregistered
 * **/

/**
 * Para-1: The first void* points to the context, that may be needed by the
 * callback function(i.e., TimerFunc)
 * Para-2: The second void* points to the endpoint that this timer is attached
 * to. It can be passed into the function as NULL if the TimerFunc does not need
 * it. But some TimerFunc (e.g., monitorTimer in replica) callback needs to know
 * the endpoint it has attached to.
 */

typedef std::function<void(void*, void*)> TimerFunc;

/**
 * Owns one ev_timer whose callback forwards to timerFunc_ with the stored
 * context and attached endpoint.
 */
struct Timer {
  TimerFunc timerFunc_;       // User callback
  void* context_;             // 1st argument handed to timerFunc_
  void* attachedEndpoint_;    // 2nd argument handed to timerFunc_
  struct ev_timer* evTimer_;  // Owned; freed in the destructor

  /** periodMs is the repeat interval in milliseconds. */
  Timer(TimerFunc timerf, uint32_t periodMs = 1, void* ctx = NULL,
        void* aep = NULL)
      : timerFunc_(timerf),
        context_(ctx),
        attachedEndpoint_(aep),
        evTimer_(new ev_timer()) {
    evTimer_->data = (void*)this;
    // libev expresses the repeat interval in seconds.
    evTimer_->repeat = periodMs * 1e-3;
    ev_init(evTimer_,
            [](struct ev_loop* loop, struct ev_timer* watcher, int revents) {
              Timer* self = (Timer*)(watcher->data);
              self->timerFunc_(self->context_, self->attachedEndpoint_);
            });
  }
  ~Timer() { delete evTimer_; }
};

#endif

================================================
FILE: lib/udp_socket_endpoint.cc
================================================
#include "lib/udp_socket_endpoint.h"

/**
 * Creates a non-blocking UDP socket. When ip is non-empty and port is
 * non-negative the socket is additionally bound to ip:port; otherwise it is
 * left unbound. Failures are logged, not thrown.
 */
UDPSocketEndpoint::UDPSocketEndpoint(const std::string& ip, const int port,
                                     const bool isMasterReceiver)
    : Endpoint(ip, port, isMasterReceiver), msgHandler_(NULL) {
  fd_ = socket(PF_INET, SOCK_DGRAM, 0);
  if (fd_ < 0) {
    LOG(ERROR) << "Receiver Fd fail ";
    return;
  }

  // Switch the socket to non-blocking mode, keeping its other flags.
  const int currentFlags = fcntl(fd_, F_GETFL, 0);
  if (fcntl(fd_, F_SETFL, currentFlags | O_NONBLOCK) < 0) {
    LOG(ERROR) << " Set NonBlocking Fail";
  }

  // Without a usable ip:port there is nothing to bind to.
  const bool wantsBind = !ip.empty() && port >= 0;
  if (!wantsBind) {
    return;
  }

  struct sockaddr_in bindAddr;
  memset(&bindAddr, 0, sizeof(bindAddr));
  bindAddr.sin_family = AF_INET;
  bindAddr.sin_port = htons(port);
  bindAddr.sin_addr.s_addr = inet_addr(ip.c_str());

  const int bindRet = bind(fd_, (struct sockaddr*)&bindAddr, sizeof(bindAddr));
  if (bindRet != 0) {
    LOG(ERROR) << "bind error\t" << bindRet << "\t port=" << port;
  }
}

// Nothing to release here; timers and the event loop are torn down by
// Endpoint::~Endpoint.
// NOTE(review): fd_ opened in the constructor is not closed in this file —
// confirm whether the owner closes it elsewhere or whether it leaks.
UDPSocketEndpoint::~UDPSocketEndpoint() {}

/**
 * Serializes msg, prepends a MessageHeader{msgType, length} and sends the
 * result to dstAddr as one UDP datagram.
 *
 * Returns the value of sendto() on a send attempt, or -1 when the message
 * does not fit into UDP_BUFFER_SIZE or serializes to an empty string.
 */
int UDPSocketEndpoint::SendMsgTo(const Address& dstAddr,
                                 const google::protobuf::Message& msg,
                                 char msgType) {
  const std::string serializedString = msg.SerializeAsString();
  const size_t payloadLen = serializedString.length();
  // Validate the size before touching the buffer.
  if (payloadLen + sizeof(MessageHeader) > UDP_BUFFER_SIZE) {
    LOG(ERROR) << "Msg too large " << (uint32_t)msgType
               << "\t length=" << serializedString.length();
    return -1;
  }
  if (payloadLen == 0) {
    // Nothing was serialized; keep the historical "-1, send nothing".
    return -1;
  }

  // The header is written in place, so the buffer must be aligned for it.
  alignas(MessageHeader) char buffer[UDP_BUFFER_SIZE];
  // Zero the header area first: MessageHeader has padding between msgType
  // and msgLen, and those bytes would otherwise go out on the wire carrying
  // whatever happened to be on the stack.
  memset(buffer, 0, sizeof(MessageHeader));
  MessageHeader* msgHdr = (MessageHeader*)(void*)buffer;
  msgHdr->msgType = msgType;
  msgHdr->msgLen = payloadLen;

  // Prepend MessageHeader to the serialized string
  memcpy(buffer + sizeof(MessageHeader), serializedString.data(), payloadLen);
  int ret = sendto(fd_, buffer, payloadLen + sizeof(MessageHeader), 0,
                   (struct sockaddr*)(&(dstAddr.addr_)), sizeof(sockaddr_in));
  if (ret < 0) {
    VLOG(1) << pthread_self() << "\tSend Fail ret =" << ret;
  }
  return ret;
}

/**
 * Registers msgHdl as THE message handler of this endpoint and starts
 * watching fd_ for readability on the endpoint's loop.
 *
 * A UDPSocketEndpoint supports at most one handler. Registration is refused
 * (false + error log) when there is no loop, when msgHdl is NULL, when it is
 * already registered, or when a different handler is still registered.
 * Previously a second handler silently replaced the pointer while the first
 * watcher kept running on the same fd.
 */
bool UDPSocketEndpoint::RegisterMsgHandler(MessageHandler* msgHdl) {
  if (evLoop_ == NULL) {
    LOG(ERROR) << "No evLoop!";
    return false;
  }
  if (msgHdl == NULL) {
    LOG(ERROR) << "Cannot register a NULL msgHdl";
    return false;
  }
  if (isMsgHandlerRegistered(msgHdl)) {
    LOG(ERROR) << "This msgHdl has already been registered";
    return false;
  }
  if (msgHandler_ != NULL) {
    LOG(ERROR) << "Another msgHdl is registered; unregister it first";
    return false;
  }

  UDPMsgHandler* udpMsgHdl = (UDPMsgHandler*)msgHdl;
  msgHandler_ = udpMsgHdl;
  ev_io_set(udpMsgHdl->evWatcher_, fd_, EV_READ);
  ev_io_start(evLoop_, udpMsgHdl->evWatcher_);

  return true;
}

/**
 * Stops the watcher of msgHdl and clears the registered handler.
 * Returns false (and logs) when there is no loop or when msgHdl is not the
 * handler currently registered; returns true otherwise.
 */
bool UDPSocketEndpoint::UnRegisterMsgHandler(MessageHandler* msgHdl) {
  if (evLoop_ == NULL) {
    LOG(ERROR) << "No evLoop!";
    return false;
  }
  // NULL compares equal to an empty msgHandler_, so it must be rejected
  // explicitly; otherwise the watcher access below dereferences NULL.
  if (msgHdl == NULL || !isMsgHandlerRegistered(msgHdl)) {
    LOG(ERROR) << "The handler has not been registered ";
    return false;
  }
  ev_io_stop(evLoop_, msgHandler_->evWatcher_);
  msgHandler_ = NULL;
  return true;
}

/**
 * Returns true iff msgHdl is the handler currently stored in msgHandler_.
 * Note that a NULL msgHdl compares equal when no handler is registered.
 */
bool UDPSocketEndpoint::isMsgHandlerRegistered(MessageHandler* msgHdl) {
  UDPMsgHandler* candidate = (UDPMsgHandler*)msgHdl;
  return msgHandler_ == candidate;
}

/**
 * Stops the watcher of the registered handler (if any) and clears it.
 * Safe to call when no handler is registered; previously that case
 * dereferenced a NULL msgHandler_.
 */
void UDPSocketEndpoint::UnRegisterAllMsgHandlers() {
  if (msgHandler_ == NULL) {
    return;
  }
  if (evLoop_ != NULL) {
    ev_io_stop(evLoop_, msgHandler_->evWatcher_);
  }
  msgHandler_ = NULL;
}


================================================
FILE: lib/udp_socket_endpoint.h
================================================
#ifndef NEZHA_UDP_SOCKET_SENDER_H
#define NEZHA_UDP_SOCKET_SENDER_H

#include "lib/endpoint.h"

/**
 * Endpoint implementation on top of a non-blocking UDP socket. It supports
 * at most one registered message handler at a time.
 */
class UDPSocketEndpoint : public Endpoint {
 private:
  /** The currently registered handler, or NULL when none is registered. The
   * endpoint stores the pointer only; it does not delete the handler. */
  struct UDPMsgHandler* msgHandler_;

 public:
  /** Creates the UDP socket and, when ip is non-empty and port >= 0, binds it
   * to ip:port. isMasterReceiver is forwarded to Endpoint to choose between
   * the default libev loop and a new loop. */
  UDPSocketEndpoint(const std::string& ip = "", const int port = -1,
                    const bool isMasterReceiver = false);
  ~UDPSocketEndpoint();

  /** Serializes msg, prepends a MessageHeader carrying msgType and the
   * serialized length, and sends it to dstAddr. Returns the sendto() result,
   * or -1 when nothing was sent. */
  int SendMsgTo(const Address& dstAddr, const google::protobuf::Message& msg,
                const char msgType) override;
  /** Registers msgHdl and starts watching the socket for incoming messages;
   * returns false if there is no loop or msgHdl is already registered. */
  bool RegisterMsgHandler(MessageHandler* msgHdl) override;
  /** Stops and clears msgHdl; returns false if it is not the registered
   * handler. */
  bool UnRegisterMsgHandler(MessageHandler* msgHdl) override;
  /** Returns true iff msgHdl is the currently registered handler. */
  bool isMsgHandlerRegistered(MessageHandler* msgHdl) override;
  /** Stops and clears the registered handler. */
  void UnRegisterAllMsgHandlers() override;
};

#endif

================================================
FILE: lib/utils.cc
================================================
#include "lib/utils.h"

/**
 * Returns the SHA-1 digest of the 16 bytes formed by deadline followed by
 * reqKey (both in host byte order).
 */
SHA_HASH CalculateHash(uint64_t deadline, uint64_t reqKey) {
  unsigned char content[sizeof(uint64_t) + sizeof(uint32_t) + sizeof(uint32_t)];
  unsigned char* cursor = content;
  memcpy(cursor, &deadline, sizeof(deadline));
  cursor += sizeof(deadline);
  memcpy(cursor, &reqKey, sizeof(reqKey));

  SHA_HASH digest;
  SHA1(content, sizeof(content), digest.hash);
  return digest;
}

// Returns the current system-clock time as microseconds since the epoch.
uint64_t GetMicrosecondTimestamp() {
  using std::chrono::duration_cast;
  using std::chrono::microseconds;
  using std::chrono::system_clock;
  const system_clock::time_point now = system_clock::now();
  const microseconds sinceEpoch =
      duration_cast<microseconds>(now.time_since_epoch());
  return sinceEpoch.count();
}

/**
 * Factory for endpoints. Returns a new UDPSocketEndpoint for UDP_ENDPOINT
 * and NULL for GRPC_ENDPOINT (not supported yet) or any unknown type.
 * The caller owns the returned endpoint.
 */
Endpoint* CreateEndpoint(const char endpointType, const std::string& sip,
                         const int sport, const bool isMasterReceiver) {
  if (endpointType == EndpointType::UDP_ENDPOINT) {
    return new UDPSocketEndpoint(sip, sport, isMasterReceiver);
  } else if (endpointType == EndpointType::GRPC_ENDPOINT) {
    // To support GRPC later
    return NULL;
  } else {
    // endpointType is a char: print its numeric value, not the (usually
    // unprintable) character.
    LOG(ERROR) << "Unknown endpoint type: " << (int)endpointType;
    return NULL;
  }
}

/**
 * Factory for message handlers. Returns a new UDPMsgHandler wrapping msghdl
 * and ctx for UDP_ENDPOINT, and NULL for GRPC_ENDPOINT (not supported yet)
 * or any unknown type. The caller owns the returned handler.
 */
MessageHandler* CreateMsgHandler(const char endpointType,
                                 MessageHandlerFunc msghdl, void* ctx) {
  if (endpointType == EndpointType::UDP_ENDPOINT) {
    return new UDPMsgHandler(msghdl, ctx);
  } else if (endpointType == EndpointType::GRPC_ENDPOINT) {
    // To support GRPC later
    return NULL;
  } else {
    // endpointType is a char: print its numeric value, not the (usually
    // unprintable) character.
    LOG(ERROR) << "Unknown endpoint type: " << (int)endpointType;
    return NULL;
  }
}


================================================
FILE: lib/utils.h
================================================
#ifndef NEZHA_UTILS_H
#define NEZHA_UTILS_H

#include <arpa/inet.h>
#include <ev.h>
#include <glog/logging.h>
#include <junction/ConcurrentMap_Leapfrog.h>
#include <netinet/in.h>
#include <openssl/sha.h>
#include <stdio.h>
#include <stdlib.h>
#include <strings.h>
#include <unistd.h>
#include <chrono>
#include <cstring>
#include "concurrentqueue.h"
#include "gflags/gflags.h"
#include "lib/udp_socket_endpoint.h"

// Short project-wide names for the third-party concurrent containers.
// ConcurrentQueue<T> is moodycamel::ConcurrentQueue<T>.
template <typename T1>
using ConcurrentQueue = moodycamel::ConcurrentQueue<T1>;
// ConcurrentMap<K, V> is junction::ConcurrentMap_Leapfrog<K, V>.
template <typename T1, typename T2>
using ConcurrentMap = junction::ConcurrentMap_Leapfrog<T1, T2>;

/** The concurrent map we use (i.e. junction::ConcurrentMap) reserves 0 and 1,
 * so the start value should be 2 */
#define CONCURRENT_MAP_START_INDEX (2u)
/** Pack two 32-bit values into one uint64: a is the high half, b the low
 * half. Macro arguments are parenthesized so that expression arguments (e.g.
 * `x | y`, `x + y`) are evaluated as a whole before the cast/shift. */
#define CONCAT_UINT32(a, b) ((((uint64_t)(a)) << 32u) | (uint32_t)(b))
/** Get the high/low 32bits of a uint64 */
#define HIGH_32BIT(a) ((uint32_t)((a) >> 32))
#define LOW_32BIT(a) ((uint32_t)(a))

// Since <deadline, reqKey> is sufficient to uniquely identify one request, we
// calculate the hash based on them to represent the corresponding request/log.
// The result is the SHA-1 digest of the two values' raw bytes.
SHA_HASH CalculateHash(uint64_t deadline, uint64_t reqKey);

// Get the current timestamp in microseconds since the epoch (system clock)
uint64_t GetMicrosecondTimestamp();

// Factory functions, to create different types of endpoints and msghandlers.
// endpointType takes an EndpointType value; both factories return NULL for
// GRPC_ENDPOINT (not supported yet) and for unknown types.
Endpoint* CreateEndpoint(const char endpointType, const std::string& sip = "",
                         const int sport = -1,
                         const bool isMasterReceiver = false);

// msghdl is the callback invoked for each received message and ctx is the
// optional context passed to it as its last argument.
MessageHandler* CreateMsgHandler(
    const char endpointType,
    std::function<void(MessageHeader*, char*, Address*, void*)> msghdl,
    void* ctx = NULL);

#endif

================================================
FILE: lib/zipfian.h
================================================
/*
 * MIT License
 *
 * Copyright (c) 2017 Lucas Lersch
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

 /* Implementation derived from:
  * "Quickly Generating Billion-Record Synthetic Databases", Jim Gray et al,
  * SIGMOD 1994
  */

  /*
   * The zipfian_int_distribution class is intended to be compatible with other
   * distributions introduced in #include <random> by the C++11 standard.
   *
   * Usage example:
   * #include <random>
   * #include "zipfian_int_distribution.h"
   * int main()
   * {
   *   std::default_random_engine generator;
   *   zipfian_int_distribution<int> distribution(1, 10, 0.99);
   *   int i = distribution(generator);
   * }
   */

   /*
    * IMPORTANT: constructing the distribution object requires calculating the zeta
    * value, which becomes prohibitively expensive for very large ranges. As an
    * alternative for such cases, the user can pass the pre-calculated value and
    * avoid the calculation every time.
    *
    * Usage example:
    * #include <random>
    * #include "zipfian_int_distribution.h"
    * int main()
    * {
    *   std::default_random_engine generator;
    *   zipfian_int_distribution<int>::param_type p(1, 1e6, 0.99, 27.000);
    *   zipfian_int_distribution<int> distribution(p);
    *   int i = distribution(generator);
    * }
    */

#include <cmath>
#include <limits>
#include <random>
#include <cassert>

/**
 * @brief Zipfian integer distribution over the inclusive range [a, b].
 *
 * The interface mirrors the distributions of <random>: it exposes
 * result_type, param_type, min()/max(), param() and a call operator taking a
 * uniform random number generator.
 */
template<typename _IntType = int>
class zipfian_int_distribution
{
    static_assert(std::is_integral<_IntType>::value, "Template argument not an integral type.");

public:
    /** The type of the range of the distribution. */
    typedef _IntType result_type;
    /** Parameter type. */
    struct param_type
    {
        typedef zipfian_int_distribution<_IntType> distribution_type;

        /**
         * @brief Builds a parameter set and computes zeta over the range.
         *
         * The zeta computation iterates over every value in [a, b].
         */
        explicit param_type(_IntType __a = 0, _IntType __b = std::numeric_limits<_IntType>::max(), double __theta = 0.99)
            : _M_a(__a), _M_b(__b), _M_theta(__theta),
            _M_zeta(zeta(_M_b - _M_a + 1, __theta)), _M_zeta2theta(zeta(2, __theta))
        {
            assert(_M_a <= _M_b && _M_theta > 0.0 && _M_theta < 1.0);
        }

        /**
         * @brief Builds a parameter set from a pre-computed zeta value.
         *
         * @param __zeta [IN]  zeta(b - a + 1, theta), supplied by the caller.
         */
        explicit param_type(_IntType __a, _IntType __b, double __theta, double __zeta)
            : _M_a(__a), _M_b(__b), _M_theta(__theta), _M_zeta(__zeta),
            _M_zeta2theta(zeta(2, __theta))
        {
            // Standard assert, as in the other constructor; __glibcxx_assert is
            // a libstdc++-internal macro and is not available elsewhere.
            assert(_M_a <= _M_b && _M_theta > 0.0 && _M_theta < 1.0);
        }

        result_type a() const { return _M_a; }

        result_type b() const { return _M_b; }

        double theta() const { return _M_theta; }

        double zeta() const { return _M_zeta; }

        double zeta2theta() const { return _M_zeta2theta; }

        friend bool operator==(const param_type& __p1, const param_type& __p2)
        {
            return __p1._M_a == __p2._M_a
                && __p1._M_b == __p2._M_b
                && __p1._M_theta == __p2._M_theta
                && __p1._M_zeta == __p2._M_zeta
                && __p1._M_zeta2theta == __p2._M_zeta2theta;
        }

        friend bool operator!=(const param_type& __p1, const param_type& __p2)
        {
            return !(__p1 == __p2);
        }

    private:
        _IntType _M_a;
        _IntType _M_b;
        double _M_theta;
        double _M_zeta;
        double _M_zeta2theta;

        /**
         * @brief Calculates zeta.
         *
         * Static because it is called from the member-initializer list and
         * reads no member state.
         *
         * @param __n [IN]  The size of the domain.
         * @param __theta [IN]  The skew factor of the distribution.
         */
        static double zeta(unsigned long __n, double __theta)
        {
            double ans = 0.0;
            for (unsigned long i = 1; i <= __n; ++i)
                ans += std::pow(1.0 / i, __theta);
            return ans;
        }
    };

public:
    /**
     * @brief Constructs a zipfian_int_distribution object.
     *
     * @param __a [IN]  The lower bound of the distribution.
     * @param __b [IN]  The upper bound of the distribution.
     * @param __theta [IN]  The skew factor of the distribution.
     */
    explicit zipfian_int_distribution(_IntType __a = _IntType(0), _IntType __b = _IntType(1), double __theta = 0.99)
        : _M_param(__a, __b, __theta)
    { }

    /**
     * @brief Constructs a zipfian_int_distribution from an existing
     *        parameter set (no zeta recomputation).
     */
    explicit zipfian_int_distribution(const param_type& __p) : _M_param(__p)
    { }

    /**
     * @brief Resets the distribution state.
     *
     * Does nothing for the zipfian int distribution.
     */
    void reset() { }

    result_type a() const { return _M_param.a(); }

    result_type b() const { return _M_param.b(); }

    double theta() const { return _M_param.theta(); }

    /**
     * @brief Returns the parameter set of the distribution.
     */
    param_type param() const { return _M_param; }

    /**
     * @brief Sets the parameter set of the distribution.
     * @param __param The new parameter set of the distribution.
     */
    void param(const param_type& __param) { _M_param = __param; }

    /**
     * @brief Returns the inclusive lower bound of the distribution range.
     */
    result_type min() const { return this->a(); }

    /**
     * @brief Returns the inclusive upper bound of the distribution range.
     */
    result_type max() const { return this->b(); }

    /**
     * @brief Generating functions.
     */
    template<typename _UniformRandomNumberGenerator>
    result_type operator()(_UniformRandomNumberGenerator& __urng)
    {
        return this->operator()(__urng, _M_param);
    }

    /**
     * @brief Draws one value using the supplied parameter set instead of the
     *        one stored in the distribution.
     */
    template<typename _UniformRandomNumberGenerator>
    result_type operator()(_UniformRandomNumberGenerator& __urng, const param_type& __p)
    {
        double alpha = 1 / (1 - __p.theta());
        double eta = (1 - std::pow(2.0 / (__p.b() - __p.a() + 1), 1 - __p.theta())) / (1 - __p.zeta2theta() / __p.zeta());

        double u = std::generate_canonical<double, std::numeric_limits<double>::digits, _UniformRandomNumberGenerator>(__urng);

        // The two most frequent values are handled explicitly.
        double uz = u * __p.zeta();
        if (uz < 1.0) return __p.a();
        if (uz < 1.0 + std::pow(0.5, __p.theta())) return __p.a() + 1;

        // The double result is implicitly converted to result_type on return.
        return __p.a() + ((__p.b() - __p.a() + 1) * std::pow(eta * u - eta + 1, alpha));
    }

    /**
     * @brief Return true if two zipfian int distributions have
     *        the same parameters.
     */
    friend bool operator==(const zipfian_int_distribution& __d1, const zipfian_int_distribution& __d2)
    {
        return __d1._M_param == __d2._M_param;
    }

    /**
     * @brief Return true if two zipfian int distributions have
     *        different parameters.
     */
    friend bool operator!=(const zipfian_int_distribution& __d1, const zipfian_int_distribution& __d2)
    {
        return !(__d1 == __d2);
    }

private:
    param_type _M_param;
};


================================================
FILE: license.md
================================================
MIT License

Copyright (c) 2022-2024 Jinkun Geng

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

================================================
FILE: micro-bench/BUILD
================================================
# Settings shared by every micro-benchmark binary in this package.
MICRO_BENCH_DEPS = [
    "//proto:nezha_cc_proto",
    "//lib:utils",
    "//lib:address",
    "//lib:zipfian",
]

MICRO_BENCH_COPTS = [
    "-I/usr/local/include",
]

MICRO_BENCH_LINKOPTS = [
    "-L/usr/local/lib",
    "-lev",
    "-ldl",
    "-lturf",
    "-ljunction",
    "-lcrypto",
    "-lgflags",
    "-lglog",
    "-lyaml-cpp",
    "-pthread",
]

# Request generator for the micro-benchmark.
cc_binary(
    name = "bench_sender",
    srcs = ["bench_sender.cc"],
    copts = MICRO_BENCH_COPTS,
    linkopts = MICRO_BENCH_LINKOPTS,
    deps = MICRO_BENCH_DEPS,
)

# Receiver that records the order in which requests are processed.
cc_binary(
    name = "bench_receiver",
    srcs = ["bench_receiver.cc"],
    copts = MICRO_BENCH_COPTS,
    linkopts = MICRO_BENCH_LINKOPTS,
    deps = MICRO_BENCH_DEPS,
)

# Offline comparison of the traces written by the receivers.
cc_binary(
    name = "analysis",
    srcs = ["analysis.cc"],
    copts = MICRO_BENCH_COPTS,
    linkopts = MICRO_BENCH_LINKOPTS,
    deps = MICRO_BENCH_DEPS,
)


================================================
FILE: micro-bench/analysis.cc
================================================
#include <fstream>
#include <iostream>
#include "lib/utils.h"
#include "lib/zipfian.h"
#include "proto/nezha_proto.pb.h"

// Directory that holds the per-replica trace files; main() reads
// "<folder>/Replica-Stats-<i>.csv" from it.
DEFINE_string(folder, "/home/steam1994/micro-stats/2-10000-0-50",
              "The folder of the csv");

// Number of trace files to read. File 0 is the reference ordering; files
// 1 .. replica_num-1 are compared against it.
DEFINE_int32(replica_num, 2, "The number of replicas");

int main(int argc, char* argv[]) {
  gflags::ParseCommandLineFlags(&argc, &argv, true);
  google::InitGoogleLogging(argv[0]);
  FLAGS_logtostderr = 1;
  //   std::vector<uint32_t> zipfianKeys;
  //   uint32_t keyNum = 1000000;
  //   zipfianKeys.resize(1000000, 0);
  //   uint32_t skewFactor = 0.5;
  //   if (keyNum > 1) {
  //     std::default_random_engine generator(1);  // clientId as the seed
  //     zipfian_int_distribution<uint32_t> zipfianDistribution(0, keyNum - 1,
  //                                                            skewFactor);
  //     for (uint32_t i = 0; i < zipfianKeys.size(); i++) {
  //       zipfianKeys[i] = zipfianDistribution(generator);
  //     }
  //   }

  std::string r0Fname = FLAGS_folder + "/" + "Replica-Stats-0.csv";
  std::ifstream ifs1(r0Fname);
  LOG(INFO) << "fname=" << r0Fname;
  uint32_t clientId, reqId;
  uint32_t id = 0;
  std::map<uint64_t, uint32_t> mapIdx;
  std::map<uint64_t, uint64_t> mapKey;
  while (ifs1 >> clientId >> reqId) {
    uint64_t reqKey = CONCAT_UINT32(clientId, reqId);
    mapIdx[reqKey] = id;
    id++;
  }

  for (int i = 1; i < FLAGS_replica_num; i++) {
    std::string r1Fname =
        FLAGS_folder + "/" + "Replica-Stats-" + std::to_string(i) + ".csv";
    std::ifstream ifs2(r1Fname);
    std::vector<uint64_t> reqKeys;
    reqKeys.reserve(100000);
    std::vector<uint32_t> mappedIds;
    mappedIds.reserve(100000);
    while (ifs2 >> clientId >> reqId) {
      uint64_t reqKey = CONCAT_UINT32(clientId, reqId);
      reqKeys.push_back(reqKey);
      mappedIds.push_back(mapIdx[reqKey]);
    }
    uint32_t reorderedCase = 0;
    for (uint32_t i = 1; i < reqKeys.size(); i++) {
      if (mappedIds[i] == 0 || mappedIds[i] < mappedIds[i - 1]) {
        reorderedCase++;
      }
    }
    LOG(INFO) << "reorderedCase=" << reorderedCase << "\t"
              << "total=" << id << "\t rate=" << reorderedCase * 1.0 / id;
  }
}

================================================
FILE: micro-bench/bench_receiver.cc
================================================
#include <fstream>
#include <iostream>
#include "lib/utils.h"
#include "lib/zipfian.h"
#include "proto/nezha_proto.pb.h"
// Address and port on which this receiver listens for client requests.
DEFINE_string(receiver_ip, "127.0.0.1", "The ip address of the receiver");

DEFINE_int32(receiver_port, 33333, "The port of the receiver");

// replica_id names the output file and is stamped on OWD replies.
DEFINE_int32(replica_id, 1, "The id of the replica");
// When 1, ProcessTd holds requests in a deadline-ordered buffer before
// recording them; otherwise requests are recorded in arrival order.
DEFINE_int32(enable_dom, 0, "Whether enable DOM");
// Percentile of the last 1000 OWD samples reported back to a client.
DEFINE_int32(percentile, 50, "The percentile of the owd estimation");

// Port on the client side to which OWD replies are sent.
DEFINE_int32(client_port, 33336,
             "The port of the client listens for OWD reply");

// clientId -> reply address; filled by MsgHandlerFunc, read by OWDTd.
ConcurrentMap<uint32_t, Address*> clientAddrs;
// (clientId, measured one-way delay) samples, MsgHandlerFunc -> OWDTd.
ConcurrentQueue<std::pair<uint32_t, uint32_t>> owdQu;
// Parsed requests, MsgHandlerFunc -> ProcessTd.
ConcurrentQueue<nezha::proto::Request> processQu;
// (clientId, reqId) pairs in the order ProcessTd recorded them.
std::vector<std::pair<uint32_t, uint32_t>> traceVec;
// Handles one incoming message: non-empty CLIENT_REQUEST messages that parse
// successfully are queued for ProcessTd, the sender's reply address is
// remembered on first contact, and a one-way-delay sample is queued for OWDTd
// when the local clock is ahead of the request's send time.
void MsgHandlerFunc(MessageHeader* msgHeader, char* msgBuffer, Address* sender,
                    void* context) {
  const bool isClientRequest =
      msgHeader->msgType == MessageType::CLIENT_REQUEST &&
      msgHeader->msgLen > 0;
  if (!isClientRequest) {
    return;
  }

  nezha::proto::Request request;
  if (!request.ParseFromArray(msgBuffer, msgHeader->msgLen)) {
    return;
  }

  // First message from this client: record where to send OWD replies.
  if (clientAddrs.get(request.clientid()) == NULL) {
    clientAddrs.assign(
        request.clientid(),
        new Address(sender->GetIPAsString(), FLAGS_client_port));
  }

  processQu.enqueue(request);

  // Only a positive delay is usable as a sample.
  const uint64_t arrivalTime = GetMicrosecondTimestamp();
  if (arrivalTime > request.sendtime()) {
    const uint32_t owd = arrivalTime - request.sendtime();
    owdQu.enqueue({request.clientid(), owd});
  }
}
// Worker loop that records the order in which requests are released.
//
// With FLAGS_enable_dom == 1, each dequeued request is placed in a buffer
// ordered by (sendtime + bound, request key) and recorded once its deadline
// has passed; otherwise requests are recorded in dequeue order. After 60
// seconds, or once the trace reaches its capacity, the trace is written to
// "Replica-Stats-<replica_id>.csv" (tab-separated clientId/reqId) and the
// process exits.
void ProcessTd() {
  const size_t kMaxTraceSize = 10000000ul;
  const uint64_t kMaxRunTimeUs = 60 * 1000ul * 1000ul;

  traceVec.reserve(kMaxTraceSize);
  nezha::proto::Request request;
  std::map<std::pair<uint64_t, uint64_t>, nezha::proto::Request> earlyBuffer;
  uint64_t startTime = GetMicrosecondTimestamp();
  LOG(INFO) << "FLAGS_enable_dom=" << FLAGS_enable_dom;
  while (true) {
    if (FLAGS_enable_dom == 1) {
      if (processQu.try_dequeue(request)) {
        uint64_t deadline = request.sendtime() + request.bound();
        uint64_t reqKey = CONCAT_UINT32(request.clientid(), request.reqid());
        earlyBuffer.insert({{deadline, reqKey}, request});
      }
      // Release every buffered request whose deadline has been reached.
      uint64_t nowTime = GetMicrosecondTimestamp();
      while (earlyBuffer.empty() == false &&
             earlyBuffer.begin()->first.first <= nowTime) {
        traceVec.push_back({earlyBuffer.begin()->second.clientid(),
                            earlyBuffer.begin()->second.reqid()});
        earlyBuffer.erase(earlyBuffer.begin());
        if (traceVec.size() >= kMaxTraceSize) {
          break;
        }
      }
    } else {
      while (processQu.try_dequeue(request)) {
        traceVec.push_back({request.clientid(), request.reqid()});
        if (traceVec.size() >= kMaxTraceSize) {
          break;
        }
      }
    }
    uint64_t nowTime = GetMicrosecondTimestamp();

    if (nowTime - startTime >= kMaxRunTimeUs ||
        traceVec.size() >= kMaxTraceSize) {
      LOG(INFO) << "To terminated ..." << traceVec.size();
      std::ofstream ofs("Replica-Stats-" + std::to_string(FLAGS_replica_id) +
                        ".csv");
      // ofs << "ClientID,ReqID" << std::endl;
      for (auto& p : traceVec) {
        // '\n' instead of std::endl: no flush per row; close() flushes once.
        ofs << p.first << "\t" << p.second << "\n";
      }
      ofs.close();
      exit(0);
    }
  }
}

void OWDTd() {
  std::pair<uint32_t, uint32_t> owdSample;
  std::map<uint32_t, std::vector<uint32_t>> owdMap;
  std::map<uint32_t, uint32_t> owdCnt;
  UDPSocketEndpoint* replyEP = dynamic_cast<UDPSocketEndpoint*>(
      CreateEndpoint(EndpointType::UDP_ENDPOINT));
  nezha::proto::Reply reply;
  reply.set_replicaid(FLAGS_replica_id);
  while (true) {
    if (owdQu.try_dequeue(owdSample)) {
      uint32_t senderId = owdSample.first;
      uint32_t owd = owdSample.second;
      if (owdMap.find(senderId) == owdMap.end()) {
        owdMap[senderId].resize(1000);
        owdCnt[senderId] = 0;
      }
      owdMap[senderId][owdCnt[senderId] % 1000] = owd;
      owdCnt[senderId]++;
      if (owdCnt[senderId] % 1000 == 0) {
        std::vector<uint32_t> temp = owdMap[senderId];
        sort(temp.begin(), temp.end());
        uint32_t estimate = temp[1000 * FLAGS_percentile / 100];
        reply.set_clientid(senderId);
        reply.set_owd(estimate +
                      10);  // plus the 3 * error bound (sigma1+sigma2), the
                            // sigma ranges 1-3, here we plus 10 to simulate it
        Address* clientAddr = clientAddrs.get(senderId);
        if (clientAddr) {
          // LOG(INFO) << "Send to " << senderId << "\t" << estimate;
          replyEP->SendMsgTo(*clientAddr, reply, MessageType::FAST_REPLY);
        }
      }
    }
  }
}

int main(int argc, char* argv[]) {
  gflags::ParseCommandLineFlags(&argc, &argv, true);
  google::InitGoogleLogging(argv[0]);
  FLAGS_logtostderr = 1;
  std::thread* processTd = new std::thread(ProcessTd);
  std::thread* owdTd = new std::thread(OWDTd);
  Endpoint* requestEP = CreateEndpoint(
      EndpointType::UDP_ENDPOINT, FLAGS_receiver_ip, FLAGS_receiver_port, true);
  UDPMsgHandler* msgHandler = new UDPMsgHandler(MsgHandlerFunc);
  requestEP->RegisterMsgHandler(msgHandler);
  requestEP->LoopRun();
  processTd->join();
  owdTd->join();

  delete requestEP;
  delete processTd;
  delete owdTd;
}

================================================
FILE: micro-bench/bench_sender.cc
================================================
#include <fstream>
#include <iostream>
#include "lib/utils.h"
#include "lib/zipfian.h"
#include "proto/nezha_proto.pb.h"
// Addresses of up to five receivers; only the first receiver_num are used.
DEFINE_string(receiver_1_ip, "127.0.0.1", "The ip address of the 1st receiver");
DEFINE_string(receiver_2_ip, "127.0.0.1", "The ip address of the 2nd receiver");
DEFINE_string(receiver_3_ip, "127.0.0.1", "The ip address of the 3rd receiver");
DEFINE_string(receiver_4_ip, "127.0.0.1", "The ip address of the 4th receiver");
DEFINE_string(receiver_5_ip, "127.0.0.1", "The ip address of the 5th receiver");

DEFINE_int32(receiver_1_port, 33333, "The port of the 1st receiver");
DEFINE_int32(receiver_2_port, 33333, "The port of the 2nd receiver");
DEFINE_int32(receiver_3_port, 33333, "The port of the 3rd receiver");
DEFINE_int32(receiver_4_port, 33333, "The port of the 4th receiver");
DEFINE_int32(receiver_5_port, 33333, "The port of the 5th receiver");

// Number of receivers each request is sent to; also sizes latencyBounds.
DEFINE_int32(receiver_num, 2, "The number of receivers to test");

// Local address on which OWD replies from the receivers are accepted.
DEFINE_string(client_ip, "127.0.0.1", "The ip address of the client");

DEFINE_int32(client_port, 33336,
             "The port of the client listens for OWD reply");

// main() draws the per-10ms request count from a Poisson distribution whose
// mean is poisson_rate / 100.
DEFINE_uint64(poisson_rate, 10000, "Request Per Second");

DEFINE_uint64(duration, 60, "Duration of the experiment");

// Stamped on every request and used as the random seed.
DEFINE_uint64(client_id, 1, "Client ID");

// Latest OWD estimate reported by each receiver, indexed by replica id.
std::vector<uint32_t> latencyBounds;
// Maximum of latencyBounds; attached to every outgoing request.
std::atomic<uint32_t> bound;

// Handles an OWD reply from a receiver: stores the reported estimate for that
// replica and publishes the maximum over all replicas as the new bound.
// Replies with an estimate outside (0, 200) or with a replica id that does not
// index latencyBounds are ignored.
void ReplyHandlerFunc(MessageHeader* msgHeader, char* msgBuffer,
                      Address* sender, void* context) {
  if (msgHeader->msgType != MessageType::FAST_REPLY ||
      msgHeader->msgLen <= 0) {
    return;
  }
  nezha::proto::Reply reply;
  if (!reply.ParseFromArray(msgBuffer, msgHeader->msgLen)) {
    return;
  }
  // LOG(INFO) << "replyOWD " << reply.owd() << "\t" << reply.replicaid();
  if (!(reply.owd() > 0 && reply.owd() < 200)) {
    return;
  }
  // The replica id comes off the network; never index the table with a value
  // outside its bounds.
  const size_t replicaIdx = static_cast<size_t>(reply.replicaid());
  if (replicaIdx >= latencyBounds.size()) {
    return;
  }
  latencyBounds[replicaIdx] = reply.owd();
  auto it =
      std::max_element(std::begin(latencyBounds), std::end(latencyBounds));

  if (*it != bound) {
    bound.store(*it);
  }
}

void OWDUpdate() {
  latencyBounds.resize(FLAGS_receiver_num, 80);
  bound = 80;
  UDPSocketEndpoint* replyEP = dynamic_cast<UDPSocketEndpoint*>(CreateEndpoint(
      EndpointType::UDP_ENDPOINT, FLAGS_client_ip, FLAGS_client_port));
  UDPMsgHandler* msgHandler = new UDPMsgHandler(ReplyHandlerFunc);
  replyEP->RegisterMsgHandler(msgHandler);
  replyEP->LoopRun();
}

int main(int argc, char* argv[]) {
  gflags::ParseCommandLineFlags(&argc, &argv, true);
  google::InitGoogleLogging(argv[0]);
  FLAGS_logtostderr = 1;
  Endpoint* requestEP =
      CreateEndpoint(EndpointType::UDP_ENDPOINT, "", -1, true);
  LOG(INFO) << "ClientId = " << FLAGS_client_id << "\t"
            << " rate=" << FLAGS_poisson_rate;
  std::vector<uint32_t> reqPer10msVec;
  reqPer10msVec.reserve(FLAGS_duration * 100);
  std::default_random_engine generator(
      FLAGS_client_id);  // clientId as the seed
  std::poisson_distribution<int> distribution(FLAGS_poisson_rate / 100);
  for (uint32_t i = 0; i < FLAGS_duration * 100; i++) {
    reqPer10msVec.push_back(distribution(generator));
  }
  uint32_t maxReqId = FLAGS_poisson_rate * (FLAGS_duration - 10);
  std::thread* replyTd = new std::thread(OWDUpdate);
  uint32_t reqCnt = 0;
  std::vector<Address*> receiverAddrs;
  receiverAddrs.resize(5, NULL);
  receiverAddrs[0] = new Address(FLAGS_receiver_1_ip, FLAGS_receiver_1_port);
  receiverAddrs[1] = new Address(FLAGS_receiver_2_ip, FLAGS_receiver_2_port);
  receiverAddrs[2] = new Address(FLAGS_receiver_3_ip, FLAGS_receiver_3_port);
  receiverAddrs[3] = new Address(FLAGS_receiver_4_ip, FLAGS_receiver_4_port);
  receiverAddrs[4] = new Address(FLAGS_receiver_5_ip, FLAGS_receiver_5_port);
  nezha::proto::Request request;
  request.set_clientid(FLAGS_client_id);
  srand(FLAGS_client_id);
  for (uint32_t i = 0; i < reqPer10msVec.size(); i++) {
    uint32_t reqNum = reqPer10msVec[i];
    if (reqNum <= 0) {
      usleep(10000);
    } else {
      uint32_t intval = 10000 / reqNum;
      uint64_t nowTime = GetMicrosecondTimestamp();
      for (uint32_t j = 1; j <= reqNum; j++) {
        while (GetMicrosecondTimestamp() < nowTime + intval * j) {
        }
        uint64_t sendTime = GetMicrosecondTimestamp();
        request.set_sendtime(sendTime);
        request.set_bound(bound);
        request.set_reqid(reqCnt + 1);
        for (int k = 0; k < FLAGS_receiver_num; k++) {
          requestEP->SendMsgTo(*(receiverAddrs[k]), request,
                               MessageType::CLIENT_REQUEST);
        }

        reqCnt++;
        if (reqCnt >= maxReqId) {
          LOG(INFO) << "reqCnt=" << reqCnt << "\tTerminate Here";
          exit(0);
        }
      }
    }
  }
  delete requestEP;
  replyTd->join();
  delete replyTd;
}

================================================
FILE: micro-bench/launch_micro.py
================================================
import os
import subprocess
from subprocess import PIPE, Popen
import time
import ruamel.yaml
from termcolor import colored
import argparse


# NOTE(review): LOGIN_PATH is not referenced in the visible part of this
# script; presumably the remote home directory -- confirm.
LOGIN_PATH = "/home/steam1994"
# Prefix of the VM names built in the __main__ block.
TAG = "opensource-test"
# Private key passed to ssh/scp via "-i"; an empty value omits the option.
SSH_KEY = "/home/steam1994/.ssh/id_rsa"
ssh_identity = '-i {}'.format(SSH_KEY) if SSH_KEY else ''
# Prefix for SSH and SCP.
SSH = 'ssh {} -q -o ConnectTimeout=2 -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no '.format(
    ssh_identity)
SCP = 'scp -r {} -q -o ConnectTimeout=2 -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no '.format(
    ssh_identity)
# Remote account used for every ssh/scp invocation.
USERNAME = "steam1994"
# Upper bound on the retry rounds in scp_files() and run_command().
CMD_RETRY_TIMES = 3


def generate_ttcs_cfg_file(internal_ip, is_reference=False, use_ntp=False):
    '''
    writes "ttcs-agent.cfg" into the current working directory and returns
    that file name

    args
        internal_ip: address substituted for management_address and
            probe_address
        is_reference: reference node -> clock_quality 10, correct_clock false
        use_ntp: (non-reference only) true -> correct_clock false,
            false -> correct_clock true; clock_quality is 1 in both cases
    returns
        the name of the file that was written
    '''
    if is_reference:
        clock_quality, correct_clock = "10", "false"
    elif use_ntp:
        clock_quality, correct_clock = "1", "false"
    else:
        clock_quality, correct_clock = "1", "true"

    # The file is written without a trailing newline.
    template = "\n".join([
        'management_address: "InternalIP"',
        'log_dir: "/var/opt/ttcs/log"',
        'subscription_mode: true',
        'coordinator_address: "c-gjk1994gjk1994-c89e.gcp.clockwork.io"',
        'coordinator_subscription_service_port: 6176',
        'probe_address: "InternalIP"',
        'clock_quality: ' + clock_quality,
        'correct_clock: ' + correct_clock,
    ])
    rendered = template.replace("InternalIP", internal_ip)

    cfg_file_name = "ttcs-agent.cfg"
    with open(cfg_file_name, "w") as handle:
        handle.write(rendered)
    return cfg_file_name


def retry_proc_error(procs_list):
    '''
    waits for every process in 'procs_list' and re-launches the ones that
    exited with a non-zero return code

    args
        procs_list: iterable of (server, proc, cmd) tuples
    returns
        list of (server, new_proc, cmd) for the re-launched commands; the new
        processes have not been waited for
    '''
    relaunched = []
    for server, proc, cmd in procs_list:
        proc.communicate()
        if proc.returncode == 0:
            continue
        # Start the same command again with fresh pipes.
        new_proc = Popen(cmd.split(), stdout=PIPE, stderr=PIPE)
        relaunched.append((server, new_proc, cmd))
    return relaunched


def start_ttcs_node(internal_ip, is_reference, use_ntp=False):
    '''
    reinstalls the ttcs-agent package on 'internal_ip', uploads a freshly
    generated ttcs-agent.cfg, switches the NTP services and starts the agent

    args
        internal_ip: address of the node to set up
        is_reference: forwarded to generate_ttcs_cfg_file
        use_ntp: forwarded to generate_ttcs_cfg_file; ntp and chronyd are
            stopped and disabled only when is_reference is not True and
            use_ntp is False, otherwise chronyd is enabled and started
    '''
    def run_each(commands):
        # Every step runs in the foreground, one after the other.
        for command in commands:
            run_command([internal_ip], command, in_background=False)

    run_each([
        "sudo dpkg -P ttcs-agent",
        "sudo dpkg -i /home/steam1994/ttcs-agent_1.0.21_amd64.deb",
    ])

    # Writes ./ttcs-agent.cfg, which is uploaded below.
    generate_ttcs_cfg_file(internal_ip, is_reference, use_ntp)
    local_file_path = "./ttcs-agent.cfg"
    remote_dir = "/etc/opt/ttcs"
    remote_path = remote_dir + "/ttcs-agent.cfg"

    run_each([
        "sudo chmod -R 777 " + remote_dir,
        "sudo rm -f " + remote_path,
    ])
    scp_files([internal_ip], local_file_path, remote_path, to_remote=True)

    if is_reference is not True and use_ntp is False:
        run_each([
            "sudo systemctl stop ntp",
            "sudo systemctl disable ntp",
            "sudo systemctl stop chronyd",
            "sudo systemctl disable chronyd",
        ])
    else:
        run_each([
            "sudo systemctl enable chronyd",
            "sudo systemctl start chronyd",
        ])

    run_each(["sudo systemctl start ttcs-agent"])


def launch_ttcs(server_ip_list):
    '''
    stops and disables chronyd and ntp on every server in 'server_ip_list',
    then starts the ttcs-agent service there

    args
        server_ip_list: list of IP addresses to run the commands on
    '''
    commands = (
        "sudo systemctl stop chronyd",
        "sudo systemctl disable chronyd",
        "sudo systemctl stop ntp",
        "sudo systemctl disable ntp",
        "sudo systemctl start ttcs-agent",
    )
    for command in commands:
        run_command(server_ip_list, command, in_background=False)


def scp_files(server_ip_list, local_path_to_file, remote_dir, to_remote):
    '''
    copies the file in 'local_path_to_file' to the 'remote_dir' in all servers
    whose external ip addresses are in 'server_ip_list'

    A copy that fails is re-launched up to CMD_RETRY_TIMES times; the
    operation fails only if the last re-launch of some copy still fails.

    args
        server_ip_list: list of external IP addresses to communicate with
        local_path_to_file: e.g. ./script.py
        remote_dir: e.g. ~
        to_remote: whether to copy to remote (true) or vice versa (false)
    returns
        boolean whether operation was successful on all servers or not
    '''
    src = remote_dir if not to_remote else local_path_to_file
    src_loc = 'remote' if not to_remote else 'local'
    dst = remote_dir if to_remote else local_path_to_file
    dst_loc = 'remote' if to_remote else 'local'

    message = 'from ({src_loc}) {src} to ({dst_loc}) {dst}'.format(
        src_loc=src_loc, src=src, dst_loc=dst_loc, dst=dst)
    print('---- started scp {}'.format(message))

    procs = []
    for server in server_ip_list:
        if to_remote:
            cmd = '{} {} {}@{}:{}'.format(SCP, local_path_to_file,
                                          USERNAME, server, remote_dir)
        else:
            cmd = '{} {}@{}:{} {}'.format(SCP, USERNAME, server,
                                          remote_dir, local_path_to_file)
        proc = Popen(cmd.split(), stdout=PIPE, stderr=PIPE)
        procs.append((server, proc, cmd))

    # Each round waits for the outstanding copies and re-launches the failed
    # ones. Later rounds must inspect the re-launched processes, not the
    # original ones, otherwise a successful retry is never noticed.
    pending = retry_proc_error(procs)
    retries = 1
    while retries < CMD_RETRY_TIMES and pending:
        pending = retry_proc_error(pending)
        retries += 1

    # Whatever is still pending is the last attempt; judge it by its result.
    success = True
    for server, proc, cmd in pending:
        output, err = proc.communicate()
        if proc.returncode != 0:
            success = False
            print(
                colored('[{}]: FAIL SCP - [{}]'.format(server, cmd),
                        'yellow'))
            print(colored('Error Response:', 'blue', attrs=['bold']),
                  proc.returncode, output, err)

    if success:
        print(
            colored('---- SUCCESS SCP {} on {}'.format(message,
                                                       str(server_ip_list)),
                    'green',
                    attrs=['bold']))
    else:
        print(
            colored('---- FAIL SCP {}'.format(message), 'red', attrs=['bold']))
    return success


def run_command(server_ip_list, cmd, in_background=True):
    '''
    runs the command 'cmd' in all servers whose external ip addresses are 
    in 'server_ip_list'

    In background mode the ssh processes are started and not waited for, so
    the result is always (True, ''). In foreground mode a failing command is
    re-launched up to CMD_RETRY_TIMES times and the call fails only if the
    last re-launch on some server still fails.

    args
        server_ip_list: list of external IP addresses to communicate with
        cmd: command to run
        in_background: when true, do not wait for the commands to finish
    returns
        (success, output): success is a boolean whether operation was
        successful on all servers or not; output is '' unless a command
        finally failed, in which case it is that command's captured stdout
    '''
    if not in_background:
        print('---- started to run command - [{}] on {}'.format(
            cmd, str(server_ip_list)))
    else:
        print(
            colored('---- started to run [IN BACKGROUND] command - [{}] on {}'.
                    format(cmd, str(server_ip_list)),
                    'blue',
                    attrs=['bold']))
    procs = []
    for server in server_ip_list:
        ssh_cmd = '{} {}@{} {}'.format(SSH, USERNAME, server, cmd)
        proc = Popen(ssh_cmd.split(), stdout=PIPE, stderr=PIPE)
        procs.append((server, proc, ssh_cmd))

    success = True
    output = ''
    if not in_background:
        # Each round waits for the outstanding commands and re-launches the
        # failed ones. Later rounds must inspect the re-launched processes,
        # not the original ones, otherwise a successful retry goes unnoticed.
        pending = retry_proc_error(procs)
        retries = 1
        while retries < CMD_RETRY_TIMES and pending:
            pending = retry_proc_error(pending)
            retries += 1

        # Whatever is still pending is the last attempt. The loop variable is
        # named failed_cmd so the 'cmd' argument stays intact for the summary.
        for server, proc, failed_cmd in pending:
            last_output, err = proc.communicate()
            if proc.returncode != 0:
                success = False
                output = last_output
                print(
                    colored(
                        '[{}]: FAIL run command - [{}]'.format(
                            server, failed_cmd), 'yellow'))
                print(colored('Error Response:', 'blue', attrs=['bold']),
                      proc.returncode, last_output, err)

        if success:
            print(
                colored('---- SUCCESS run command - [{}] on {}'.format(
                    cmd, str(server_ip_list)),
                        'green',
                        attrs=['bold']))
        else:
            print(
                colored('---- FAIL run command - [{}]'.format(cmd),
                        'red',
                        attrs=['bold']))

    return success, output


def create_instance(instance_name,
                    image=None,
                    machine_type = "n1-standard-4",
                    customzedZone = "us-central1-a",
                    customzedIp = None,
                    require_external_ip=False,
                    second_ip = False
                    ):
    '''
    creates one VM with "gcloud beta compute instances create" and prints the
    command's output if it fails

    args
        instance_name: name of the VM to create
        image: value passed to --image-family
        machine_type: value passed to --machine-type
        customzedZone: value passed to --zone
        customzedIp: private IP for the first network interface, or None
        require_external_ip: when False the first interface gets "no-address"
        second_ip: when true, adds a second interface on subnet-1 without an
            external address
    '''
    # Construct gcloud command to create instance.

    # Options of the first network interface. They are collected first so the
    # "--network-interface" prefix is present whenever any option is set,
    # including a custom private IP combined with an external address.
    interface_options = []
    if require_external_ip == False:
        interface_options.append("no-address")
    if customzedIp is not None:
        interface_options.append("private-network-ip=" + customzedIp)

    network_address_config = ""
    if interface_options:
        network_address_config = ("--network-interface " +
                                  ",".join(interface_options))

    if second_ip:
        network_address_config += " --network-interface subnet=subnet-1,no-address"
    # scopes = "--scopes storage-full,https://www.googleapis.com/auth/bigtable.admin,https://www.googleapis.com/auth/bigtable.data,https://www.googleapis.com/auth/bigquery"
    # if full_access_to_cloud_apis:
    scopes = "--scopes=https://www.googleapis.com/auth/cloud-platform"

    create_instance_cmd = """gcloud beta compute instances create {inst} --zone {zone} --image-family {source_image} --machine-type {machine_type} {network} {scopes} --boot-disk-size 50GB""".format(
        inst=instance_name,
        zone=customzedZone,
        source_image=image,
        machine_type=machine_type,
        network=network_address_config,
        scopes=scopes,
    )

    # print(create_instance_cmd)
    # Run gcloud command to create machine.
    proc = Popen(create_instance_cmd, stdout=PIPE, stderr=PIPE, shell=True)
    # Wait for the process end and print error in case of failure
    output, error = proc.communicate()
    if proc.returncode != 0:
        print(colored("Failed to create instance", color="red",
                      attrs=["bold"]))
        print(colored("Error Response: ", color="blue", attrs=["bold"]),
              output, error)


def del_instance_list(instance_list, zone="us-central1-a"):
    '''
    starts one "gcloud compute instances delete" process per instance and
    does not wait for any of them

    args
        instance_list: names of the instances to delete
        zone: zone the instances live in
    '''
    delete_template = 'gcloud -q compute instances delete {inst} --zone {zone}'
    for machine in instance_list:
        print(colored("Deleting "+machine, "red", attrs=['bold']))
        delete_argv = delete_template.format(inst=machine, zone=zone).split()
        subprocess.Popen(delete_argv)

def stop_instance_list(instance_list, zone="us-central1-a"):
    """Stop all instances in `instance_list` with a single gcloud call.

    The command line is echoed to stdout before it is handed to os.system,
    which blocks until gcloud returns.
    """
    instance_names = ' '.join(instance_list)
    stop_cmd = 'gcloud compute instances stop %s --zone %s' % (
        instance_names, zone)
    print(stop_cmd)
    os.system(stop_cmd)


def start_instance_list(instance_list, zone="us-central1-a"):
    """Start all instances in `instance_list` with a single gcloud call.

    The command line is echoed to stdout before it is handed to os.system,
    which blocks until gcloud returns.
    """
    instance_names = ' '.join(instance_list)
    start_cmd = 'gcloud compute instances start %s --zone %s' % (
        instance_names, zone)
    print(start_cmd)
    os.system(start_cmd)

# Driver for the UDP micro-benchmark: pushes the bench_sender/bench_receiver
# binaries to the VMs, launches receivers ("replicas") and senders ("clients"),
# waits for the run to finish and pulls the per-replica stats CSVs back.
# Proxies are only part of the VM list here; no proxy process is launched.
if __name__ == '__main__':
    # NOTE(review): the description string is argparse boilerplate and does not
    # describe this script.
    parser = argparse.ArgumentParser(description='Process some integers.')
    parser.add_argument('--num_replicas',  type=int, default = 3,
                        help='Specify the number of replicas ')
    parser.add_argument('--num_proxies',  type=int, default = 2,
                        help='Specify the number of proxies ')
    parser.add_argument('--num_clients',  type=int, default = 10,
                        help='Specify the number of clients ')
    args = parser.parse_args()

    num_replicas = args.num_replicas
    num_proxies = args.num_proxies
    num_clients = args.num_clients
    print("replicas: ", num_replicas)
    print("proxies: ", num_proxies)
    print("clients: ", num_clients)

    
    # cfg_file_name = generate_ttcs_cfg_file("10.128.3.79", is_reference=True, use_ntp=False)
    
    # Fixed private-IP pools: replicas 10.128.2.10-19, proxies 10.128.2.20-29,
    # clients 10.128.2.30-129. The pools cap the usable counts at 10/10/100;
    # the slices below silently truncate, so asking for more replicas than the
    # pool holds makes the later replica_ips[i] lookups fail.
    replica_ips = ["10.128.2."+str(i+10) for i in range(10)]
    proxy_ips = ["10.128.2."+str(i+20) for i in range(10) ]
    client_ips = ["10.128.2."+str(i+30) for i in range(100) ]

    replica_ips = replica_ips[0:num_replicas]
    proxy_ips = proxy_ips[0:num_proxies]
    client_ips = client_ips[0:num_clients]

    # VM names are derived from the module-level TAG plus role and index.
    replica_name_list = [TAG+"-replica-"+str(i) for i in range(num_replicas) ]
    proxy_name_list = [ TAG+"-proxy-"+str(i) for i in range(num_proxies) ]
    client_name_list = [ TAG+"-client-"+str(i) for i in range(num_clients) ]

    # vm_ips is captured BEFORE replica_ips is padded with loopback addresses
    # further down, so it only ever contains real VM addresses.
    vm_ips = replica_ips + proxy_ips + client_ips
    vm_name_list = replica_name_list + proxy_name_list + client_name_list

    # Machine types are only consumed by the (commented-out) create_instance
    # calls below.
    replica_vm_type = "n1-standard-16"
    proxy_vm_type = "n1-standard-32"
    client_vm_type = "n1-standard-4"

    # The same path is used locally and on the remote VMs (see scp_files below).
    binary_path = "{login_path}/nezhav2/bazel-bin/".format(login_path = LOGIN_PATH)

    config_path = "{login_path}/nezhav2/configs".format(login_path = LOGIN_PATH)

    yaml = ruamel.yaml.YAML()


    # --- One-off VM lifecycle steps, enabled by hand when needed -------------
    # for i in range(num_replicas):
    #     create_instance(instance_name = replica_name_list[i],
    #                     image= "opensource-nezha",
    #                     machine_type =  replica_vm_type,
    #                     customzedZone="us-central1-a",
    #                     customzedIp = replica_ips[i] )
    #     print(colored("Created "+replica_name_list[i], "green", attrs=['bold']))
        
    # exit(0)

    # for i in range(num_proxies):
    #     create_instance(instance_name = proxy_name_list[i],
    #                     image= "opensource-nezha",
    #                     machine_type =  proxy_vm_type,
    #                     customzedZone="us-central1-a",
    #                     customzedIp = proxy_ips[i] )
    #     print(colored("Created "+proxy_name_list[i], "green", attrs=['bold']))
        

    # for i in range(num_clients):
    #     create_instance(instance_name = client_name_list[i],
    #                     image= "opensource-nezha",
    #                     machine_type =  client_vm_type,
    #                     customzedZone="us-central1-a",
    #                     customzedIp = client_ips[i] )
    #     print(colored("Created "+client_name_list[i], "green", attrs=['bold']))


    # time.sleep(120)
    # for i in range(len(vm_ips)):
    #     start_ttcs_node(vm_ips[i],False)
    # exit(0)

    #### del_instance_list(instance_list=vm_name_list)


    # stop_instance_list(instance_list = vm_name_list)
    # exit(0)


    # start_instance_list(instance_list = vm_name_list)
    # time.sleep(60)
    # print(vm_ips)
    # launch_ttcs(vm_ips)
    # exit(0)

    # start_ttcs_node(replica_ips[3],False)
    # exit(0)

    # --- Benchmark parameters -------------------------------------------------
    # test_no and percentile are immediately overwritten by the loop variables
    # below; enable_dom and poisson_rate are passed straight to the binaries.
    test_no = 1
    enable_dom =1
    # enable_dom = 1
    #poisson_rate = 10000
    poisson_rate = 5000
    percentile = 50
    # bench_sender is always given five --receiver_N_ip flags, so pad the list
    # with loopback placeholders; --receiver_num carries the real count.
    while len(replica_ips) < 5:
        replica_ips += ["127.0.0.1"]
    print(replica_ips)
    # Five repetitions (test_no 1..5) per percentile setting.
    for test_no in range(1,6):
        for percentile in [50]: #[50,75,90,95]:
            # Wipe any previously deployed binaries on every VM, then recreate
            # the micro-bench directory and copy the fresh binaries over.
            remote_path = "{login_path}/nezhav2/bazel-bin/*".format(login_path = LOGIN_PATH)
            rm_cmd = "sudo rm -rf {remote_path}".format(remote_path=remote_path)
            run_command(vm_ips, rm_cmd, in_background=False)

            mkdir_cmd = "mkdir -p {binary_path}/micro-bench".format(binary_path = binary_path)
            run_command(vm_ips, mkdir_cmd, in_background=False)

            
            binary_file = "{binary_path}/micro-bench/bench_sender".format(binary_path=binary_path)
            scp_files(vm_ips, binary_file, binary_file, to_remote = True)

            binary_file = "{binary_path}/micro-bench/bench_receiver".format(binary_path=binary_path)
            scp_files(vm_ips, binary_file, binary_file, to_remote = True)

            # Kill existing procs
            kill_cmd = "sudo pkill -9 bench_receiver"
            run_command(vm_ips, kill_cmd, in_background=False)
            kill_cmd = "sudo pkill -9 bench_sender"
            run_command(vm_ips, kill_cmd, in_background=False)

            # Remove stats files left over from the previous iteration so the
            # copy step below cannot pick up stale data.
            rm_cmd = "sudo rm -rf Replica-Stats*.csv"
            run_command(vm_ips, rm_cmd, in_background=False)


            ## Launch replicas (id starts from 0)
            # The command itself ends in '&', so the receiver is backgrounded
            # by the remote shell even though in_background=False is passed.
            for i in range(num_replicas):
                replica_cmd = "{binary_path}/micro-bench/bench_receiver --receiver_ip {ip} --replica_id {id} --enable_dom {enable_dom} --percentile {percentile} >{log_file} 2>&1 &".format(
                binary_path = binary_path,
                ip = replica_ips[i],
                id = i,
                enable_dom = enable_dom,
                percentile = percentile,
                log_file = "receiver-log-"+str(i)
                )
                print(colored(replica_cmd, "yellow", attrs=['bold']))
                run_command([replica_ips[i]], replica_cmd, in_background=False)


            # Launch clients (id starts from 1: client_id and the log suffix are i+1)
            for i in range(num_clients):
                client_cmd = "{binary_path}/micro-bench/bench_sender --receiver_1_ip {ip1} --receiver_2_ip {ip2} --receiver_3_ip {ip3} --receiver_4_ip {ip4} --receiver_5_ip {ip5} --receiver_num {receiver_num} --client_ip {myip} --poisson_rate {poisson_rate} --client_id {id}   >{log_file} 2>&1 &".format(
                    binary_path = binary_path,
                    ip1 = replica_ips[0], 
                    ip2 = replica_ips[1],
                    ip3 = replica_ips[2], 
                    ip4 = replica_ips[3],
                    ip5 = replica_ips[4], 
                    receiver_num = num_replicas,
                    myip = client_ips[i],
                    poisson_rate = poisson_rate,
                    id =  i+1,
                    log_file = "client-log-"+str(i+1) 
                ) 
                print(colored(client_cmd, "yellow", attrs=['bold']))
                run_command([client_ips[i]], client_cmd, in_background = True)
                
            # exit(0)
            # Fixed wait for the benchmark to run; there is no completion check.
            print("Sleep...")
            time.sleep(90)

            # Copy Stats File
            # Local destination: <LOGIN_PATH>/micro-stats/T-<test_no>-<replicas>-
            # <clients>-<rate>-<enable_dom>-<percentile>/
            folder_name = "micro-stats"
            sub_folder_name = "T-{test_no}-{num_replicas}-{num_clients}-{poisson_rate}-{enable_dom}-{percentile}".format(
                test_no = test_no,
                num_replicas  = num_replicas,
                num_clients = num_clients, 
                poisson_rate = poisson_rate, 
                enable_dom = enable_dom,
                percentile = percentile
            )
            stats_folder = "{login_path}/{folder_name}/{sub_folder_name}".format(
                login_path = LOGIN_PATH,
                folder_name = folder_name,
                sub_folder_name = sub_folder_name
            )
            mkdir_cmd = "sudo mkdir -p -m 777 {stats_folder}".format(stats_folder = stats_folder)
            os.system(mkdir_cmd)


            # The remote file is looked up directly under LOGIN_PATH (hence
            # stats_folder = LOGIN_PATH in remote_path) and copied into the
            # local per-run stats folder.
            # NOTE(review): to_remote=False presumably means "remote -> local";
            # confirm against scp_files.
            for i in range(num_replicas):
                file_name = "Replica-Stats-"+str(i)+".csv"
                local_file_path = "{stats_folder}/{file_name}".format(
                    stats_folder = stats_folder,
                    file_name = file_name
                )
                remote_path = "{stats_folder}/{file_name}".format(
                    stats_folder = LOGIN_PATH,
                    file_name = file_name
                )
                scp_files([replica_ips[i]], local_file_path, remote_path, to_remote=False)

================================================
FILE: proto/BUILD
================================================
load("@rules_proto//proto:defs.bzl", "proto_library")


# Language-neutral description of the message schema in nezha_proto.proto.
# Publicly visible so any package in the workspace can depend on it.
proto_library(
    name = "nezha_proto",
    srcs = ["nezha_proto.proto"],
    visibility = ["//visibility:public"],
)

# C++ bindings generated from :nezha_proto; this is the target that C++ code
# such as //proxy:proxy_class lists in its deps.
cc_proto_library(
    name = "nezha_cc_proto",
    deps = [":nezha_proto"],
    visibility = ["//visibility:public"],
)


================================================
FILE: proto/nezha_proto.proto
================================================
syntax = "proto3";
package nezha.proto;


// Client request. The proxy receives it from a client, stamps sendtime, bound
// and proxyid (see Proxy::ForwardRequestsTd) and forwards it to every replica.
message Request {
    uint64 sendtime = 1; // overwritten by the proxy with GetMicrosecondTimestamp() when forwarding
    uint32 bound=2;      // overwritten by the proxy with its current latency bound
    uint32 clientid = 3; // together with reqid forms the 64-bit request key used by the proxy
    uint32 reqid = 4;
    bytes command=5;
    uint64 proxyid = 6;  // overwritten by the proxy with the id of the forwarding shard
    uint32 key = 7;
    bool iswrite = 8;
}

// Body of a single request as exchanged between replicas (carried in
// MissedReq and StateTransferReply).
// NOTE(review): reqkey presumably packs <clientid, reqid> and deadline is
// presumably sendtime + bound; confirm against the replica code.
message RequestBodyMsg {
    uint64 deadline = 1;
    uint64 reqkey = 2;
    uint64 proxyid = 3;
    bytes command = 4;
    uint32 key = 5;
    bool iswrite = 6;
}
// Per-request timestamps collected along the request path.
// NOTE(review): units and the component that fills each field are not visible
// in the proxy code; confirm before relying on them.
message TimeStats {
    uint64 clienttime = 1;
    uint64 proxytime = 2;
    uint64 recvtime =3;
    uint64 fastreplytime = 4;
    uint64 slowreplytime= 5;
    uint64 deadline = 6;
}
// Reply sent by a replica to a proxy, and relayed by the proxy to the client
// once committed (see Proxy::CheckQuorumTd / Proxy::isQuorumReady).
// A reply with clientid == 0 and reqid == 0 is treated by the proxy as a dummy
// that only carries owd/sync-point updates.
message Reply {
    uint32 clientid = 1;
    uint32 reqid = 2;
    uint32 view = 3;      // replies from views older than the proxy's current view are dropped
    uint32 replicaid = 4; // the proxy treats view % replicaNum as the leader's id
    bytes hash = 5;       // a fast reply counts only if its hash equals the leader's hash
    bytes result = 6;
    uint32 replytype = 7; // MessageType value; the proxy relies on FAST_REPLY < SLOW_REPLY < COMMIT_REPLY
    uint32 owd = 8;       // when > 0, fed into the proxy's latency-bound estimate
    uint32 maxsyncedlogid = 9; // This is the largest syncedlogid of my synced logs
    uint32 logid = 10; // only set by the leader, it is the log id of the entry replied
    bool iswrite = 11;
    uint32 opkey = 12;
}

// Index synchronization message covering the log range [logidbegin, logidend].
// NOTE(review): presumably sent by the leader, with deadlines[i]/reqkeys[i]
// describing the i-th entry of the range; confirm against the replica code.
message IndexSync {
    uint32 logidbegin = 1;
    uint32 logidend = 2;
    repeated uint64 deadlines = 3;
    repeated uint64 reqkeys =4;
    uint32 view = 5;
    uint64 sendtime = 6;
}

// Request for the index entries of a log range.
// NOTE(review): presumably sent by replica `replicaid` to fetch IndexSync data
// it missed; confirm against the replica code.
message AskIndex {
    uint32 logidbegin = 1;
    uint32 logidend = 2;
    uint32 replicaid = 3;
}

// Request for the bodies of requests identified by missedreqkeys.
// NOTE(review): presumably answered with a MissedReq; confirm against the
// replica code.
message AskReq {
    repeated uint64 missedreqkeys = 1;
    uint32 replicaid = 2;
}

// Carries a batch of request bodies.
// NOTE(review): presumably the response to AskReq; confirm against the
// replica code.
message MissedReq {
    repeated RequestBodyMsg reqs = 1;
    uint32 replicaid = 2;
}

// View-change request issued by replica `replicaid` for `view`.
// NOTE(review): cv is presumably the sender's crash vector (cf.
// CrashVectorRequest/CrashVectorReply); confirm against the replica code.
message ViewChangeRequest{
    uint32 view = 1;
    uint32 replicaid = 2;
    repeated uint32 cv = 3;
}

// View-change message. Instead of shipping the log itself it describes which
// log ranges the sender holds; the entries are fetched afterwards through
// state transfer (see the inline comments below).
// NOTE(review): cv is presumably the sender's crash vector; confirm.
message ViewChange {
    uint32 view = 1;
    uint32 replicaid = 2;
    repeated uint32 cv = 3;
    uint32 lastnormalview= 4;
    // In the algorithm, the logs should be included in the view-change message,
    // but that is too large. As an implementation optimization, the following
    // fields are sent instead and state transfer later fetches the necessary entries.
    uint32 syncpoint = 5; // for synced logs: the max synced log id, no need to add syncbegin, because it is always CONCURRENT_MAP_START_IDX
    uint32 unsynclogbegin = 6;
    uint32 unsynclogend = 7; 
}


// Request for the log entries in [logbegin, logend].
// NOTE(review): issynced presumably selects between the synced and unsynced
// log; confirm against the replica code.
message StateTransferRequest {
    uint32 view = 1;
    uint32 replicaid = 2;
    bool issynced = 3;
    uint32 logbegin = 4;
    uint32 logend = 5;
}


// Answer to a StateTransferRequest: echoes the requested range and carries the
// request bodies in `reqs`.
// NOTE(review): cv is presumably the sender's crash vector; confirm.
message StateTransferReply {
    uint32 view = 1;
    uint32 replicaid = 2;
    repeated uint32 cv = 3;
    bool issynced = 4;
    uint32 logbegin = 5;
    uint32 logend = 6;
    repeated RequestBodyMsg reqs = 7;
}


// Announces the start of `view`.
// NOTE(review): presumably sent by the new leader, with syncedlogid being the
// last log id followers must hold; confirm against the replica code.
message StartView {
    uint32 view = 1;
    uint32 replicaid = 2;
    repeated uint32 cv = 3;
    uint32 syncedlogid = 4;
}


// Asks peers for their crash vectors.
// NOTE(review): nonce is presumably echoed back in CrashVectorReply so stale
// answers can be discarded; confirm against the replica code.
message CrashVectorRequest {
    bytes nonce = 1;
    uint32 replicaid = 2;
}

// Answer to a CrashVectorRequest carrying the sender's crash vector `cv`.
// NOTE(review): nonce presumably mirrors the request's nonce; confirm.
message CrashVectorReply {
    bytes nonce = 1;
    uint32 replicaid = 2;
    repeated uint32 cv = 3;
}

// Recovery request from replica `replicaid`.
// NOTE(review): presumably broadcast by a restarting replica after it has
// rebuilt its crash vector `cv`; confirm against the replica code.
message RecoveryRequest {
    repeated uint32 cv = 1;
    uint32 replicaid = 2;
}

// Answer to a RecoveryRequest reporting the sender's view and synced log id.
// NOTE(review): cv is presumably the sender's crash vector; confirm.
message RecoveryReply {
    uint32 view = 1;
    repeated uint32 cv = 2;
    uint32 replicaid = 3;
    uint32 syncedlogid = 4;
}

// Reports how far replica `replicaid` has synced its log in `view`.
// NOTE(review): presumably sent by followers to the leader; confirm against
// the replica code.
message SyncStatusReport {
    uint32 view = 1;
    repeated uint32 cv = 2;
    uint32 replicaid = 3;
    uint32 syncedlogid = 4;
}

// Tells the receiver that log entries up to `committedlogid` are committed.
// NOTE(review): presumably sent by the leader to followers; confirm against
// the replica code.
message CommitInstruction {
    uint32 view = 1;
    repeated uint32 cv = 2;
    uint32 replicaid = 3;
    uint32 committedlogid = 4;
}


================================================
FILE: proxy/BUILD
================================================
# Header-only library with the proxy's configuration structure
# (proxy_config.h); depends on yaml-cpp.
cc_library(
    name = "proxy_config",
    hdrs = ["proxy_config.h"],
    deps = [
        "@com_github_jbeder_yaml_cpp//:yaml-cpp",
    ],
)

# The Proxy class itself (proxy.h / proxy.cc): built on the generated protobuf
# messages, the shared //lib helpers and the proxy configuration.
cc_library(
    name = "proxy_class",
    srcs = ["proxy.cc"],
    hdrs = ["proxy.h"],
    deps = [
        "//proto:nezha_cc_proto",
        "//lib:utils",
        "//lib:address",
        ":proxy_config",
    ],
)

# Proxy executable: proxy_run.cc linked against :proxy_class.
cc_binary(
    name = "nezha_proxy",
    srcs = ["proxy_run.cc"],
    deps = [
        ":proxy_class",
    ],
)


================================================
FILE: proxy/proxy.cc
================================================
#include "proxy/proxy.h"

namespace nezha {
// Loads the proxy configuration from `configFile` and builds the runtime
// context (sockets, replica addresses, quorum sizes). A configuration that
// fails to parse is fatal: the error is logged and the process exits with 1.
Proxy::Proxy(const std::string& configFile) {
  const std::string parseError = proxyConfig_.parseConfig(configFile);
  if (!parseError.empty()) {
    LOG(ERROR) << "Error parsing proxy config: " << parseError << "Exiting.";
    exit(1);
  }
  CreateContext();
}

// Asks all worker threads to stop by clearing running_, the flag every thread
// loop in this file polls. It does not join the threads; Run() does that.
void Proxy::Terminate() {
  LOG(INFO) << "Terminating...";
  running_ = false;
}

// Starts all worker threads and blocks until every one of them has finished,
// i.e. until Terminate() has cleared running_ and the loops have exited.
void Proxy::Run() {
  running_ = true;
  LaunchThreads();
  for (auto& worker : threadPool_) {
    const auto& workerName = worker.first;
    LOG(INFO) << "Join " << workerName;
    worker.second->join();
    LOG(INFO) << "Join Complete " << workerName;
  }
  LOG(INFO) << "Run Terminated ";
}

// Releases the heap objects owned by the proxy: the worker thread objects,
// the replica addresses built in CreateContext() and the client addresses
// recorded by ForwardRequestsTd().
//
// The threads must already have been joined (Run() does this) before the
// proxy is destroyed, because deleting a joinable std::thread terminates the
// process.
//
// NOTE: the Reply objects cached in committedReplyMap_ are intentionally not
// freed here; they are reclaimed by the OS at process exit.
Proxy::~Proxy() {
  for (auto& kv : threadPool_) {
    delete kv.second;
  }

  // Walk each replica's own address list. The previous code bounded the inner
  // loop by replicaAddrs_[0].size(), which is only correct while every
  // replica happens to have the same number of receiver shards.
  for (auto& shardAddrs : replicaAddrs_) {
    for (auto* addr : shardAddrs) {
      delete addr;  // deleting a null pointer is a no-op
    }
  }

  // Clear Context (free memory)
  ConcurrentMap<uint32_t, struct sockaddr_in*>::Iterator clientIter(
      clientAddrs_);
  while (clientIter.isValid()) {
    if (clientIter.getValue()) {
      delete clientIter.getValue();
    }
    clientIter.next();
  }
}

// Creates a non-blocking UDP socket.
//
// sip:   IPv4 address to bind to; when empty the socket is left unbound
//        (used for send-only sockets) and `sport` is ignored.
// sport: UDP port to bind to.
//
// Returns the socket descriptor, or -1 on failure. On every failure path the
// descriptor is closed before returning so that it is not leaked.
int Proxy::CreateSocketFd(const std::string& sip, const int sport) {
  int fd = socket(PF_INET, SOCK_DGRAM, 0);
  if (fd < 0) {
    LOG(ERROR) << "Receiver Fd fail ";
    return -1;
  }
  // Set Non-Blocking. F_GETFL can fail too; OR-ing O_NONBLOCK into -1 would
  // hand garbage flags to F_SETFL.
  int flags = fcntl(fd, F_GETFL, 0);
  int status = (flags < 0) ? -1 : fcntl(fd, F_SETFL, flags | O_NONBLOCK);
  if (status < 0) {
    LOG(ERROR) << " Set NonBlocking Fail";
    close(fd);
    return -1;
  }

  if (sip != "") {
    struct sockaddr_in addr;
    bzero(&addr, sizeof(addr));
    addr.sin_family = AF_INET;
    addr.sin_port = htons(sport);
    addr.sin_addr.s_addr = inet_addr(sip.c_str());
    // Bind socket to Address
    int bindRet = bind(fd, (struct sockaddr*)&addr, sizeof(addr));
    if (bindRet != 0) {
      LOG(ERROR) << "bind error\t" << bindRet;
      close(fd);
      return -1;
    }
  }
  return fd;
}

// Spawns the worker threads and registers them in threadPool_ under a
// descriptive name: one latency-bound calculator, then one CheckQuorumTd and
// one ForwardRequestsTd per proxy shard. The LogTd thread is not started.
void Proxy::LaunchThreads() {
  const int shards = proxyConfig_.proxyShardNum;

  threadPool_["CalcLatencyBound"] =
      new std::thread(&Proxy::CalculateLatencyBoundTd, this);

  for (int shard = 0; shard < shards; ++shard) {
    const std::string name = "CheckQuorumTd-" + std::to_string(shard);
    threadPool_[name] = new std::thread(&Proxy::CheckQuorumTd, this, shard);
  }

  for (int shard = 0; shard < shards; ++shard) {
    const std::string name = "ForwardRequestsTd-" + std::to_string(shard);
    threadPool_[name] =
        new std::thread(&Proxy::ForwardRequestsTd, this, shard);
  }
}

// Background loop that maintains latencyBound_, the bound stamped onto every
// forwarded request.
//
// It keeps the most recent one-way-delay (OWD) sample per replica, seeded
// with proxyConfig_.replicaInitialOwd, and publishes the maximum over all
// replicas, capped at maxOWD_. Samples arrive through owdQu_ as
// <replicaId, owd> pairs produced by CheckQuorumTd(). The queue is drained
// every 5 ms until running_ is cleared.
void Proxy::CalculateLatencyBoundTd() {
  std::pair<uint32_t, uint32_t> owdSample;
  std::vector<uint32_t> replicaOWDs;
  replicaOWDs.resize(proxyConfig_.replicaIps.size(),
                     proxyConfig_.replicaInitialOwd);
  for (uint32_t i = 0; i < replicaOWDs.size(); i++) {
    LOG(INFO) << "replicaOWD " << i << "\t" << replicaOWDs[i];
  }
  while (running_) {
    while (owdQu_.try_dequeue(owdSample)) {
      VLOG(1) << "replica=" << owdSample.first << "\towd=" << owdSample.second;
      // The replica id originates from a network message; ignore samples
      // whose id does not name a configured replica instead of writing past
      // the end of replicaOWDs.
      if (owdSample.first >= replicaOWDs.size()) {
        continue;
      }
      replicaOWDs[owdSample.first] = owdSample.second;
      // Update latency bound: the slowest replica determines the bound.
      uint32_t estimatedOWD = 0;
      for (uint32_t i = 0; i < replicaOWDs.size(); i++) {
        if (estimatedOWD < replicaOWDs[i]) {
          estimatedOWD = replicaOWDs[i];
        }
      }
      if (estimatedOWD > maxOWD_) {
        estimatedOWD = maxOWD_;
      }
      latencyBound_.store(estimatedOWD);
      VLOG(1) << "Update bound " << latencyBound_;
    }
    usleep(5000);
  }
}

void Proxy::LogTd() {
  Log litem;
  std::ofstream ofs("Proxy-Stats-" + std::to_string(proxyConfig_.proxyId) +
                    ".csv");
  ofs << "ReplicaId,ClientId,RequestId,ClientTime,ProxyTime,"
         "ProxyEndProcessTime,RecvTime,Deadline,"
         "FastReplyTime,"
         "SlowReplyTime,"
         "ProxyRecvTime,CommitType"
      << std::endl;
  uint32_t logCnt = 0;
  while (running_) {
    if (logQu_.try_dequeue(litem)) {
      ofs << litem.ToString() << std::endl;
      logCnt++;
      if (logCnt % 10000 == 0) {
        ofs.flush();
      }
    }
  }
}

void Proxy::CheckQuorumTd(const int id) {
  // ConcurrentMap<uint64_t, Reply*>& committedReply = committedReplyMap_[id];
  std::unordered_map<uint64_t, Reply*>& committedReply = committedReplyMap_[id];
  ConcurrentMap<uint64_t, Log*>& logs = logMap_[id];
  std::map<uint64_t, std::map<uint32_t, Reply>> replyQuorum;
  std::map<uint32_t, Reply*> uncommittedReply;  // Key: logId, value: reqKey
  uint32_t currentView = 0;
  int sz = 0;
  char buffer[UDP_BUFFER_SIZE];
  MessageHeader* msgHdr = (MessageHeader*)(void*)buffer;
  struct sockaddr_in recvAddr;
  socklen_t sockLen = sizeof(recvAddr);
  Reply reply;
  Reply* committedAck = NULL;
  uint32_t replyNum = 0;
  uint64_t startTime, endTime;
  std::vector<uint64_t>& replicaSyncedPoint = replicaSyncedPoints_[id];
  while (running_) {
    if ((sz = recvfrom(forwardFds_[id], buffer, UDP_BUFFER_SIZE, 0,
                       (struct sockaddr*)(&recvAddr), &sockLen)) > 0) {
      if ((uint32_t)sz < sizeof(MessageHeader) ||
          (uint32_t)sz < msgHdr->msgLen + sizeof(MessageHeader)) {
        continue;
      }

      if (reply.ParseFromArray(buffer + sizeof(MessageHeader),
                               msgHdr->msgLen)) {
        uint64_t reqKey = CONCAT_UINT32(reply.clientid(), reply.reqid());
        if (reply.owd() > 0) {
          owdQu_.enqueue(
              std::pair<uint32_t, uint32_t>(reply.replicaid(), reply.owd()));
        }

        uint64_t syncPoint =
            CONCAT_UINT32(reply.view(), reply.maxsyncedlogid());

        if (replicaSyncedPoint[reply.replicaid()] < syncPoint) {
          replicaSyncedPoint[reply.replicaid()] = syncPoint;
        }

        if (reply.clientid() == 0 && reply.reqid() == 0) {
          // Dummy reply, just used to update
          continue;
        }

        // committedAck = committedReply.get(reqKey);
        // if (committedAck != NULL) {
        //   // already committed;  ignore
        //   continue;
        // }
        auto iter = committedReply.find(reqKey);
        if (iter != committedReply.end()) {
          // already committed; ignore
          continue;
        }

        if (reply.view() < currentView) {
          LOG(INFO) << "Replied from old view";
          continue;
        }

        if (currentView < reply.view()) {
          // Replicas have upgraded to a new view
          // Reset current state
          currentView = reply.view();
          uncommittedReply.clear();
          replyQuorum.clear();
          for (int i = 0; i < replicaNum_; i++) {
            replicaSyncedPoint[i] = replicaSyncedPoint[reply.replicaid()];
          }
          currentView = reply.view();
        }

        // LOG(INFO) << reply.DebugString();
        if (reply.replytype() == (uint32_t)MessageType::COMMIT_REPLY) {
          committedAck = new Reply(reply);
          // committedReply.assign(reqKey, committedAck);
        } else if (replyQuorum[reqKey].find(reply.replicaid()) ==
                   replyQuorum[reqKey].end()) {
          replyQuorum[reqKey][reply.replicaid()] = reply;
          committedAck = isQuorumReady(replicaSyncedPoint, replyQuorum[reqKey]);
        } else if (reply.view() > replyQuorum[reqKey].begin()->second.view()) {
          // New view has come, clear existing replies for this request
          uncommittedReply.clear();
          replyQuorum[reqKey].clear();
          replyQuorum[reqKey][reply.replicaid()] = reply;
          for (int i = 0; i < replicaNum_; i++) {
            replicaSyncedPoint[i] = replicaSyncedPoint[reply.replicaid()];
          }
          committedAck = isQuorumReady(replicaSyncedPoint, replyQuorum[reqKey]);
        } else if (reply.view() == replyQuorum[reqKey].begin()->second.view()) {
          const Reply& existedReply = replyQuorum[reqKey][reply.replicaid()];
          if (existedReply.view() < reply.view()) {
            replyQuorum[reqKey][reply.replicaid()] = reply;
          } else if (existedReply.view() == reply.view() &&
                     existedReply.replytype() < reply.replytype()) {
            // FAST_REPLY < SLOW_REPLY < COMMIT_REPLY
            replyQuorum[reqKey][reply.replicaid()] = reply;
          }
          committedAck = isQuorumReady(replicaSyncedPoint, replyQuorum[reqKey]);
        }  // else: reply.view()< replyQuorum[reqKey].begin()->second.view(),
           // ignore it

        if (committedAck != NULL && committedAck->replytype() > 0) {
          // Ack to client
          struct sockaddr_in* clientAddr =
              clientAddrs_.get(committedAck->clientid());
          std::string replyMsg = committedAck->SerializeAsString();
          msgHdr->msgType = MessageType::COMMIT_REPLY;
          msgHdr->msgLen = replyMsg.length();
          memcpy(buffer + sizeof(MessageHeader), replyMsg.c_str(),
                 replyMsg.length());
          sendto(replyFds_[id], buffer,
                 replyMsg.length() + sizeof(MessageHeader), 0,
                 (struct sockaddr*)clientAddr, sizeof(sockaddr));

          // Add to cache
          // committedReply.assign(reqKey, committedAck);
          committedReply[reqKey] = committedAck;
          replyQuorum.erase(reqKey);

          // Disable Log
          // Log* litem = logs.get(reqKey);
          // if (litem) {
          //   litem->proxyRecvTime_ = GetMicrosecondTimestamp();
          //   litem->commitType_ = committedAck->replytype();
          //   logQu_.enqueue(*litem);
          // }

          // Check whether some uncommittedReply can be committed
          while ((!uncommittedReply.empty()) &&
                 uncommittedReply.begin()->first <= committedAck->logid()) {
            Reply* ack = uncommittedReply.begin()->second;
            ack->set_replytype(MessageType::COMMIT_REPLY);
            if (uncommittedReply.begin()->first < committedAck->logid()) {
              const Reply* ack = uncommittedReply.begin()->second;
              struct sockaddr_in* clientAddr =
                  clientAddrs_.get(ack->clientid());
              std::string replyMsg = ack->SerializeAsString();
              msgHdr->msgType = MessageType::COMMIT_REPLY;
              msgHdr->msgLen = replyMsg.length();
              memcpy(buffer + sizeof(MessageHeader), replyMsg.c_str(),
                     replyMsg.length());
              sendto(replyFds_[id], buffer,
                     replyMsg.length() + sizeof(MessageHeader), 0,
                     (struct sockaddr*)clientAddr, sizeof(sockaddr));
            }
            uint64_t reqKey = CONCAT_UINT32(ack->clientid(), ack->reqid());
            // committedReply.assign(reqKey, ack);
            committedReply[reqKey] = ack;
            replyQuorum.erase(reqKey);
            uncommittedReply.erase(uncommittedReply.begin());
            delete ack;
          }

          // LOG(INFO) << "reqId=" << committedAck->reqid()
          //           << "\t type=" << committedAck->replytype();
          // replyNum++;
          // if (replyNum == 1) {
          //   startTime = GetMicrosecondTimestamp();
          // } else if (replyNum % 100000 == 0) {
          //   endTime = GetMicrosecondTimestamp();
          //   float rate = 100000 / ((endTime - startTime) * 1e-6);
          //   LOG(INFO) << "id=" << id << "\t"
          //             << "replyNum=" << replyNum << "\t"
          //             << "rate = " << rate << "\t"
          //             << "uncommittedLen = " << uncommittedReply.size();
          //   startTime = endTime;
          // }
        } else if (committedAck != NULL && committedAck->replytype() == 0) {
          // record in uncommittedRequests
          if (committedAck->replicaid() == currentView % replicaNum_) {
            // This is a leader's reply, cache it
            if (uncommittedReply.find(committedAck->logid()) ==
                uncommittedReply.end()) {
              uncommittedReply[committedAck->logid()] = committedAck;
            }

          } else {
            delete committedAck;
          }
        }
      }
    }
  }
}

// Decides whether the replies collected for one request form a quorum.
//
// replicaSyncedPoint: per-replica sync point, <view, maxsyncedlogid> packed
//                     into 64 bits (high/low 32 bits).
// quorum:             replicaId -> latest reply of that replica for the
//                     request; all entries are expected to share one view.
//
// Returns NULL when `quorum` is empty or does not contain the leader's reply
// (leader = view % replicaNum_). Otherwise returns a heap-allocated copy of
// the leader's reply, owned by the caller, whose replytype is set to
//   FAST_REPLY  if at least fastQuorum_ replicas are fast- or slow-satisfied,
//   SLOW_REPLY  if at least f_ + 1 replicas are slow-satisfied,
//   0           if the request is not committed yet.
Reply* Proxy::isQuorumReady(std::vector<uint64_t>& replicaSyncedPoint,
                            std::map<uint32_t, Reply>& quorum) {
  if (quorum.empty()) {
    return NULL;  // nothing to inspect; begin() must not be dereferenced
  }
  // These replies are of the same view for sure (we have previously forbidden
  // inconsistency)
  const uint32_t view = quorum.begin()->second.view();
  const uint32_t leaderId = view % replicaNum_;
  auto leaderIter = quorum.find(leaderId);
  if (leaderIter == quorum.end()) {
    return NULL;
  }
  const Reply& leaderReply = leaderIter->second;

  uint32_t fastOrSlowReplyNum = 0;  // slowReply can be used as fastReply
  uint32_t slowReplyNum = 0;        // But fastReply cannot be used as slowReply
  for (const auto& kv : quorum) {
    // Fast: the replica sent a fast reply matching the leader's view and hash.
    const bool fastSatisfied =
        (kv.second.replytype() == MessageType::FAST_REPLY &&
         kv.second.view() == leaderReply.view() &&
         kv.second.hash() == leaderReply.hash());
    // Slow: the replica has synced its log, in the leader's view, at least up
    // to the leader's log id. Replica ids without a sync point entry can
    // never be slow-satisfied.
    const bool slowSatisfied =
        (kv.first < replicaSyncedPoint.size() &&
         HIGH_32BIT(replicaSyncedPoint[kv.first]) == leaderReply.view() &&
         LOW_32BIT(replicaSyncedPoint[kv.first]) >= leaderReply.logid());

    if (fastSatisfied || slowSatisfied) {
      fastOrSlowReplyNum++;
    }
    if (slowSatisfied) {
      slowReplyNum++;
    }
  }

  Reply* committedReply = new Reply(leaderReply);
  if (fastOrSlowReplyNum >= (uint32_t)fastQuorum_) {
    // Fast Commit
    committedReply->set_replytype(MessageType::FAST_REPLY);
  } else if (slowReplyNum >= (uint32_t)f_ + 1) {
    // Slow Commit: Together with the leader reply, it forms the simple quorum
    // of f+1
    committedReply->set_replytype(MessageType::SLOW_REPLY);
  } else {
    // Uncommitted
    committedReply->set_replytype(0);
  }
  return committedReply;
}

void Proxy::ForwardRequestsTd(const int id) {
  // ConcurrentMap<uint64_t, Reply*>& committedReply = committedReplyMap_[id];
  ConcurrentMap<uint64_t, Log*>& logs = logMap_[id];
  char buffer[UDP_BUFFER_SIZE];
  MessageHeader* msgHdr = (MessageHeader*)(void*)buffer;
  int sz = -1;
  struct sockaddr_in receiverAddr;
  socklen_t len = sizeof(receiverAddr);
  Request request;
  uint32_t forwardCnt = 0;
  uint64_t startTime, endTime;

  while (running_) {
    if ((sz = recvfrom(requestReceiveFds_[id], buffer, UDP_BUFFER_SIZE, 0,
                       (struct sockaddr*)&receiverAddr, &len)) > 0) {
      if ((uint32_t)sz < sizeof(MessageHeader) ||
          (uint32_t)sz < msgHdr->msgLen + sizeof(MessageHeader)) {
        continue;
      }
      if (msgHdr->msgType == MessageType::CLIENT_REQUEST &&
          request.ParseFromArray(buffer + sizeof(MessageHeader),
                                 msgHdr->msgLen)) {
        uint64_t reqKey = CONCAT_UINT32(request.clientid(), request.reqid());
        request.set_bound(latencyBound_);
        request.set_proxyid(proxyIds_[id]);
        request.set_sendtime(GetMicrosecondTimestamp());

        std::string msg = request.SerializeAsString();
        msgHdr->msgType = MessageType::CLIENT_REQUEST;
        msgHdr->msgLen = msg.length();
        memcpy(buffer + sizeof(MessageHeader), msg.c_str(), msg.length());
        if (clientAddrs_.get(request.clientid()) == NULL) {
          struct sockaddr_in* addr = new sockaddr_in(receiverAddr);
          clientAddrs_.assign(request.clientid(), addr);
        }

        // Send to every replica
        for (int i = 0; i < replicaNum_; i++) {
          // uint32_t generateProxyId = (uint32_t)(proxyIds_[id] >> 32u);
          // struct sockaddr_in* replicaAddr =
          //     replicaAddrs_[i][generateProxyId % replicaAddrs_[i].size()];
          struct sockaddr_in* replicaAddr =
              replicaAddrs_[i][proxyIds_[id] % replicaAddrs_[i].size()];

          sendto(forwardFds_[id], buffer,
                 msgHdr->msgLen + sizeof(MessageHeader), 0,
                 (struct sockaddr*)replicaAddr, sizeof(sockaddr_in));
        }
        // Log* litem = new Log();
        // litem->clientId_ = request.clientid();
        // litem->reqId_ = request.reqid();
        // litem->clientTime_ = request.clienttime();
        // litem->proxyTime_ = request.sendtime();
        // litem->deadline_ = request.sendtime() + request.bound();
        // logs.assign(reqKey, litem);
        // litem->proxyEndProcessTime_ = GetMicrosecondTimestamp();
        // LOG(INFO) << "id=" << id << "\t"
        //           << "cid=" << request.clientid() << "\t" << request.reqid();

        // forwardCnt++;
        // if (forwardCnt == 1) {
        //   startTime = GetMicrosecondTimestamp();
        // } else if (forwardCnt % 100 == 0) {
        //   endTime = GetMicrosecondTimestamp();
        //   float rate = 100 / ((endTime - startTime) * 1e-6);
        //   LOG(INFO) << "Forward-Id=" << id << "\t"
        //             << "count =" << forwardCnt << "\t"
        //             << "rate=" << rate << " req/sec"
        //             << "\t"
        //             << "req is <" << request.clientid() << ","
        //             << request.reqid() << ">";
        //   startTime = endTime;
        // }
      }
    }
  }
}

void Proxy::CreateContext() {
  running_ = true;
  int shardNum = proxyConfig_.proxyShardNum;
  uint32_t proxyId = proxyConfig_.proxyId;
  forwardFds_.resize(shardNum, -1);
  requestReceiveFds_.resize(shardNum, -1);
  replyFds_.resize(shardNum, -1);
  proxyIds_.resize(shardNum, proxyId);
  latencyBound_ = proxyConfig_.replicaInitialOwd;
  maxOWD_ = proxyConfig_.proxyMaxOwd;
  for (int i = 0; i < shardNum; i++) {
    forwardFds_[i] = CreateSocketFd(proxyConfig_.proxyIp,
                                    proxyConfig_.proxyReplyPortBase + i);
    requestReceiveFds_[i] = CreateSocketFd(
        proxyConfig_.proxyIp, proxyConfig_.proxyRequestPortBase + i);
    replyFds_[i] = CreateSocketFd("", -1);
    proxyIds_[i] = ((proxyIds_[i] << 32) | (uint32_t)i);
  }
  committedReplyMap_.resize(shardNum);
  logMap_.resize(shardNum);

  replicaNum_ = proxyConfig_.replicaIps.size();
  assert(replicaNum_ % 2 == 1);
  f_ = replicaNum_ / 2;

  replicaSyncedPoints_.resize(shardNum);
  for (int i = 0; i < shardNum; i++) {
    replicaSyncedPoints_[i].assign(replicaNum_, CONCURRENT_MAP_START_INDEX);
  }

  fastQuorum_ = (f_ % 2 == 1) ? (f_ + (f_ + 1) / 2 + 1) : (f_ + f_ / 2 + 1);
  replicaAddrs_.resize(replicaNum_);
  for (int i = 0; i < replicaNum_; i++) {
    std::string replicaIP = proxyConfig_.replicaIps[i];
    for (int j = 0; j < proxyConfig_.replicaReceiverShards; j++) {
      struct sockaddr_in* addr = new sockaddr_in();
      bzero(addr, sizeof(struct sockaddr_in));
      addr->sin_family = AF_INET;
      addr->sin_port = htons(proxyConfig_.replicaReceiverPort + j);
      addr->sin_addr.s_addr = inet_addr(replicaIP.c_str());
      replicaAddrs_[i].push_back(addr);
    }
  }
}

}  // namespace nezha


================================================
FILE: proxy/proxy.h
================================================
#include <yaml-cpp/yaml.h>
#include <fstream>
#include "lib/utils.h"
#include "proto/nezha_proto.pb.h"
#include "proxy_config.h"

namespace nezha {
using namespace nezha::proto;

/**
 * Refer to proxy_run.cc, the runnable program only needs to instantiate a
 * Proxy object with a configuration file. Then it calls Run() method to run
 * and calls Terminate() method to stop
 */

class Proxy {
 private:
  /** All the configuration parameters for this proxy are included in
   * proxyConfig_ */
  ProxyConfig proxyConfig_;
  /** Each thread is given a unique name (key) */
  std::map<std::string, std::thread*> threadPool_;

  /** Launch all the threads. These threads are mainly categorized into three
   * classes:
   * (1) ForwardRequestsTd, which receives client requests and
   * multicasts them to replicas;
   * (2) CheckQuorumTd, which receives replica replies and
   * checks whether the corresponding request has been committed (using
   * isQuorumReady); if so, it sends a reply to the client;
   * (3) CalculateLatencyBoundTd, which calculates the latency bound.
   *
   * (1) and (2) handle most of the workload and are parallelized; the
   * parallelism degree is decided by the parameter defined in proxyConfig_
   * (i.e., shard-num).
   *
   * (1) and (2) are paired, i.e., we launch an equal number of
   * ForwardRequestsTds and CheckQuorumTds. The requests multicast by
   * ForwardRequestsTd-i will be tracked and quorum-checked by CheckQuorumTd-i
   */
  void LaunchThreads();
  void ForwardRequestsTd(const int id = -1);
  void CheckQuorumTd(const int id = -1);
  void CalculateLatencyBoundTd();

  /** LogTd is just used to collect some performance stats. It is not necessary
   * in the release version */
  void LogTd();

  /** Create/Initialize all the necessary variables */
  void CreateContext();

  /** Check whether a quorum has been formed for the request to be committed.
   * If the request has been committed, it returns the reply message, which will
   * be delivered to the client; otherwise, it returns NULL
   */
  Reply* isQuorumReady(std::vector<uint64_t>& repliedSyncPoint,
                       std::map<uint32_t, Reply>& quorum);

  /** Tool function: given ip and port, create a socket fd. If ip is not empty,
   * the socket will be bound to <ip:port>   */
  int CreateSocketFd(const std::string& ip = "", const int port = -1);

  /** Flag to Run/Terminate threads */
  std::atomic<bool> running_;

  /** Each CheckQuorumTd thread uses the socket fd in replyFds_, based on its
   * id, to send replies to clients
   */
  std::vector<int> replyFds_;

  /** Each ForwardRequestsTd thread uses the socket fd in forwardFds_, based on
   * its id, to multicast requests to replicas
   */
  std::vector<int> forwardFds_;

  /** Each ForwardRequestsTd thread uses the socket fd in requestReceiveFds_,
   * based on its id, to receive requests from clients
   */
  std::vector<int> requestReceiveFds_;

  /** We create a unique id for each ForwardRequestsTd, so that replicas can
   * derive which CheckQuorumTd should receive the reply messages */
  std::vector<uint64_t> proxyIds_;

  /** CalculateLatencyBoundTd updates latencyBound_; concurrently, the
   * ForwardRequestsTds read it and include it in request messages */
  std::atomic<uint32_t> latencyBound_;

  /** Upper bound of the estimated latencyBound_, used to clamp the bound,
   * details in the ``Adaptive Latency Bound`` para of Sec 4 of our paper */
  uint32_t maxOWD_;

  /** CheckQuorumTd threads pass <replicaId, owd> samples to
   * CalculateLatencyBoundTd */
  ConcurrentQueue<std::pair<uint32_t, uint32_t>> owdQu_;  //

  int replicaNum_;
  int f_;          /** replicaNum_ =2f_+1 */
  int fastQuorum_; /** fastQuorum_ = f_+ceiling(f_/2)+1 */

  /** Just used to collect logs, can be deleted in the release version */
  struct Log {
    uint32_t replicaId_;
    uint32_t clientId_;
    uint32_t reqId_;
    uint64_t clientTime_;
    uint64_t proxyTime_;
    uint64_t proxyEndProcessTime_;
    uint64_t recvTime_;
    uint64_t deadline_;
    uint64_t fastReplyTime_;
    uint64_t slowReplyTime_;
    uint64_t proxyRecvTime_;
    uint32_t commitType_;

    /** Every field has a matching constructor argument (all default to 0).
     * The initializer list follows the declaration order and covers every
     * member, including proxyEndProcessTime_, so that ToString() never reads
     * an indeterminate value. */
    Log(uint32_t rid = 0, uint32_t cId = 0, uint32_t reqId = 0,
        uint64_t ctime = 0, uint64_t ptime = 0, uint64_t pedtime = 0,
        uint64_t rtime = 0, uint64_t ddl = 0, uint64_t fttime = 0,
        uint64_t swtime = 0, uint64_t prcvt = 0, uint32_t cmtt = 0)
        : replicaId_(rid),
          clientId_(cId),
          reqId_(reqId),
          clientTime_(ctime),
          proxyTime_(ptime),
          proxyEndProcessTime_(pedtime),
          recvTime_(rtime),
          deadline_(ddl),
          fastReplyTime_(fttime),
          slowReplyTime_(swtime),
          proxyRecvTime_(prcvt),
          commitType_(cmtt) {}

    /** Render all fields, in declaration order, as one comma-separated line */
    std::string ToString() const {
      return std::to_string(replicaId_) + "," + std::to_string(clientId_) +
             "," + std::to_string(reqId_) + "," + std::to_string(clientTime_) +
             "," + std::to_string(proxyTime_) + "," +
             std::to_string(proxyEndProcessTime_) + "," +
             std::to_string(recvTime_) + "," + std::to_string(deadline_) + "," +
             std::to_string(fastReplyTime_) + "," +
             std::to_string(slowReplyTime_) + "," +
             std::to_string(proxyRecvTime_) + "," + std::to_string(commitType_);
    }
  };
  ConcurrentQueue<Log> logQu_;

  /** Vector of replicas' addresses.
   * Since replicas can have multiple receiver shards, we use a two-dimensional
   * vector.
   *
   * replicaAddrs_[i] records the addresses of replica-i, which can receive
   * requests; replicaAddrs_[i][j] is the address of the jth receiver shard of
   * replica-i.
   */
  std::vector<std::vector<struct sockaddr_in*>> replicaAddrs_;

  /**
   * After ForwardRequestsTd receives a client request, it records the address
   * of the client, so that later the corresponding CheckQuorumTd knows which
   * address should receive the commit reply.
   */
  ConcurrentMap<uint32_t, struct sockaddr_in*> clientAddrs_;

  /**
   * As an optimization, proxies also maintain a cache to record the commit
   * reply messages for those already-committed requests. In this way, when
   * clients retry a request which has already been committed, the proxy can
   * directly resend the reply, instead of adding additional burden to the
   * replicas
   */

  std::vector<std::unordered_map<uint64_t, Reply*>> committedReplyMap_;

  std::vector<ConcurrentMap<uint64_t, uint64_t>> sendTimeMap_;

  std::vector<ConcurrentMap<uint64_t, Log*>> logMap_;

 public:
  /** Proxy accepts a config file, which contains all the necessary information
   * to instantiate the object; afterwards the Run method can be called
   *  */
  Proxy(const std::string& configFile = "../configs/nezha-proxy-config.yaml");
  ~Proxy();
  void Run();
  void Terminate();

  /** Tentative */
  std::vector<std::vector<uint64_t>> replicaSyncedPoints_;
};

}  // namespace nezha

================================================
FILE: proxy/proxy_config.h
================================================
#include <glog/logging.h>
#include <stdint.h>
#include <yaml-cpp/yaml.h>
#include <string>
#include <vector>

struct ProxyConfig {
  int proxyId;
  std::string proxyIp;
  int proxyShardNum;
  uint32_t proxyMaxOwd;
  int proxyRequestPortBase;
  int proxyReplyPortBase;

  std::vector<std::string> replicaIps;
  uint32_t replicaInitialOwd;
  int replicaReceiverPort;
  int replicaReceiverShards;

  // Parses yaml file configFilename and fills in fields of ProxyConfig
  // accordingly. Returns an error message or "" if there are no errors.
  std::string parseConfig(std::string configFilename) {
    YAML::Node config;
    try {
      config = YAML::LoadFile(configFilename);
    } catch (const YAML::BadFile& e) {
      return "Error loading config file:" + e.msg + ".";
    }
    LOG(INFO) << "Using config:\n " << config;

    std::string key;  // Keep track of current key for better error messages
    try {
      key = "replica-ips";
      for (uint32_t i = 0; i < config[key].size(); i++) {
        replicaIps.push_back(config[key][i].as<std::string>());
      }
      key = "replica-receiver-shards";
      replicaReceiverShards = config[key].as<int>();
      key = "replica-initial-owd";
      replicaInitialOwd = config[key].as<uint32_t>();
      key = "replica-receiver-port";
      replicaReceiverPort = config[key].as<int>();

      key = "proxy-id";
      proxyId = config[key].as<int>();
      key = "proxy-ip";
      proxyIp = config[key].as<std::string>();
      key = "proxy-shard-num";
      proxyShardNum = config[key].as<int>();
      key = "proxy-max-owd";
      proxyMaxOwd = config[key].as<uint32_t>();
      key = "proxy-request-port-base";
      proxyRequestPortBase = config[key].as<int>();
      key = "proxy-reply-port-base";
      proxyReplyPortBase = config[key].as<int>();

      return "";
    } catch (const YAML::BadConversion& e) {
      if (config[key]) {
        return "Error parsing config field " + key + ": " + e.msg + ".";
      } else {
        return "Error parsing config field " + key + ": key not found.";
      }
    } catch (const std::exception& e) {
      return "Error parsing config field " + key + ": " + e.what() + ".";
    }
  }
};

================================================
FILE: proxy/proxy_run.cc
================================================
#include "proxy/proxy.h"
// Command-line flag --config: path of the YAML file passed to the Proxy
// constructor in main().
DEFINE_string(config, "nezhav2/config/nezha-proxy-config-0.yaml", "The config file for the proxy");

// The single proxy instance of this process. It is a global so that the
// SIGINT handler below can reach it; it is assigned in main().
nezha::Proxy* proxy = NULL;
/**
 * SIGINT handler: asks the global proxy to stop.
 *
 * The handler may fire while `proxy` is still NULL (the signal can arrive
 * before the object has been constructed), so the pointer is checked first
 * instead of being dereferenced unconditionally.
 *
 * @param para signal number delivered by the OS (unused)
 */
void Terminate(int para) {
    if (proxy != NULL) {
        proxy->Terminate();
    }
}
/**
 * Entry point of the proxy binary: parse flags, set up logging, build the
 * Proxy from the --config file and run it until it is terminated.
 */
int main(int argc, char* argv[]) {
    gflags::ParseCommandLineFlags(&argc, &argv, true);
    google::InitGoogleLogging(argv[0]);
    FLAGS_logtostderr = 1;

    // Construct the proxy BEFORE installing the SIGINT handler: the handler
    // dereferences the global pointer, so it must never observe NULL.
    proxy = new nezha::Proxy(FLAGS_config);
    signal(SIGINT, Terminate);

    proxy->Run();

    // Restore the default action before tearing the object down, so a late
    // Ctrl-C cannot call into a destroyed (or half-destroyed) proxy.
    signal(SIGINT, SIG_DFL);
    delete proxy;
    proxy = NULL;
    return 0;
}


================================================
FILE: replica/BUILD
================================================
# Header-only target exposing replica_config.h; it depends on yaml-cpp.
cc_library(
    name = "replica_config",
    hdrs = ["replica_config.h"],
    deps = [
        "@com_github_jbeder_yaml_cpp//:yaml-cpp",
    ],
)


# The Replica class itself (replica.h / replica.cc). It pulls in the config
# target above, the generated protobuf code, the shared utilities in //lib,
# and the third-party junction, libev and boost-uuid libraries.
cc_library(
    name = "replica_class",
    srcs = ["replica.cc"],
    hdrs = [
        "replica.h",
    ],
    deps = [
        ":replica_config",
        "//proto:nezha_cc_proto",
        "//lib:utils",
        "@com_github_preshing_junction//:libjunction",
        "@com_github_enki_libev//:libev",
        "@boost//:uuid",
    ],
)


# Executable built from replica_run.cc and linked against replica_class.
cc_binary(
    name = "nezha_replica",
    srcs = ["replica_run.cc"],
    deps = [
        ":replica_class",
    ],
)


================================================
FILE: replica/replica.cc
================================================
#include "replica/replica.h"

namespace nezha {
// Debug-only assertion macro. When GJK_DEBUG is defined, ASSERT(x) forwards
// to assert(x); otherwise it expands to an empty block, so the argument is
// not evaluated at all. Uncomment the next line to enable the checks.
// #define GJK_DEBUG
#ifdef GJK_DEBUG
#define ASSERT(x) assert(x)
#else
#define ASSERT(x) \
  {}
#endif

/**
 * Construct a replica from a YAML config file.
 *
 * @param configFile   path handed to replicaConfig_.parseConfig(); the
 *                     process exits with status 1 if parsing reports an error
 * @param isRecovering when true the replica starts in RECOVERING status,
 *                     otherwise in NORMAL status
 */
Replica::Replica(const std::string& configFile, bool isRecovering)
    : viewId_(0), lastNormalView_(0) {
  // One replied-sync-point slot per possible proxy, all starting just below
  // the first valid log id.
  repliedSyncPoint_ = new std::atomic<uint32_t>[maxProxyNum_];
  for (uint32_t proxyIdx = 0; proxyIdx < maxProxyNum_; ++proxyIdx) {
    repliedSyncPoint_[proxyIdx] = CONCURRENT_MAP_START_INDEX - 1;
  }
  LOG(INFO) << maxProxyNum_ << " proxy replied sync point has been initialized";

  lastAskMissedIndexTime_ = 0;
  lastAskMissedRequestTime_ = 0;

  // Both log lists start with a dummy head entry whose id precedes the first
  // real log id and whose deadline/reqKey are zero.
  auto makeDummyHead = []() {
    LogEntry* head = new LogEntry();
    head->logId = CONCURRENT_MAP_START_INDEX - 1;
    head->body.deadline = 0;
    head->body.reqKey = 0;
    return head;
  };
  syncedLogEntryHead_ = makeDummyHead();
  unSyncedLogEntryHead_ = makeDummyHead();

  // Load Config
  const std::string parseError = replicaConfig_.parseConfig(configFile);
  if (!parseError.empty()) {
    LOG(ERROR) << "Error loading replica config. " << parseError << " Exiting";
    exit(1);
  }

  if (!isRecovering) {
    status_ = ReplicaStatus::NORMAL;
  } else {
    status_ = ReplicaStatus::RECOVERING;
    LOG(INFO) << "Recovering ...";
  }
  LOG(INFO) << "Replica Status " << status_;

  CreateContext();
  LOG(INFO) << "viewId_=" << viewId_ << "\treplicaId=" << replicaId_
            << "\treplicaNum=" << replicaNum_ << "\tkeyNum=" << keyNum_;
}

/**
 * Mark the replica as TERMINATED and free the std::thread objects stored in
 * threadPool_.
 */
Replica::~Replica() {
  status_ = ReplicaStatus::TERMINATED;
  for (auto it = threadPool_.begin(); it != threadPool_.end(); ++it) {
    delete it->second;
    VLOG(2) << "Deleted\t" << it->first;
  }

  // TODO: A more elegant way is to reclaim or dump all logs before exit.
  // For now this is acceptable, because the memory is released anyway once
  // the process terminates.
}

void Replica::Run() {
  // Master thread run
  masterContext_->Register(endPointType_);
  if (status_ == ReplicaStatus::RECOVERING) {
    masterContext_->endPoint_->RegisterTimer(crashVectorRequestTimer_);
  } else if (status_ == ReplicaStatus::NORMAL) {
    if (!AmLeader()) {
      masterContext_->endPoint_->RegisterTimer(heartbeatCheckTimer_);
    }
    masterContext_->endPoint_->RegisterTimer(periodicSyncTimer_);
  }
  // Launch worker threads (based on config)
  LaunchThreads();

  masterContext_->endPoint_->LoopRun();
  VLOG(2) << "Break LoopRun";

  // Wait until all threads return
  for (auto& kv : threadPool_) {
    VLOG(2) << "Joining " << kv.first;
    kv.second->join();
    VLOG(2) << "Join Complete \t" << kv.first;
  }
}

/**
 * Request shutdown: keep setting the status to TERMINATED and waking every
 * waiter on waitVar_ until activeWorkerNum_ is no longer positive.
 */
void Replica::Terminate() {
  for (;;) {
    status_ = ReplicaStatus::TERMINATED;
    waitVar_.notify_all();
    // LOG(INFO) << "activeWorkerNum_=" << activeWorkerNum_;
    if (!(activeWorkerNum_ > 0)) {
      break;
    }
  }
}

/**
 * Build all runtime state of the replica from replicaConfig_: cached config
 * values, log-list cursors, the master / request / index-sync / missed-index /
 * missed-request receiver contexts (endpoint + handler + monitor timer),
 * sender endpoints and peer addresses, reply queues, the initial crash
 * vector, the protocol timers, and the garbage-collection markers.
 *
 * Nothing is registered or started here; see Run() and LaunchThreads().
 */
void Replica::CreateContext() {
  endPointType_ = replicaConfig_.endpointType;
  replicaId_ = replicaConfig_.replicaId;
  replicaNum_ = replicaConfig_.replicaIps.size();
  keyNum_ = replicaConfig_.keyNum;
  lastReleasedEntryByKeys_.assign(keyNum_, {0ul, 0ul});
  // Since ConcurrentMap reserves 0 and 1, log-id starts from 2
  // So these variables are initialized as 2-1=1

  // All list cursors start at the dummy head entries.
  maxSyncedLogEntry_ = syncedLogEntryHead_;
  maxUnSyncedLogEntry_ = unSyncedLogEntryHead_;
  minUnSyncedLogEntry_ = unSyncedLogEntryHead_;

  // No entry has been seen for any key yet.
  maxSyncedLogEntryByKey_.assign(keyNum_, NULL);
  maxUnSyncedLogEntryByKey_.assign(keyNum_, NULL);
  minUnSyncedLogEntryByKey_.assign(keyNum_, NULL);

  committedLogId_ = CONCURRENT_MAP_START_INDEX - 1;
  toCommitLogId_ = CONCURRENT_MAP_START_INDEX - 1;

  // Create master endpoints and context
  std::string ip = replicaConfig_.replicaIps[replicaId_.load()];
  int port = replicaConfig_.masterPort;
  int monitorPeriodMs = replicaConfig_.monitorPeriodMs;
  Endpoint* masterEP = CreateEndpoint(endPointType_, ip, port, true);
  auto masterCallBack = [](MessageHeader* msgHeader, char* msgBuffer,
                           Address* sender, void* ctx) {
    ((Replica*)ctx)->ReceiveMasterMessage(msgHeader, msgBuffer);
  };
  // Register a timer to monitor replica status
  Timer* masterMonitorTimer = new Timer(
      [](void* ctx, void* receiverEP) {
        if (((Replica*)ctx)->status_ == ReplicaStatus::TERMINATED) {
          // Master thread will only break its loop when status comes to
          // TERMINATED
          ((Endpoint*)receiverEP)->LoopBreak();
        }
      },
      monitorPeriodMs, this);
  masterContext_ =
      new ReceiverContext(masterEP, this, masterCallBack, masterMonitorTimer);

  LOG(INFO) << "Master Created";
  // Create request-receiver endpoints and context
  requestContext_.resize(replicaConfig_.receiverShards);
  for (int i = 0; i < replicaConfig_.receiverShards; i++) {
    // Receiver shard i listens on receiverPort + i (this `port` shadows the
    // outer variable of the same name).
    int port = replicaConfig_.receiverPort + i;
    Endpoint* requestEP = CreateEndpoint(endPointType_, ip, port);
    // Register a request handler to this endpoint
    auto requestHandlerFunc = [](MessageHeader* msgHeader, char* msgBuffer,
                                 Address* sender, void* ctx) {
      ((Replica*)ctx)->ReceiveClientRequest(msgHeader, msgBuffer, sender);
    };
    // Register a timer to monitor replica status
    // (unlike the master timer, this one breaks the loop on any non-NORMAL
    // status)
    Timer* requestEPMonitorTimer = new Timer(
        [](void* ctx, void* receiverEP) {
          if (((Replica*)ctx)->status_ != ReplicaStatus::NORMAL) {
            ((Endpoint*)receiverEP)->LoopBreak();
          }
        },
        monitorPeriodMs, this);
    requestContext_[i] = new ReceiverContext(
        requestEP, this, requestHandlerFunc, requestEPMonitorTimer);
  }

  LOG(INFO) << "requestContext_ Created";
  // (Leader) Use these endpoints to broadcast indices to followers
  for (int i = 0; i < replicaConfig_.indexSyncShards; i++) {
    indexSender_.push_back(new UDPSocketEndpoint());
  }
  indexAcker_ = CreateEndpoint(endPointType_);
  indexRequester_ = CreateEndpoint(endPointType_);
  reqRequester_ = CreateEndpoint(endPointType_);
  // Record, for every replica (including this one), the addresses of its
  // index-sync, index-ask, request-ask and master ports.
  for (uint32_t i = 0; i < replicaNum_; i++) {
    // This `ip` shadows the outer one, which keeps holding this replica's
    // own address.
    std::string ip = replicaConfig_.replicaIps[i];
    int indexPort = replicaConfig_.indexSyncPort;
    indexReceiver_.push_back(new Address(ip, indexPort));
    int indexAskPort = replicaConfig_.indexAskPort;
    indexAskReceiver_.push_back(new Address(ip, indexAskPort));
    int requestAskPort = replicaConfig_.requestAskPort;
    requestAskReceiver_.push_back(new Address(ip, requestAskPort));
    int masterPort = replicaConfig_.masterPort;
    masterReceiver_.push_back(new Address(ip, masterPort));
  }
  // (Followers:) Create index-sync endpoint to receive indices
  port = replicaConfig_.indexSyncPort;
  Endpoint* idxSyncEP = CreateEndpoint(endPointType_, ip, port);
  // Register a msg handler to this endpoint to handle index sync messages
  auto idxHandleFunc = [](MessageHeader* msgHeader, char* msgBuffer,
                          Address* sender, void* ctx) {
    ((Replica*)ctx)->ReceiveIndexSyncMessage(msgHeader, msgBuffer);
  };

  // Register a timer to monitor replica status
  Timer* idxSyncMonitorTimer = new Timer(
      [](void* ctx, void* receiverEP) {
        if (((Replica*)ctx)->status_ != ReplicaStatus::NORMAL) {
          ((Endpoint*)receiverEP)->LoopBreak();
        }
      },
      monitorPeriodMs, this);

  indexSyncContext_ =
      new ReceiverContext(idxSyncEP, this, idxHandleFunc, idxSyncMonitorTimer);

  LOG(INFO) << "indexSyncContext_ Created";

  // Create an endpoint to handle others' requests for missed index
  port = replicaConfig_.indexAskPort;
  Endpoint* missedIdxEP = CreateEndpoint(endPointType_, ip, port);
  // Register message handler
  auto missedIdxHandleFunc = [](MessageHeader* msgHeader, char* msgBuffer,
                                Address* sender, void* ctx) {
    ((Replica*)ctx)->ReceiveAskMissedIdx(msgHeader, msgBuffer);
  };

  // Register a timer to monitor replica status
  Timer* missedIdxAckMonitorTimer = new Timer(
      [](void* ctx, void* receiverEP) {
        if (((Replica*)ctx)->status_ != ReplicaStatus::NORMAL) {
          ((Endpoint*)receiverEP)->LoopBreak();
        }
      },
      monitorPeriodMs, this);

  missedIndexAckContext_ = new ReceiverContext(
      missedIdxEP, this, missedIdxHandleFunc, missedIdxAckMonitorTimer);

  LOG(INFO) << "missedIndexAckContext_ Created";

  // Create an endpoint to handle others' requests for missed req
  port = replicaConfig_.requestAskPort;
  Endpoint* missedReqAckEP = CreateEndpoint(endPointType_, ip, port);
  // Register message handler
  auto missedReqAckHandleFunc = [](MessageHeader* msgHeader, char* msgBuffer,
                                   Address* sender, void* ctx) {
    ((Replica*)ctx)->ReceiveAskMissedReq(msgHeader, msgBuffer);
  };
  // Register a timer to monitor replica status
  Timer* missedReqAckMonitorTimer = new Timer(
      [](void* ctx, void* receiverEP) {
        if (((Replica*)ctx)->status_ != ReplicaStatus::NORMAL) {
          ((Endpoint*)receiverEP)->LoopBreak();
        }
      },
      monitorPeriodMs, this);

  missedReqAckContext_ = new ReceiverContext(
      missedReqAckEP, this, missedReqAckHandleFunc, missedReqAckMonitorTimer);

  LOG(INFO) << "missedReqAckContext_ Created";

  // Create Record Qus and Maps
  recordMap_.resize(replicaConfig_.recordShards);
  recordQu_.resize(replicaConfig_.recordShards);

  // Create track entry for trackThread
  trackedEntry_.assign(replicaConfig_.trackShards, maxSyncedLogEntry_);

  // Create reply endpoints
  int replyShardNum = replicaConfig_.replyShards;
  for (int i = 0; i < replyShardNum; i++) {
    fastReplySender_.push_back(CreateEndpoint(endPointType_));
    slowReplySender_.push_back(CreateEndpoint(endPointType_));
  }
  // Create reply queues (one queue per fast/slow reply thread)
  fastReplyQu_.resize(replyShardNum);
  slowReplyQu_.resize(replyShardNum);

  // Create CrashVector Context
  // The initial crash vector is all zeros (one counter per replica) and is
  // stored under the version number passed to its constructor (2).
  std::vector<uint32_t> cvVec(replicaNum_, 0);
  CrashVectorStruct* cv = new CrashVectorStruct(cvVec, 2);
  crashVector_.assign(cv->version_, cv);
  /** The related threads using crash vectors are:
   * (1) master (1 thread)
   * (2) FastReplyThread(s) (replyShardNum threads) */
  // NOTE(review): LaunchThreads() passes `i + replyShardNum + 1` as the second
  // argument of IndexSendThread. If that value is used as an index into
  // crashVectorInUse_, it lies outside the 1 + replyShardNum slots allocated
  // here -- confirm against IndexSendThread.
  crashVectorVecSize_ = 1 + replyShardNum;
  crashVectorInUse_ = new std::atomic<CrashVectorStruct*>[crashVectorVecSize_];
  for (uint32_t i = 0; i < crashVectorVecSize_; i++) {
    crashVectorInUse_[i] = cv;
  }

  // Create other useful timers
  heartbeatCheckTimer_ = new Timer(
      [](void* ctx, void* receiverEP) {
        // Followers use this timer to check leader's heartbeat
        ((Replica*)ctx)->CheckHeartBeat();
      },
      monitorPeriodMs, this);

  indexAskTimer_ = new Timer(
      [](void* ctx, void* receiverEP) { ((Replica*)ctx)->AskMissedIndex(); },
      replicaConfig_.indexAskPeriodMs, this);
  roundRobinIndexAskIdx_ = 0;
  // Initially, no missed indices, so we make first > second
  missedIndices_ = {1, 0};

  requestAskTimer_ = new Timer(
      [](void* ctx, void* receiverEP) { ((Replica*)ctx)->AskMissedRequest(); },
      replicaConfig_.requestAskPeriodMs, this);
  roundRobinRequestAskIdx_ = 0;
  missedReqKeys_.clear();

  viewChangeTimer_ = new Timer(
      [](void* ctx, void* receiverEP) {
        ((Replica*)ctx)->BroadcastViewChange();
      },
      replicaConfig_.viewChangePeriodMs, this);
  roundRobinProcessIdx_ = 0;

  periodicSyncTimer_ = new Timer(
      [](void* ctx, void* receiverEP) {
        ((Replica*)ctx)->SendSyncStatusReport();
      },
      replicaConfig_.syncReportPeriodMs, this);

  // Batch sizes copied from the config.
  requestTrasnferBatch_ = replicaConfig_.requestTransferBatch;
  indexTransferBatch_ = replicaConfig_.indexTransferBatch;
  requestKeyTransferBatch_ = replicaConfig_.requestKeyTransferBatch;

  stateTransferTimer_ = new Timer(
      [](void* ctx, void* receiverEP) {
        ((Replica*)ctx)->SendStateTransferRequest();
      },
      replicaConfig_.stateTransferPeriodMs, this);

  stateTransferTimeout_ = replicaConfig_.stateTransferTimeoutMs;

  crashVectorRequestTimer_ = new Timer(
      [](void* ctx, void* receiverEP) {
        ((Replica*)ctx)->BroadcastCrashVectorRequest();
      },
      replicaConfig_.crashVectorRequestPeriodMs, this);

  recoveryRequestTimer_ = new Timer(
      [](void* ctx, void* receiverEP) {
        ((Replica*)ctx)->BroadcastRecoveryRequest();
      },
      replicaConfig_.recoveryRequestPeriodMs, this);

  movingPercentile_ = replicaConfig_.movingPercentile;
  slidingWindowLen_ = replicaConfig_.owdEstimationWindow;

  // Signal variable for garbage collection (of followers)
  reclaimTimeout_ = replicaConfig_.reclaimTimeoutMs;
  // replyShardNum + 1 counters; ResetContext() describes the extra slot as
  // belonging to IndexRecv.
  safeToClearUnSyncedLogId_ = new std::atomic<uint32_t>[replyShardNum + 1];
  safeToClearLateBufferLogId_ = CONCURRENT_MAP_START_INDEX - 1;
  for (int i = 0; i <= replyShardNum; i++) {
    safeToClearUnSyncedLogId_[i] = CONCURRENT_MAP_START_INDEX - 1;
  }
  prepareToClearLateBufferLogId_ = CONCURRENT_MAP_START_INDEX - 1;
  prepareToClearUnSyncedLogId_ = CONCURRENT_MAP_START_INDEX - 1;
}

void Replica::ResetContext() {
  // Clear queues
  for (uint32_t i = 0; i < fastReplyQu_.size(); i++) {
    LogEntry* entry;
    while (fastReplyQu_[i].try_dequeue(entry)) {
    }
    while (slowReplyQu_[i].try_dequeue(entry)) {
    }
    // Don't worry about memory leakage, the memory pointed by these in-queue
    // pointers have already been cleaned or will be cleaned according to their
    // Conucurrent maps
  }
  LogEntry* entry;
  while (processQu_.try_dequeue(entry)) {
    delete entry;
  }
  for (uint32_t i = 0; i < recordQu_.size(); i++) {
    RequestBody* rb;
    while (recordQu_[i].try_dequeue(rb)) {
      delete rb;
    }
  }

  // TODO: Clear LateBuffer

  // Clear Early Buffer
  while (earlyBuffer_.empty() == false) {
    LogEntry* entry = earlyBuffer_.begin()->second;
    delete entry;
    earlyBuffer_.erase(earlyBuffer_.begin());
  }

  // Reset lastReleasedEntryByKeys_, no need to care about UnSyncedLogs, because
  // they are all cleared
  for (uint32_t key = 0; key < keyNum_; key++) {
    if (maxSyncedLogEntryByKey_[key]) {
      lastReleasedEntryByKeys_[key] = {
          maxSyncedLogEntryByKey_[key]->body.deadline,
          maxSyncedLogEntryByKey_[key]->body.reqKey};

    } else {
      lastReleasedEntryByKeys_[key] = {0ul, 0ul};
    }
  }

  // Clear UnSyncedLogs
  minUnSyncedLogEntry_ = unSyncedLogEntryHead_;
  maxUnSyncedLogEntry_ = unSyncedLogEntryHead_;
  minUnSyncedLogEntryByKey_.clear();
  maxUnSyncedLogEntryByKey_.clear();
  minUnSyncedLogEntryByKey_.assign(keyNum_, NULL);
  maxUnSyncedLogEntryByKey_.assign(keyNum_, NULL);

  // Reset Index-Sync related stuff
  roundRobinIndexAskIdx_ = 0;
  missedIndices_ = {1, 0};
  roundRobinRequestAskIdx_ = 0;
  missedReqKeys_.clear();
  roundRobinProcessIdx_ = 0;
  pendingIndexSync_.clear();

  // Reset stateTransfer related stuff
  stateTransferIndices_.clear();
  viewChangeSet_.clear();
  crashVectorReplySet_.clear();
  recoveryReplySet_.clear();
  syncStatusSet_.clear();

  // Reset trackedEntry
  trackedEntry_.assign(trackedEntry_.size(), maxSyncedLogEntry_);

  // Reset OWD-Calc Related stuff
  slidingWindow_.clear();
  owdSampleNum_.clear();

  // Reset Master's timers
  // No need to worry about other timers: worker thread will unregister their
  // timers and msg handlers during LoopBreak
  masterContext_->endPoint_->UnRegisterAllTimers();
  masterContext_->endPoint_->RegisterTimer(masterContext_->monitorTimer_);
  if (!AmLeader()) {
    // Start checking leader's heartbeat from now on
    lastHeartBeatTime_ = GetMicrosecondTimestamp();
    masterContext_->endPoint_->RegisterTimer(heartbeatCheckTimer_);
  }
  masterContext_->endPoint_->RegisterTimer(periodicSyncTimer_);

  // Reset signal variable for garbage collection (of followers)
  safeToClearLateBufferLogId_ = CONCURRENT_MAP_START_INDEX - 1;
  for (uint32_t i = 0; i <= fastReplyQu_.size(); i++) {
    // The number of such counters is number of FastReplyThread_ + 1 (IndexRecv)
    safeToClearUnSyncedLogId_[i] = CONCURRENT_MAP_START_INDEX - 1;
  }
  prepareToClearLateBufferLogId_ = CONCURRENT_MAP_START_INDEX - 1;
  prepareToClearUnSyncedLogId_ = CONCURRENT_MAP_START_INDEX - 1;
}

/**
 * Spawn every worker thread of the replica and store it in threadPool_ under
 * a unique name. totalWorkerNum_ is incremented once per spawned thread;
 * activeWorkerNum_ is reset to 0.
 *
 * Exits the process with status 1 if replicaConfig_.processShards is not 1.
 */
void Replica::LaunchThreads() {
  activeWorkerNum_ = 0;  // Dynamic variable, used as semaphore
  totalWorkerNum_ = 0;   // Static variable to count number of workers

  // RequestReceive: one thread per receiver shard
  for (int shard = 0; shard < replicaConfig_.receiverShards; ++shard) {
    totalWorkerNum_++;
    std::thread* worker =
        new std::thread(&Replica::ReceiveThread, this, shard);
    const std::string name = "ReceiveThread-" + std::to_string(shard);
    threadPool_[name] = worker;
    LOG(INFO) << "Launched " << name << "\t" << worker->native_handle();
  }

  // RequestRecord: one thread per record shard
  for (int shard = 0; shard < replicaConfig_.recordShards; ++shard) {
    totalWorkerNum_++;
    std::thread* worker = new std::thread(&Replica::RecordThread, this, shard);
    const std::string name = "RecordThread-" + std::to_string(shard);
    threadPool_[name] = worker;
    LOG(INFO) << "Launched " << name << "\t" << worker->native_handle();
  }

  // RequestProcess: exactly one thread is supported
  if (replicaConfig_.processShards != 1) {
    LOG(ERROR) << "ProcessThread parallelization is not supported. "
                  "replicaConfig_->processShards must be 1.";
    exit(1);
  }
  for (int shard = 0; shard < replicaConfig_.processShards; ++shard) {
    totalWorkerNum_++;
    std::thread* worker =
        new std::thread(&Replica::ProcessThread, this, shard);
    const std::string name = "ProcessThread-" + std::to_string(shard);
    threadPool_[name] = worker;
    LOG(INFO) << "Launched " << name << "\t" << worker->native_handle();
  }

  // RequestReply: a fast and a slow reply thread per reply shard
  const int replyShardNum = replicaConfig_.replyShards;
  for (int shard = 0; shard < replyShardNum; ++shard) {
    totalWorkerNum_++;
    // Second argument is shard + 1 (0 is not handed to any thread here).
    std::thread* worker =
        new std::thread(&Replica::FastReplyThread, this, shard, shard + 1);
    const std::string name = "FastReplyThread-" + std::to_string(shard);
    threadPool_[name] = worker;
    LOG(INFO) << "Launched " << name << "\t" << worker->native_handle();
  }
  for (int shard = 0; shard < replyShardNum; ++shard) {
    totalWorkerNum_++;
    std::thread* worker =
        new std::thread(&Replica::SlowReplyThread, this, shard);
    const std::string name = "SlowReplyThread-" + std::to_string(shard);
    threadPool_[name] = worker;
    LOG(INFO) << "Launched " << name << "\t" << worker->native_handle();
  }

  // Track
  for (int shard = 0; shard < replicaConfig_.trackShards; ++shard) {
    totalWorkerNum_++;
    std::thread* worker = new std::thread(&Replica::TrackThread, this, shard);
    const std::string name = "TrackThread-" + std::to_string(shard);
    threadPool_[name] = worker;
    LOG(INFO) << "Launched " << name << "\t" << worker->native_handle();
  }

  // IndexSync
  for (int shard = 0; shard < replicaConfig_.indexSyncShards; ++shard) {
    totalWorkerNum_++;
    std::thread* worker = new std::thread(&Replica::IndexSendThread, this,
                                          shard, shard + replyShardNum + 1);
    const std::string name = "IndexSendThread-" + std::to_string(shard);
    threadPool_[name] = worker;
    LOG(INFO) << "Launched " << name << "\t" << worker->native_handle();
    if (!AmLeader()) {
      // follower only needs one sync thread
      break;
    }
  }

  // Singleton workers
  totalWorkerNum_++;
  std::thread* indexRecvWorker =
      new std::thread(&Replica::IndexRecvThread, this);
  threadPool_["IndexRecvThread"] = indexRecvWorker;
  LOG(INFO) << "Launched IndexRecvThread\t"
            << indexRecvWorker->native_handle();

  totalWorkerNum_++;
  std::thread* indexProcessWorker =
      new std::thread(&Replica::IndexProcessThread, this);
  threadPool_["IndexProcessThread"] = indexProcessWorker;
  LOG(INFO) << "Launched IndexProcessThread\t"
            << indexProcessWorker->native_handle();

  totalWorkerNum_++;
  std::thread* missedIndexAckWorker =
      new std::thread(&Replica::MissedIndexAckThread, this);
  threadPool_["MissedIndexAckThread"] = missedIndexAckWorker;
  LOG(INFO) << "Launched MissedIndexAckThread\t"
            << missedIndexAckWorker->native_handle();

  totalWorkerNum_++;
  std::thread* missedReqAckWorker =
      new std::thread(&Replica::MissedReqAckThread, this);
  threadPool_["MissedReqAckThread"] = missedReqAckWorker;
  LOG(INFO) << "Launched MissedReqAckThread\t"
            << missedReqAckWorker->native_handle();

  // GarbageCollectThread is currently disabled:
  // totalWorkerNum_++;
  // threadPool_["GarbageCollectThread"] =
  //     new std::thread(&Replica::GarbageCollectThread, this);
  // LOG(INFO) << "Launch  GarbageCollectThread "
  //           << threadPool_["GarbageCollectThread"]->native_handle();

  totalWorkerNum_++;
  std::thread* owdCalcWorker = new std::thread(&Replica::OWDCalcThread, this);
  threadPool_["OWDCalcThread"] = owdCalcWorker;
  LOG(INFO) << "Launch  OWDCalcThread " << owdCalcWorker->native_handle();

  // LogHash is currently disabled:
  // totalWorkerNum_++;
  // threadPool_["LogHash"] = new std::thread(&Replica::LogHash, this);
  // LOG(INFO) << "Launched IndexRecvThread\t"
  //           << threadPool_["LogHash"]->native_handle();

  LOG(INFO) << "Master Thread " << pthread_self();

  LOG(INFO) << "totalWorkerNum_=" << totalWorkerNum_;
}

/**
 * Handles one message delivered to a request-receiver endpoint.
 *
 * For a well-formed CLIENT_REQUEST this
 *   1. records a one-way-delay sample (receive time - proxy send time) in
 *      owdQu_, but only if the receive time is later than the send time;
 *   2. remembers the sender's address for the proxy id, if not yet known;
 *   3. wraps the request into a RequestBody and hands it to the record queue
 *      selected by reqKey.
 * Messages of another type, or that fail to parse, are logged and dropped.
 *
 * @param msgHdr     header describing the type and length of msgBuffer
 * @param msgBuffer  serialized Request protobuf
 * @param sender     address the message came from (copied, not retained)
 */
void Replica::ReceiveClientRequest(MessageHeader* msgHdr, char* msgBuffer,
                                   Address* sender) {
  if (msgHdr->msgType != MessageType::CLIENT_REQUEST) {
    LOG(WARNING) << "Invalid Message Type " << (uint32_t)(msgHdr->msgType);
    return;
  }

  Request request;
  if (!request.ParseFromArray(msgBuffer, msgHdr->msgLen)) {
    LOG(WARNING) << "Parse request fail";
    return;
  }

  // Collect an OWD sample. The subtraction must use the very timestamp that
  // was compared against sendtime(); taking a second timestamp here could
  // wrap the unsigned difference if the clock steps between the two reads.
  uint64_t recvTime = GetMicrosecondTimestamp();
  if (recvTime > request.sendtime()) {
    owdQu_.enqueue(std::pair<uint64_t, uint32_t>(
        request.proxyid(), recvTime - request.sendtime()));
  }

  if (proxyAddressMap_.get(request.proxyid()) == 0) {
    Address* addr = new Address(*sender);
    /**
     * When a proxy sends a request, it must use a *unique* proxyid that is
     * tied to one specific receiver thread on the replica, so that different
     * receiver threads of this replica never insert the same entry
     * concurrently (otherwise memory may leak).
     *
     * In our proxy implementation each proxy machine has a unique id and
     * several shards. The machine id concatenated with the shard id forms
     * the unique proxy id; the proxy takes it modulo the replica shard
     * number to choose the replica receiver it sends to.
     */
    proxyAddressMap_.assign(request.proxyid(), addr);
  }

  uint64_t reqKey = CONCAT_UINT32(request.clientid(), request.reqid());
  uint64_t deadline = request.sendtime() + request.bound();
  RequestBody* rb =
      new RequestBody(deadline, reqKey, request.key(), request.proxyid(),
                      request.command(), request.iswrite());
  // Shard by reqKey so that the same request always reaches the same
  // record queue.
  uint32_t quId = (reqKey) % recordQu_.size();
  recordQu_[quId].enqueue(rb);
}

/**
 * Parks the calling worker until the replica status equals targetStatus or
 * becomes TERMINATED. Returns immediately if the status already matches.
 *
 * While parked the worker is not counted in activeWorkerNum_; the counter is
 * restored, with waitMutext_ held, just before returning.
 */
void Replica::BlockWhenStatusIsNot(char targetStatus) {
  if (status_ == targetStatus) {
    return;
  }
  activeWorkerNum_.fetch_sub(1);
  std::unique_lock<std::mutex> guard(waitMutext_);
  // Re-check after every wake-up; spurious wake-ups simply wait again.
  while (status_ != ReplicaStatus::TERMINATED && status_ != targetStatus) {
    waitVar_.wait(guard);
  }
  // Unblocked: this worker is active again.
  activeWorkerNum_.fetch_add(1);
}

/**
 * Worker loop that turns raw one-way-delay samples from owdQu_ into a
 * per-proxy moving-percentile estimate stored in owdMap_.
 *
 * Each proxy has a window of at most slidingWindowLen_ samples; once the
 * window has filled, new samples overwrite slots in round-robin order. After
 * at least slidingWindowLen_ samples have been seen for a proxy, the window
 * is copied, sorted, and the element at rank
 * slidingWindowLen_ * movingPercentile_ is published as the estimate.
 */
void Replica::OWDCalcThread() {
  activeWorkerNum_.fetch_add(1);
  std::pair<uint64_t, uint32_t> owdSample;
  while (status_ != ReplicaStatus::TERMINATED) {
    BlockWhenStatusIsNot(ReplicaStatus::NORMAL);
    while (owdQu_.try_dequeue(owdSample)) {
      uint64_t proxyId = owdSample.first;
      uint32_t owd = owdSample.second;
      owdSampleNum_[proxyId]++;
      if (slidingWindow_[proxyId].size() < slidingWindowLen_) {
        // Window still filling up
        slidingWindow_[proxyId].push_back(owd);
      } else {
        // Window full: overwrite in round-robin order
        slidingWindow_[proxyId][owdSampleNum_[proxyId] % slidingWindowLen_] =
            owd;
      }
      if (owdSampleNum_[proxyId] >= slidingWindowLen_) {
        std::vector<uint32_t> tmpSamples(slidingWindow_[proxyId]);
        if (!tmpSamples.empty()) {
          sort(tmpSamples.begin(), tmpSamples.end());
          // A percentile of 1.0 would index one past the end; clamp the rank
          // to the last (largest) sample.
          size_t rank =
              static_cast<size_t>(slidingWindowLen_ * movingPercentile_);
          if (rank >= tmpSamples.size()) {
            rank = tmpSamples.size() - 1;
          }
          uint32_t movingEstimate = tmpSamples[rank];
          owdMap_.assign(proxyId, movingEstimate);
        }
      }
    }
    // Sleep 1ms between polls to reduce CPU cost
    nanosleep((const struct timespec[]){{0, 1000000L}}, NULL);
  }
  uint32_t preVal = activeWorkerNum_.fetch_sub(1);
  LOG(INFO) << "OWDCalcThread Terminated: " << preVal - 1
            << " worker remaining";
}

/**
 * Worker loop for request receiver number `id`: while the replica is not
 * TERMINATED, wait for NORMAL status, register the receiver's context on its
 * endpoint and run the endpoint's loop.
 */
void Replica::ReceiveThread(int id) {
  activeWorkerNum_.fetch_add(1);
  for (;;) {
    if (status_ == ReplicaStatus::TERMINATED) {
      break;
    }
    BlockWhenStatusIsNot(ReplicaStatus::NORMAL);
    requestContext_[id]->Register(endPointType_);
    requestContext_[id]->endPoint_->LoopRun();
  }
  // fetch_sub returns the value before the decrement
  uint32_t workersBefore = activeWorkerNum_.fetch_sub(1);
  uint32_t workersLeft = workersBefore - 1;
  LOG(INFO) << "ReceiveThread Terminated:" << workersLeft
            << " worker remaining";
}

void Replica::RecordThread(int id) {
  activeWorkerNum_.fetch_add(1);
  RequestBody* rb;
  // uint64_t sta, ed, cnt;
  // cnt = 0;
  while (status_ != ReplicaStatus::TERMINATED) {
    BlockWhenStatusIsNot(ReplicaStatus::NORMAL);
    if (recordQu_[id].try_dequeue(rb)) {
      // cnt++;
      // if (cnt == 1) {
      //   sta = GetMicrosecondTimestamp();
      // }
      // if (cnt % 100000 == 0) {
      //   ed = GetMicrosecondTimestamp();
      //   float rate = 100000.0 / ((ed - sta) * 1e-6);
      //   sta = ed;
      //   LOG(INFO) << "id=" << id << "  record rate = " << rate << "\t"
      //             << "recordQuLen=" << recordQu_[id].size_approx() << "\t"
      //             << "processQuLen=" << processQu_.size_approx() << "\t"
      //             << "gap sample =" << ed - rb->deadline
      //             << " \t deadline=" << rb->deadline;
      // }
      /** The map is sharded by reqKey */
      LogEntry* duplicate = recordMap_[id].get(rb->reqKey);
      if (duplicate == NULL) {
        SHA_HASH dummy;
        LogEntry* newEntry = new LogEntry(*rb, dummy, dummy);
        recordMap_[id].assign(rb->reqKey, newEntry);
        processQu_.enqueue(newEntry);

      } else {
        // Duplicate requests
        processQu_.enqueue(duplicate);
      }
      delete rb;
    }
  }
  uint32_t preVal = activeWorkerNum_.fetch_sub(1);
  LOG(INFO) << "RecordThread-" << id << " Terminated: " << preVal - 1
            << " worker remaining";
}

/**
 * Worker loop for tracker number `id`. It follows the `next` pointers of the
 * log starting from trackedEntry_[id] and, for every entry whose logId is
 * congruent to `id` modulo the number of trackers, publishes the entry in
 * syncedLogEntryByLogId_ and syncedLogEntryByReqKey_.
 *
 * The cursor trackedEntry_[id] advances over every entry, including those
 * owned by other trackers.
 */
void Replica::TrackThread(int id) {
  activeWorkerNum_.fetch_add(1);
  while (status_ != ReplicaStatus::TERMINATED) {
    BlockWhenStatusIsNot(ReplicaStatus::NORMAL);
    if (trackedEntry_[id]->next) {
      LogEntry* next = trackedEntry_[id]->next;
      // Only index the entries that belong to this tracker's shard
      if (next->logId % trackedEntry_.size() == (uint32_t)id) {
        if (trackedEntry_[id]->logId >= CONCURRENT_MAP_START_INDEX) {
          // Sanity check: the expectation encoded here is that `next` is
          // exactly trackedEntry_.size() log ids after the cursor. Dump the
          // details before the ASSERT below fires.
          uint32_t a = trackedEntry_[id]->logId;
          uint32_t b = next->logId;
          if (a + trackedEntry_.size() != b) {
            LOG(ERROR) << "myId = " << trackedEntry_[id]->logId << "\t"
                       << "sz = " << trackedEntry_.size() << "\t"
                       << "next=" << next->logId << "\t"
                       << trackedEntry_[id]->logId + trackedEntry_.size()
                       << "\t"
                       << (trackedEntry_[id]->logId + trackedEntry_.size() !=
                           next->logId)
                       << "\t"
                       << "a=" << a << "\t"
                       << "b=" << b;
          }
          ASSERT(trackedEntry_[id]->logId + trackedEntry_.size() ==
                 next->logId);
        }
        syncedLogEntryByLogId_.assign(next->logId, next);
        syncedLogEntryByReqKey_.assign(next->body.reqKey, next);
      }
      // Advance the cursor regardless of which tracker owns `next`
      trackedEntry_[id] = next;
    }
    if (status_ == ReplicaStatus::TERMINATED) {
      LOG(INFO) << "Track Thread terminate ";
    }
  }
  // fetch_sub returns the value before the decrement
  uint32_t preVal = activeWorkerNum_.fetch_sub(1);
  LOG(INFO) << "TrackThread-" << id << " Terminated: " << preVal - 1
            << " worker remaining";
}

/**
 * Worker loop that orders requests by deadline.
 *
 * Every iteration does two things:
 *   1. Takes at most one LogEntry from processQu_ and dispatches on its
 *      status:
 *        INITIAL                  -> admit into earlyBuffer_ if its
 *                                    (deadline, reqKey) rank is larger than
 *                                    the last released rank for its opKey;
 *                                    otherwise the leader rewrites the
 *                                    deadline and admits it, while a
 *                                    follower marks it IN_LATEBUFFER.
 *        IN_PROCESS/IN_LATEBUFFER -> nothing to do (note: `continue` also
 *                                    skips the early-buffer poll below for
 *                                    this iteration).
 *        PROCESSED                -> re-queue for a fast reply.
 *        TO_SLOW_REPLY            -> re-queue for a slow reply.
 *   2. Releases, in rank order, every early-buffer entry whose deadline has
 *      passed, via ProcessRequest().
 *
 * @param id  worker index; not used by the loop body.
 */
void Replica::ProcessThread(int id) {
  activeWorkerNum_.fetch_add(1);
  LogEntry* entry;
  while (status_ != ReplicaStatus::TERMINATED) {
    BlockWhenStatusIsNot(ReplicaStatus::NORMAL);
    bool amLeader = AmLeader();

    if (processQu_.try_dequeue(entry)) {
      if (entry->status == EntryStatus::INITIAL) {
        std::pair<uint64_t, uint64_t> earlyBufferRank(entry->body.deadline,
                                                      entry->body.reqKey);
        if (earlyBufferRank > lastReleasedEntryByKeys_[entry->body.opKey]) {
          earlyBuffer_[earlyBufferRank] = entry;
          entry->status = EntryStatus::IN_PROCESS;
        } else {
          // This entry cannot enter the early buffer with its own deadline
          if (amLeader) {
            // Leader modifies the deadline so that it ranks just after the
            // last released entry of the same key
            entry->body.deadline =
                lastReleasedEntryByKeys_[entry->body.opKey].first + 1;
            earlyBufferRank.first = entry->body.deadline;
            earlyBuffer_[earlyBufferRank] = entry;
            entry->status = EntryStatus::IN_PROCESS;
          } else {
            // Followers leave it in the late buffer
            entry->status = EntryStatus::IN_LATEBUFFER;
          }
        }
      } else if (entry->status == EntryStatus::IN_PROCESS ||
                 entry->status == EntryStatus::IN_LATEBUFFER) {
        continue;
      } else if (entry->status == EntryStatus::PROCESSED) {
        uint32_t quId = (entry->body.reqKey) % fastReplyQu_.size();
        fastReplyQu_[quId].enqueue(entry);
      } else if (entry->status == EntryStatus::TO_SLOW_REPLY) {
        uint32_t quId = (entry->body.reqKey) % slowReplyQu_.size();
        slowReplyQu_[quId].enqueue(entry);
      } else {
        LOG(WARNING) << "Unexpected Entry Status " << (int)(entry->status);
      }
    }

    // Poll the early buffer
    uint64_t nowTime = GetMicrosecondTimestamp();

    // This while loop is safe because there is only one ProcessThread.
    // Parallelization of this thread is not supported.
    while (!earlyBuffer_.empty()) {
      LogEntry* nextEntry = earlyBuffer_.begin()->second;
      if (nowTime < nextEntry->body.deadline) {
        // The smallest-ranked entry is not due yet
        break;
      }
      if (nextEntry->body.isWrite) {
        // Only writes advance the per-key release watermark
        lastReleasedEntryByKeys_[nextEntry->body.opKey] =
            earlyBuffer_.begin()->first;
      }
      // isSyncedReq = amLeader, sendReply = true, canExecute = amLeader
      ProcessRequest(nextEntry, amLeader, true, amLeader);
      earlyBuffer_.erase(earlyBuffer_.begin());
    }
  }
  uint32_t preVal = activeWorkerNum_.fetch_sub(1);
  LOG(INFO) << "ProcessThread Terminated: " << preVal - 1
            << " worker remaining";
}

/**
 * Appends `entry` to the tail of either the synced or the unsynced log and
 * fills in its hashes, neighbour links, log id and status.
 *
 * @param entry        the entry to append
 * @param isSyncedReq  true: use maxSyncedLogEntry_/maxSyncedLogEntryByKey_;
 *                     false: use the unsynced counterparts
 * @param sendReply    when true, the entry is queued for a fast reply
 * @param canExecute   the request is executed through ApplicationExecute()
 *                     only if both isSyncedReq and canExecute are true;
 *                     otherwise the result is the empty string
 */
void Replica::ProcessRequest(LogEntry* entry, const bool isSyncedReq,
                             const bool sendReply, const bool canExecute) {
  RequestBody& rb = entry->body;
  // Read requests do not contribute to the hash
  entry->logHash = entry->entryHash =
      rb.isWrite ? CalculateHash(rb.deadline, rb.reqKey) : SHA_HASH();

  std::vector<LogEntry*>& maxEntryByKey =
      isSyncedReq ? maxSyncedLogEntryByKey_ : maxUnSyncedLogEntryByKey_;
  std::atomic<LogEntry*>& maxEntry =
      isSyncedReq ? maxSyncedLogEntry_ : maxUnSyncedLogEntry_;

  // The previous entry with the same opKey in the selected log (NULL if none)
  entry->prevNonCommutative = maxEntryByKey[rb.opKey];
  if (entry->prevNonCommutative) {
    // The previous *write* on this key is either that entry itself or the
    // one it already points to
    if (entry->prevNonCommutative->body.isWrite) {
      entry->prevNonCommutativeWrite = entry->prevNonCommutative;
    } else {
      entry->prevNonCommutativeWrite =
          entry->prevNonCommutative->prevNonCommutativeWrite;
    }
  }
  entry->prev = maxEntry;

  entry->result = (isSyncedReq && canExecute) ? ApplicationExecute(rb) : "";

  // logHash accumulates by XOR over the chain of writes on the same key
  if (entry->prevNonCommutativeWrite) {
    entry->logHash.XOR(entry->prevNonCommutativeWrite->logHash);
  }
  ASSERT(entry->prev != NULL);
  entry->logId = entry->prev->logId + 1;
  entry->status = EntryStatus::PROCESSED;

  // Forward links are written after the entry's own fields are filled in
  if (entry->prevNonCommutative) {
    entry->prevNonCommutative->nextNonCommutative = entry;
  }
  if (entry->prevNonCommutativeWrite && rb.isWrite) {
    entry->prevNonCommutativeWrite->nextNonCommutativeWrite = entry;
  }
  // First unsynced entry seen for this key becomes the per-key minimum
  if (isSyncedReq == false && minUnSyncedLogEntryByKey_[rb.opKey] == NULL) {
    minUnSyncedLogEntryByKey_[rb.opKey] = entry;
  }
  entry->prev->next = entry;

  // Finally move the tail pointers to the new entry
  maxEntryByKey[rb.opKey] = entry;
  maxEntry = entry;

  if (sendReply) {
    uint32_t quId = (entry->body.reqKey) % fastReplyQu_.size();
    fastReplyQu_[quId].enqueue(entry);
  }
}

/**
 * Worker loop for fast-reply shard `id`. It takes processed entries from
 * fastReplyQu_[id], builds a FAST_REPLY and sends it to the proxy that
 * issued the request.
 *
 * The reply hash is the entry's accumulated log hash XORed with the hash of
 * the crash vector in use. On a follower the hash is additionally adjusted
 * so that the already-synced prefix is represented by the synced log and
 * only the remaining part comes from the unsynced log.
 *
 * @param id    index into fastReplyQu_, fastReplySender_ and
 *              safeToClearUnSyncedLogId_
 * @param cvId  index of this thread's slot in crashVectorInUse_
 *
 * The single `reply` object is reused across iterations, so any field that a
 * branch does not set keeps the value from an earlier reply.
 */
void Replica::FastReplyThread(int id, int cvId) {
  activeWorkerNum_.fetch_add(1);
  Reply reply;
  reply.set_replytype(MessageType::FAST_REPLY);
  reply.set_replicaid(replicaId_);
  CrashVectorStruct* cv = crashVectorInUse_[cvId];
  // Counted below; only the (removed) debug logging ever read it
  uint32_t replyNum = 0;
  while (status_ != ReplicaStatus::TERMINATED) {
    BlockWhenStatusIsNot(ReplicaStatus::NORMAL);
    bool amLeader = AmLeader();
    // Acknowledge the pending clear point for unsynced logs in this
    // thread's slot
    safeToClearUnSyncedLogId_[id].store(prepareToClearUnSyncedLogId_.load());
    // Before encoding crashVector into hash, check whether the crashVector
    // (cv) is the freshest one
    CrashVectorStruct* masterCV = crashVectorInUse_[0].load();
    if (cv->version_ < masterCV->version_) {
      // My crash vector is stale, update it
      crashVectorInUse_[cvId] = masterCV;
      cv = masterCV;
    }
    LogEntry* entry = NULL;
    if (fastReplyQu_[id].try_dequeue(entry)) {
      reply.set_iswrite(entry->body.isWrite);
      reply.set_opkey(entry->body.opKey);
      replyNum++;
      Address* addr = proxyAddressMap_.get(entry->body.proxyId);
      if (!addr) {
        // The replica cannot find the address to send the reply to.
        // This can happen in rare edge cases, e.g.,
        // Step 1: This replica misses the entry
        // Step 2: Another replica gives this replica the missing entry
        // Step 3: This replica has not received any entries from that proxy,
        //         so it does not have any addr info
        // Step 4: This replica wants to send a reply for this entry
        LOG(ERROR) << "Cannot find the address of the proxy "
                   << HIGH_32BIT(entry->body.proxyId) << "-"
                   << LOW_32BIT(entry->body.proxyId);
        continue;
      }
      reply.set_view(viewId_);
      reply.set_clientid(HIGH_32BIT(entry->body.reqKey));
      reply.set_reqid(LOW_32BIT(entry->body.reqKey));
      reply.set_result(entry->result);
      // If the owdMap_ does not have the proxyId (i.e. the owd for this
      // proxyId has not been estimated), it will return 0 (0 happens to be
      // the dummy value of protobuf, and the proxy will not consider it as an
      // estimated owd)
      reply.set_owd(owdMap_.get(entry->body.proxyId));

      SHA_HASH hash(entry->logHash);
      hash.XOR(cv->cvHash_);
      if (amLeader) {
        // Leader's logic is very easy: after XORing the crashVector and the
        // log entry hash together, it can directly reply
        reply.set_hash(hash.hash, SHA_DIGEST_LENGTH);
        reply.set_logid(entry->logId);
        reply.set_maxsyncedlogid(maxSyncedLogEntry_.load()->logId);

        uint32_t proxyMachineId = HIGH_32BIT(entry->body.proxyId);
        repliedSyncPoint_[proxyMachineId] = reply.maxsyncedlogid();
        fastReplySender_[id]->SendMsgTo(*addr, reply, MessageType::FAST_REPLY);
      } else {
        // The follower's hash is a bit more complicated, because it needs to
        // consider both synced entries and unsynced entries, i.e. we need to
        // (1) eliminate the part to the left of the sync-point and (2) use
        // the remaining part (to the right of the sync-point) to XOR the
        // part that has already been synced.

        // First get the boundary. The original reasoning: the max-synced
        // side is always updated earlier than the min-unsynced side, so we
        // read min-unsynced first and max-synced second; this ensures
        // min-unsynced is no fresher than max-synced. Reading them in the
        // reverse order would leave it unknown which one is fresher, which
        // can lead to missing some entries during hash calculation.

        LogEntry* unsyncedEntry = minUnSyncedLogEntryByKey_[entry->body.opKey];
        LogEntry* syncedEntry = maxSyncedLogEntryByKey_[entry->body.opKey];

        if (syncedEntry && syncedEntry->body.isWrite == false) {
          // Only writes matter: step back to the latest synced write
          syncedEntry = syncedEntry->prevNonCommutativeWrite;
          assert(syncedEntry == NULL || syncedEntry->body.isWrite);
        }
        if (syncedEntry == NULL) {
          // The index sync process may not have started, or may not have
          // caught up; or the unsynced logs have been reclaimed by the
          // garbage collection thread (we have advanced
          // safeToClearUnSyncedLogId_). We cannot decide the sync-point, so
          // we directly reply with the XORed hash (similar to the leader).
          // Unlike the leader branch, logid is not set here.
          reply.set_hash(hash.hash, SHA_DIGEST_LENGTH);
          reply.set_maxsyncedlogid(maxSyncedLogEntry_.load()->logId);
          uint32_t proxyMachineId = HIGH_32BIT(entry->body.proxyId);
          repliedSyncPoint_[proxyMachineId] = reply.maxsyncedlogid();
          fastReplySender_[id]->SendMsgTo(*addr, reply,
                                          MessageType::FAST_REPLY);
        } else {
          // The follower already has some synced non-commutative logs (via
          // the index sync process)

          // Log entries up to syncedEntry are all synced;
          // syncedEntry->logHash represents them
          if (entry->LessOrEqual(*syncedEntry)) {
            // No need to send a fast reply, because this entry has already
            // been covered by the index sync process. Just give a dummy
            // reply (clientid/reqid/logid all 0) that carries the
            // max-synced-log-id, and only if this proxy machine has not been
            // told about that sync point yet.
            // NOTE(review): repliedSyncPoint_ is not updated after this
            // send, and the hash field keeps whatever an earlier reply left
            // in it -- confirm both are intended.
            uint32_t proxyMachineId = HIGH_32BIT(entry->body.proxyId);
            if (repliedSyncPoint_[proxyMachineId] <
                maxSyncedLogEntry_.load()->logId) {
              reply.set_clientid(0);
              reply.set_reqid(0);
              reply.set_logid(0);
              reply.set_maxsyncedlogid(maxSyncedLogEntry_.load()->logId);
              fastReplySender_[id]->SendMsgTo(*addr, reply,
                                              MessageType::FAST_REPLY);
            }
          } else {
            // Beyond syncedEntry, we need to find the boundary in the
            // unsynced logs.
            // TODO: Check the following
            // Since unsyncedEntry is no fresher (maybe older) than
            // syncedEntry, unsyncedEntry may have already been surpassed by
            // syncedEntry; we need to remove the (potential) overlap.
            // NOTE(review): both arms of the isWrite test below do the same
            // thing, and unsyncedEntry is dereferenced without a NULL
            // check -- confirm against the invariants of
            // minUnSyncedLogEntryByKey_.

            while (unsyncedEntry->LessOrEqual(*syncedEntry)) {
              if (unsyncedEntry->body.isWrite) {
                if (unsyncedEntry->nextNonCommutative) {
                  unsyncedEntry = unsyncedEntry->nextNonCommutative;
                } else {
                  break;
                }
              } else {
                if (unsyncedEntry->nextNonCommutative) {
                  unsyncedEntry = unsyncedEntry->nextNonCommutative;
                } else {
                  break;
                }
              }
            }
            // hash encodes all the (unsynced) entries up to entry.
            // XORing with unsyncedEntry->logHash removes everything up to
            // and including unsyncedEntry.
            hash.XOR(unsyncedEntry->logHash);
            if (syncedEntry->LessThan(*unsyncedEntry)) {
              // unsyncedEntry itself is beyond the sync-point: add it back
              // (the entryHash of a read request is 0)
              hash.XOR(unsyncedEntry->entryHash);
            }
            // Now hash only encodes [unsyncedEntry, entry].
            // Add the synced part.
            hash.XOR(syncedEntry->logHash);
            reply.set_hash(hash.hash, SHA_DIGEST_LENGTH);
            reply.set_maxsyncedlogid(maxSyncedLogEntry_.load()->logId);
            uint32_t proxyMachineId = HIGH_32BIT(entry->body.proxyId);
            repliedSyncPoint_[proxyMachineId] = reply.maxsyncedlogid();
            fastReplySender_[id]->SendMsgTo(*addr, reply,
                                            MessageType::FAST_REPLY);
          }
        }
      }
    }
  }
  // fetch_sub returns the value before the decrement
  uint32_t preVal = activeWorkerNum_.fetch_sub(1);
  LOG(INFO) << "Fast Reply Terminated " << preVal - 1 << " worker remaining";
}

void Replica::SlowReplyThread(int id) {
  activeWorkerNum_.fetch_add(1);
  Reply reply;
  reply.set_replicaid(replicaId_);
  reply.set_hash("");
  // uint32_t replyNum = 0;
  // uint64_t startTime, endTime;
  while (status_ != ReplicaStatus::TERMINATED) {
    BlockWhenStatusIsNot(ReplicaStatus::NORMAL);
    if (AmLeader()) {
      // Leader does not send slow replies
      nanosleep((const struct timespec[]){{0, 1000000L}}, NULL);
      continue;
    }
    LogEntry* entry = NULL;
    if (slowReplyQu_[id].try_dequeue(entry)) {
      uint32_t logId = entry->logId;
      reply.set_view(viewId_);
      reply.set_clientid((entry->body.reqKey) >> 32);
      reply.set_reqid((uint32_t)(entry->body.reqKey));
      // Optimize: SLOW_REPLY => COMMIT_REPLY
      if (logId <= committedLogId_) {
        reply.set_replytype(MessageType::COMMIT_REPLY);
        reply.set_result(entry->result);
      } else {
        reply.set_replytype(MessageType::SLOW_REPLY);
        reply.set_result("");
      }
      reply.set_owd(owdMap_.get(entry->body.proxyId));
      reply.set_maxsyncedlogid(maxSyncedLogEntry_.load()->logId);

      Address* addr = proxyAddressMap_.get(entry->body.proxyId);
      if (addr) {
        slowReplySender_[id]->SendMsgTo(*addr, reply, MessageType::SLOW_REPLY);
      }
      // replyNum++;
      // if (replyNum == 1) {
      //   startTime = GetMicrosecondTimestamp();
      // } else if (replyNum % 100000 == 0) {
      //   endTime = GetMicrosecondTimestamp();
      //   float rate = 100000 / ((endTime - startTime) * 1e-6);
      //   LOG(INFO) << "id=" << id << "\t Slow Reply Rate=" << rate
      //             << "\t QuLen=" << slowReplyQu_[id].size_approx() << "\t"
      //             << "pendingIndexSync_ qu =" << pendingIndexSync_.size();
      //   startTime = endTime;
      // }
    }
  }
  uint32_t preVal = activeWorkerNum_.fetch_sub(1);
  LOG(INFO) << "SlowReplyThread Terminated " << preVal - 1
            << " worker remaining ";
}

/**
 * Worker loop that, on the leader, periodically broadcasts an IndexSync
 * message to every other replica. The message carries the (deadline, reqKey)
 * pairs of the synced log entries this thread has not sent yet, limited per
 * message by indexTransferBatch_. When there is nothing new, the message is
 * empty (logidbegin > logidend) and serves as the leader's heartbeat.
 *
 * On a follower the thread stays alive but only sleeps, so that it is
 * available at once if this replica becomes the leader.
 *
 * @param id    index of the sender endpoint in indexSender_
 * @param cvId  not used by the loop body
 */
void Replica::IndexSendThread(int id, int cvId) {
  activeWorkerNum_.fetch_add(1);
  LogEntry* lastSyncedEntry = syncedLogEntryHead_;
  IndexSync indexSyncMsg;
  uint32_t syncPeriod = replicaConfig_.indexSyncPeriodUs;
  // nanosleep() rejects tv_nsec >= 1e9 with EINVAL, and syncPeriod * 1000
  // would also overflow 32 bits for long periods, so split the period into
  // whole seconds and the remaining nanoseconds.
  struct timespec sleepIntval;
  sleepIntval.tv_sec = syncPeriod / 1000000u;
  sleepIntval.tv_nsec = (syncPeriod % 1000000u) * 1000L;
  while (status_ != ReplicaStatus::TERMINATED) {
    BlockWhenStatusIsNot(ReplicaStatus::NORMAL);
    if (!AmLeader()) {
      // Although this replica is not the leader currently, we still keep
      // this thread. When it becomes the leader we can immediately use the
      // thread instead of (slowly) launching extra threads.
      nanosleep((const struct timespec[]){{0, 1000000L}}, NULL);
      continue;
    }
    if (maxSyncedLogEntry_ == NULL) {
      continue;
    }

    // (1) Leader has some indices to sync
    // (2) There is nothing to send, but we still send an indexSync msg every
    // sync period (to serve as the leader's heartbeat)
    indexSyncMsg.set_view(viewId_);
    indexSyncMsg.set_logidbegin(lastSyncedEntry->logId + 1);
    uint32_t logEnd = maxSyncedLogEntry_.load()->logId;
    logEnd = std::min(indexSyncMsg.logidbegin() + indexTransferBatch_, logEnd);
    indexSyncMsg.set_logidend(logEnd);
    indexSyncMsg.clear_deadlines();
    indexSyncMsg.clear_reqkeys();
    for (uint32_t i = indexSyncMsg.logidbegin(); i <= indexSyncMsg.logidend();
         i++) {
      // Walk the log forward from the last entry already sent
      LogEntry* entry = lastSyncedEntry->next;
      ASSERT(entry != NULL);
      ASSERT(entry->logId == i);
      indexSyncMsg.add_deadlines(entry->body.deadline);
      indexSyncMsg.add_reqkeys(entry->body.reqKey);
      lastSyncedEntry = entry;
    }

    indexSyncMsg.set_sendtime(GetMicrosecondTimestamp());
    // Send to all followers
    for (uint32_t r = 0; r < replicaNum_; r++) {
      if (r != replicaId_) {
        indexSender_[id]->SendMsgTo(*(indexReceiver_[r]), indexSyncMsg,
                                    MessageType::SYNC_INDEX);
      }
    }

    nanosleep(&sleepIntval, NULL);
  }
  uint32_t preVal = activeWorkerNum_.fetch_sub(1);
  LOG(INFO) << "IndexSendThread Terminated " << preVal - 1
            << " worker remaining";
}

/**
 * Worker loop for the index-sync receiver: while the replica is not
 * TERMINATED, wait for NORMAL status, register indexSyncContext_ on its
 * endpoint and run the endpoint's loop.
 */
void Replica::IndexRecvThread() {
  activeWorkerNum_.fetch_add(1);
  for (;;) {
    if (status_ == ReplicaStatus::TERMINATED) {
      break;
    }
    BlockWhenStatusIsNot(ReplicaStatus::NORMAL);
    indexSyncContext_->Register(endPointType_);
    indexSyncContext_->endPoint_->LoopRun();
  }
  // fetch_sub returns the value before the decrement
  uint32_t workersBefore = activeWorkerNum_.fetch_sub(1);
  uint32_t workersLeft = workersBefore - 1;
  LOG(INFO) << "IndexRecvThread Terminated " << workersLeft
            << " worker remaining";
}

/**
 * Receive-side handler for index-sync traffic. It publishes the current
 * clear points and then queues a private copy of the message on indexQu_
 * for IndexProcessThread. The consumer of indexQu_ owns the copies and must
 * free them.
 *
 * @param msgHdr     header of the received message (type and length)
 * @param msgBuffer  payload of msgHdr->msgLen bytes; not retained
 */
void Replica::ReceiveIndexSyncMessage(MessageHeader* msgHdr, char* msgBuffer) {
  // Promise to the GarbageCollectThread that the data before
  // safeToClearLateBufferLogId_ and safeToClearUnSyncedLogId_ will no longer
  // be used here, so that it can safely reclaim them. This handler uses the
  // slot right after the fast-reply threads' slots.
  safeToClearLateBufferLogId_.store(prepareToClearLateBufferLogId_.load());
  safeToClearUnSyncedLogId_[fastReplyQu_.size()].store(
      prepareToClearUnSyncedLogId_.load());

  // Deep-copy header and payload before queuing them
  MessageHeader* headerCopy =
      new MessageHeader(msgHdr->msgType, msgHdr->msgLen);
  char* payloadCopy = new char[msgHdr->msgLen];
  memcpy(payloadCopy, msgBuffer, msgHdr->msgLen);

  indexQu_.enqueue({headerCopy, payloadCopy});
}

/**
 * Worker loop that consumes the (header, buffer) pairs queued on indexQu_
 * and frees them after handling.
 *
 *   SYNC_INDEX: after the view check, refresh the heartbeat time. A message
 *               with logidbegin > logidend is a pure heartbeat. Otherwise
 *               the message is stored in pendingIndexSync_ if it reaches
 *               beyond the current max synced log id, and pending messages
 *               are processed in key order until one cannot be applied yet.
 *   MISSED_REQ: every returned request that is still in missedReqKeys_ is
 *               handed to the record queue selected by its reqKey, and the
 *               fetch latency is recorded.
 *
 * Every exit path of the inner loop body must release msgHdr and msgBuffer;
 * they are heap copies owned by this thread.
 */
void Replica::IndexProcessThread() {
  activeWorkerNum_.fetch_add(1);
  std::pair<MessageHeader*, char*> ele;
  while (status_ != ReplicaStatus::TERMINATED) {
    BlockWhenStatusIsNot(ReplicaStatus::NORMAL);
    while (indexQu_.try_dequeue(ele)) {
      MessageHeader* msgHdr = ele.first;
      char* msgBuffer = ele.second;
      if (msgHdr->msgType == MessageType::SYNC_INDEX) {
        IndexSync idxSyncMsg;
        if (idxSyncMsg.ParseFromArray(msgBuffer, msgHdr->msgLen)) {
          if (!CheckView(idxSyncMsg.view(), false)) {
            // Leave the drain loop; the outer loop re-checks the status
            delete msgHdr;
            delete[] msgBuffer;
            break;
          }

          lastHeartBeatTime_ = GetMicrosecondTimestamp();
          if (idxSyncMsg.logidbegin() > idxSyncMsg.logidend()) {
            // Pure heart beat: nothing to apply, but the copies still have
            // to be released before moving on to the next message
            delete msgHdr;
            delete[] msgBuffer;
            continue;
          }

          if (idxSyncMsg.logidend() > maxSyncedLogEntry_.load()->logId) {
            std::pair<uint32_t, uint32_t> key(idxSyncMsg.logidbegin(),
                                              idxSyncMsg.logidend());
            pendingIndexSync_[key] = idxSyncMsg;
          }
          // Process pendingIndexSync, if any
          while (!pendingIndexSync_.empty()) {
            if (ProcessIndexSync(pendingIndexSync_.begin()->second)) {
              pendingIndexSync_.erase(pendingIndexSync_.begin());
            } else {
              // Blocked on missing indices or requests; retry later
              break;
            }
          }
        }
      } else if (msgHdr->msgType == MessageType::MISSED_REQ) {
        MissedReq missedReqMsg;
        if (missedReqMsg.ParseFromArray(msgBuffer, msgHdr->msgLen)) {
          for (int i = 0; i < missedReqMsg.reqs().size(); i++) {
            const RequestBodyMsg& rbMsg = missedReqMsg.reqs(i);
            if (missedReqKeys_.find(rbMsg.reqkey()) != missedReqKeys_.end()) {
              RequestBody* rb = new RequestBody(
                  rbMsg.deadline(), rbMsg.reqkey(), rbMsg.key(),
                  rbMsg.proxyid(), rbMsg.command(), rbMsg.iswrite());
              // We must hand it over through the record queue instead of
              // processing it here, to avoid a data race (and further
              // memory leakage), although the possibility is small
              uint32_t quId = rbMsg.reqkey() % recordQu_.size();
              recordQu_[quId].enqueue(rb);

              missedReqKeys_.erase(rbMsg.reqkey());
              fetchTime_.push_back(GetMicrosecondTimestamp() -
                                   askTimebyReqKey_[rbMsg.reqkey()]);
              askTimebyReqKey_.erase(rbMsg.reqkey());
            }
          }
        }
      } else {
        LOG(WARNING) << "Unexpected msg type " << (int)(msgHdr->msgType);
      }
      delete msgHdr;
      delete[] msgBuffer;
    }
  }
  uint32_t preVal = activeWorkerNum_.fetch_sub(1);
  LOG(INFO) << "IndexProcessThread Terminated: " << preVal - 1
            << " worker remaining";
}

/**
 * Applies one IndexSync message to the follower's synced log.
 *
 * @param idxSyncMsg  indices for log ids [logidbegin, logidend]
 * @return true  if the message is fully applied or entirely stale, so the
 *               caller may discard it;
 *         false if it cannot be applied yet because earlier indices are
 *               missing (AskMissedIndex() is invoked) or some requests are
 *               not available locally (AskMissedRequest() is invoked).
 *
 * For each new log id the request is looked up in recordMap_. If it is found
 * and no request is currently missing, a new LogEntry is appended to the
 * synced log, queued for a slow reply, and the per-key minimum of the
 * unsynced log is advanced past it. Otherwise the reqKey is added to
 * missedReqKeys_.
 */
bool Replica::ProcessIndexSync(const IndexSync& idxSyncMsg) {
  uint32_t maxSyncedLogId = maxSyncedLogEntry_.load()->logId;
  if (idxSyncMsg.logidend() <= maxSyncedLogId) {
    // This idxSyncMsg is useless
    return true;
  }
  if (idxSyncMsg.logidbegin() > maxSyncedLogId + 1) {
    // Missing some indices
    missedIndices_ = {maxSyncedLogId + 1, idxSyncMsg.logidbegin() - 1};
    AskMissedIndex();
    return false;
  }

  // Coming here means no index is missing
  if (indexSyncContext_->endPoint_->isTimerRegistered(indexAskTimer_)) {
    indexSyncContext_->endPoint_->UnRegisterTimer(indexAskTimer_);
  }

  for (uint32_t logId = maxSyncedLogId + 1; logId <= idxSyncMsg.logidend();
       logId++) {
    uint32_t offset = logId - idxSyncMsg.logidbegin();
    uint64_t reqKey = idxSyncMsg.reqkeys(offset);
    uint64_t deadline = idxSyncMsg.deadlines(offset);
    uint32_t quId = reqKey % recordMap_.size();
    LogEntry* entry = recordMap_[quId].get(reqKey);
    if (entry && missedReqKeys_.empty()) {
      // entryHash (myHash) covers this entry only; logHash (hash)
      // accumulates the writes on the same key. Reads keep the default hash.
      SHA_HASH myHash;
      SHA_HASH hash;
      if (entry->body.isWrite) {
        myHash = CalculateHash(deadline, reqKey);
        hash = myHash;
      }
      LogEntry* prevNonCommutative = maxSyncedLogEntryByKey_[entry->body.opKey];
      LogEntry* prevNonCommutativeWrite = NULL;
      if (prevNonCommutative) {
        if (prevNonCommutative->body.isWrite) {
          prevNonCommutativeWrite = prevNonCommutative;
        } else {
          prevNonCommutativeWrite = prevNonCommutative->prevNonCommutativeWrite;
        }
      }
      assert(prevNonCommutativeWrite == NULL ||
             prevNonCommutativeWrite->body.isWrite);
      if (prevNonCommutativeWrite) {
        // This request has some previous non-commutative ones.
        // In that case, XOR the previous accumulated hash.
        hash.XOR(prevNonCommutativeWrite->logHash);
      }
      LogEntry* newEntry =
          new LogEntry(entry->body, myHash, hash, prevNonCommutative, NULL,
                       prevNonCommutativeWrite, NULL, maxSyncedLogEntry_, NULL);
      newEntry->status = EntryStatus::TO_SLOW_REPLY;
      newEntry->logId = logId;
      // The new entry must directly follow the current tail
      ASSERT(logId == maxSyncedLogEntry_.load()->logId + 1);
      maxSyncedLogEntry_.load()->next = newEntry;
      if (prevNonCommutative) {
        prevNonCommutative->nextNonCommutative = newEntry;
      }
      if (newEntry->body.isWrite && prevNonCommutativeWrite) {
        prevNonCommutativeWrite->nextNonCommutativeWrite = newEntry;
      }
      maxSyncedLogEntry_ = newEntry;
      // The contiguity check against the previous tail is already made
      // before the tail moves (see the ASSERT above), so only the new tail
      // is verified here.
      ASSERT(maxSyncedLogEntry_.load()->logId == logId);

      maxSyncedLogEntryByKey_[newEntry->body.opKey] = newEntry;
      uint32_t slowQuId = (newEntry->body.reqKey) % slowReplyQu_.size();
      slowReplyQu_[slowQuId].enqueue(newEntry);

      ASSERT(newEntry->prev->logId + 1 == newEntry->logId);
      // TODO: Think about the order above

      // Chunk UnSynced logs
      if (minUnSyncedLogEntryByKey_[newEntry->body.opKey]) {
        // Try to advance minUnSyncedLogEntryByKey_[opKey] past the entry
        // that has just been synced
        LogEntry* unSyncedEntry =
            minUnSyncedLogEntryByKey_[newEntry->body.opKey];

        while (unSyncedEntry->LessOrEqual(*entry)) {
          if (unSyncedEntry->body.isWrite) {
            if (unSyncedEntry->nextNonCommutativeWrite) {
              unSyncedEntry = unSyncedEntry->nextNonCommutativeWrite;
            } else {
              break;
            }
          } else {
            if (unSyncedEntry->nextNonCommutative) {
              unSyncedEntry = unSyncedEntry->nextNonCommutative;
            } else {
              break;
            }
          }
        }
        minUnSyncedLogEntryByKey_[newEntry->body.opKey] = unSyncedEntry;
      }
    } else {
      // Either the request is unknown locally, or an earlier request is
      // still missing and the log cannot be extended out of order
      missedReqKeys_.insert(reqKey);
    }
  }
  if (missedReqKeys_.empty()) {
    return true;
  } else {
    AskMissedRequest();
    return false;
  }
}

// Worker thread body for the missed-index-ack context.
// Counts itself in activeWorkerNum_ on entry. Until status_ becomes
// TERMINATED it repeatedly calls BlockWhenStatusIsNot(NORMAL), registers
// missedIndexAckContext_ with the configured endpoint type and runs that
// context's endpoint loop. On exit it removes itself from activeWorkerNum_.
void Replica::MissedIndexAckThread() {
  activeWorkerNum_.fetch_add(1);
  for (;;) {
    if (status_ == ReplicaStatus::TERMINATED) {
      break;
    }
    BlockWhenStatusIsNot(ReplicaStatus::NORMAL);
    missedIndexAckContext_->Register(endPointType_);
    missedIndexAckContext_->endPoint_->LoopRun();
  }
  // fetch_sub yields the counter value from before the decrement.
  const uint32_t workersBefore = activeWorkerNum_.fetch_sub(1);
  LOG(INFO) << "MissedIndexAckThread Terminated " << workersBefore - 1
            << " worker remaining";
}

// Handles a MISSED_INDEX_ASK message: replies to the asking replica with the
// (deadline, reqKey) pairs of the synced log entries in the requested range,
// split into IndexSync messages of at most indexTransferBatch_ entries each.
//
// msgHdr    - header of the received message (type and payload length).
// msgBuffer - serialized AskIndex payload.
//
// Messages of any other type, or payloads that fail to parse, are ignored.
void Replica::ReceiveAskMissedIdx(MessageHeader* msgHdr, char* msgBuffer) {
  AskIndex askIndex;
  if (msgHdr->msgType == MessageType::MISSED_INDEX_ASK &&
      askIndex.ParseFromArray(msgBuffer, msgHdr->msgLen)) {
    uint32_t logBegin = askIndex.logidbegin();
    // Never answer beyond what this replica has synced so far.
    uint32_t logEnd =
        std::min(maxSyncedLogEntry_.load()->logId, askIndex.logidend());
    for (uint32_t i = logBegin; i <= logEnd; i += indexTransferBatch_) {
      IndexSync indexSyncMsg;
      indexSyncMsg.set_view(viewId_);
      indexSyncMsg.set_logidbegin(i);
      uint32_t end = std::min(i + indexTransferBatch_ - 1, logEnd);
      indexSyncMsg.set_logidend(end);
      uint32_t logid = i;
      LogEntry* entryStart = syncedLogEntryByLogId_.get(logid);
      if (!entryStart) {
        // Since the update of syncedLogEntryByLogId_ may lag a bit behind
        // maxSyncedLogEntry_. entryStart may be NULL. In that case, we
        // terminate here
        break;
      }

      ASSERT(entryStart->logId == logid);
      uint32_t lastIncluded = logid;
      // The chain ends with a NULL `next` at the tail of the synced log, which
      // is reached whenever `end` is the newest synced entry. Guard against
      // that instead of dereferencing NULL.
      while (entryStart != NULL && entryStart->logId <= end) {
        indexSyncMsg.add_deadlines(entryStart->body.deadline);
        indexSyncMsg.add_reqkeys(entryStart->body.reqKey);
        lastIncluded = entryStart->logId;
        entryStart = entryStart->next;
      }
      // If the chain ended before reaching `end`, advertise only the range
      // that is really contained in the message.
      const bool truncated = (entryStart == NULL && lastIncluded < end);
      if (truncated) {
        indexSyncMsg.set_logidend(lastIncluded);
      }
      indexAcker_->SendMsgTo(*(indexReceiver_[askIndex.replicaid()]),
                             indexSyncMsg, MessageType::SYNC_INDEX);
      if (truncated) {
        // Nothing further can be served contiguously right now.
        break;
      }
    }
  }
}

// Worker thread body for the missed-request-ack context.
// Counts itself in activeWorkerNum_ on entry. Until status_ becomes
// TERMINATED it repeatedly calls BlockWhenStatusIsNot(NORMAL), registers
// missedReqAckContext_ with the configured endpoint type and runs that
// context's endpoint loop. On exit it removes itself from activeWorkerNum_.
void Replica::MissedReqAckThread() {
  activeWorkerNum_.fetch_add(1);
  for (;;) {
    if (status_ == ReplicaStatus::TERMINATED) {
      break;
    }
    BlockWhenStatusIsNot(ReplicaStatus::NORMAL);
    missedReqAckContext_->Register(endPointType_);
    missedReqAckContext_->endPoint_->LoopRun();
  }
  // fetch_sub yields the counter value from before the decrement.
  const uint32_t workersBefore = activeWorkerNum_.fetch_sub(1);
  LOG(INFO) << "MissedReqAckThread Terminated " << workersBefore - 1
            << " worker remaining";
}

// Handles a MISSED_REQ_ASK message: looks up every requested reqKey in
// recordMap_ and sends the request bodies that were found back to the asking
// replica as MISSED_REQ messages, flushing whenever requestTrasnferBatch_
// requests have accumulated.
//
// msgHdr    - header of the received message (type and payload length).
// msgBuffer - serialized AskReq payload.
//
// Messages of any other type, or payloads that fail to parse, are ignored.
void Replica::ReceiveAskMissedReq(MessageHeader* msgHdr, char* msgBuffer) {
  AskReq askReqMsg;
  const bool usable = msgHdr->msgType == MessageType::MISSED_REQ_ASK &&
                      askReqMsg.ParseFromArray(msgBuffer, msgHdr->msgLen);
  if (!usable) {
    return;
  }

  MissedReq missedReqMsg;
  missedReqMsg.set_replicaid(this->replicaId_);

  // Ships whatever is currently accumulated in missedReqMsg to the asker.
  auto shipBatch = [&]() {
    missedReqAckContext_->endPoint_->SendMsgTo(
        *(indexReceiver_[askReqMsg.replicaid()]), missedReqMsg,
        MessageType::MISSED_REQ);
  };

  for (int idx = 0; idx < askReqMsg.missedreqkeys_size(); ++idx) {
    const uint64_t wantedKey = askReqMsg.missedreqkeys(idx);
    const uint32_t shard = wantedKey % recordMap_.size();
    LogEntry* found = recordMap_[shard].get(wantedKey);
    if (found) {
      RequestBodyToMessage(found->body, missedReqMsg.add_reqs());
    }
    const uint32_t pending = (uint32_t)(missedReqMsg.reqs_size());
    if (pending >= requestTrasnferBatch_) {
      shipBatch();
      missedReqMsg.clear_reqs();
    }
  }

  // Only send the remainder when it actually carries some requests.
  if (missedReqMsg.reqs_size() > 0) {
    shipBatch();
  }
}

// Copies every field of an in-memory RequestBody into its protobuf
// counterpart.
//
// rb    - source request body.
// rbMsg - destination message; must be non-NULL, its fields are overwritten.
void Replica::RequestBodyToMessage(const RequestBody& rb,
                                   RequestBodyMsg* rbMsg) {
  // Identification of the request.
  rbMsg->set_reqkey(rb.reqKey);
  rbMsg->set_proxyid(rb.proxyId);
  rbMsg->set_deadline(rb.deadline);
  // The operation itself.
  rbMsg->set_key(rb.opKey);
  rbMsg->set_iswrite(rb.isWrite);
  rbMsg->set_command(rb.command);
}

void Replica::AskMissedIndex() {
  if (missedIndices_.first > missedIndices_.second) {
    // indexSyncContext_->endPoint_->UnRegisterTimer(indexAskTimer_);
    return;
  }
  uint64_t nowTime = GetMicrosecondTimestamp();
  if (lastAskMissedIndexTime_ + 50 > nowTime) {
    return;
  }
  AskIndex askIndexMsg;
  askIndexMsg.set_replicaid(this->replicaId_);
  askIndexMsg.set_logidbegin(missedIndices_.first);
  askIndexMsg.set_logidend(missedIndices_.second);

  // roundRobinIndexAskIdx_ = 0;// Debug

  // Do not ask leader every time, choose random replica to ask to avoid
  // leader bottleneck
  indexRequester_->SendMsgTo(
      *(indexAskReceiver_[roundRobinIndexAskIdx_ % replicaNum_]), askIndexMsg,
      MessageType::MISSED_INDEX_ASK);
  roundRobinIndexAskIdx_++;
  if (roundRobinIndexAskIdx_ % replicaNum_ == replicaId_) {
    roundRobinIndexAskIdx_++;
  }
  lastAskMissedIndexTime_ = GetMicrosecondTimestamp();
}

// Asks other replicas for the request bodies listed in missedReqKeys_.
// The keys are sent as MISSED_REQ_ASK messages of at most
// requestKeyTransferBatch_ keys each; full batches go to replicas chosen in
// round-robin order. The ask time of every key is recorded in
// askTimebyReqKey_.
//
// Does nothing when there are no missed keys, or when the previous
// missed-request ask was sent less than 50 microseconds ago.
void Replica::AskMissedRequest() {
  if (missedReqKeys_.empty()) {
    // no need to start timer
    return;
  }
  // Throttle against the last missed-REQUEST ask. (The index-ask timestamp
  // belongs to AskMissedIndex and must not gate this path.)
  uint64_t nowTime = GetMicrosecondTimestamp();
  if (lastAskMissedRequestTime_ + 50 > nowTime) {
    return;
  }
  bool sentAny = false;
  AskReq askReqMsg;
  askReqMsg.set_replicaid(this->replicaId_);
  for (const uint64_t& reqKey : missedReqKeys_) {
    askReqMsg.add_missedreqkeys(reqKey);
    if ((uint32_t)(askReqMsg.missedreqkeys_size()) >=
        requestKeyTransferBatch_) {
      reqRequester_->SendMsgTo(
          *(requestAskReceiver_[roundRobinRequestAskIdx_ % replicaNum_]),
          askReqMsg, MessageType::MISSED_REQ_ASK);
      sentAny = true;
      // Advance the cursor, stepping over this replica's own slot.
      roundRobinRequestAskIdx_++;
      if (roundRobinRequestAskIdx_ % replicaNum_ == replicaId_) {
        roundRobinRequestAskIdx_++;
      }
      askReqMsg.clear_missedreqkeys();
    }
    askTimebyReqKey_[reqKey] = GetMicrosecondTimestamp();
  }
  if (askReqMsg.missedreqkeys_size() > 0) {
    // NOTE(review): the trailing partial batch is addressed to
    // viewId_ % replicaNum_ rather than the round-robin target; kept as in
    // the original -- confirm this is intended.
    reqRequester_->SendMsgTo(*(requestAskReceiver_[viewId_ % replicaNum_]),
                             askReqMsg, MessageType::MISSED_REQ_ASK);
    sentAny = true;

    roundRobinRequestAskIdx_++;
    if (roundRobinRequestAskIdx_ % replicaNum_ == replicaId_) {
      roundRobinRequestAskIdx_++;
    }
  }
  if (sentAny) {
    // Refresh the throttle timestamp whenever anything was sent, including
    // the case where the keys filled whole batches exactly.
    lastAskMissedRequestTime_ = GetMicrosecondTimestamp();
  }
}

// Worker thread body for garbage collection.
// Counts itself in activeWorkerNum_ on entry. Until status_ becomes
// TERMINATED it repeatedly calls BlockWhenStatusIsNot(NORMAL) and then runs
// the three reclaim steps in order. On exit it removes itself from
// activeWorkerNum_.
void Replica::GarbageCollectThread() {
  activeWorkerNum_.fetch_add(1);
  for (;;) {
    if (status_ == ReplicaStatus::TERMINATED) {
      break;
    }
    BlockWhenStatusIsNot(ReplicaStatus::NORMAL);
    // Step 1: drop crash vectors that are no longer referenced.
    ReclaimStaleCrashVector();
    // Step 2: drop stale (unsynced) log entries.
    ReclaimStaleLogs();
    // Step 3: inspect late-buffer / unsynced-log items and try to advance
    // prepareToClearLateBufferLogId_ and prepareToClearUnSyncedLogId_.
    PrepareNextReclaim();
  }
  // fetch_sub yields the counter value from before the decrement.
  const uint32_t workersBefore = activeWorkerNum_.fetch_sub(1);
  LOG(INFO) << "GarbageCollectThread Terminated " << workersBefore - 1
            << " worker remaining";
}

void Replica::ReclaimStaleCrashVector() {
  uint32_t masterCVVersion = crashVectorInUse_[0].load()->version_;
  while (cvVersionToClear_ <= masterCVVersion) {
    bool canDelete = true;
    for (uint32_t i = 0; i < crashVectorVecSize_; i++) {
      if (crashVectorInUse_[i].load()->version_ <= cvVersionToClear_) {
        canDelete = false;
        break;
      }
    }
    if (canDelete) {
      CrashVectorStruct* cvToClear = crashVector_.get(cvVersionToClear_);
      crashVector_.erase(cvVersionToClear_);
      delete cvToClear;
      cvVersionToClear_++;
    } else {
      break;
    }
  }
}

// Computes the safe points below which unsynced and late-buffer entries
// could be reclaimed. The reclamation itself is not implemented here: the
// computed values are currently unused.
void Replica::ReclaimStaleLogs() {
  // Smallest safe-to-clear log ID over fastReplyQu_.size() + 1 slots, capped
  // by prepareToClearUnSyncedLogId_.
  uint32_t safePoint = prepareToClearUnSyncedLogId_;
  for (uint32_t slot = 0; slot < fastReplyQu_.size() + 1; slot++) {
    const uint32_t slotSafe = safeToClearUnSyncedLogId_[slot].load();
    if (slotSafe < safePoint) {
      safePoint = slotSafe;
    }
  }
  // Reclaim UnSynced Entries (not implemented)

  // Reclaim Entries in late-buffer (not implemented)
  safePoint = safeToClearLateBufferLogId_;
}

// Placeholder: currently a no-op. Called by GarbageCollectThread after
// ReclaimStaleLogs on every pass.
void Replica::PrepareNextReclaim() {}

// Periodic liveness check run on the master context.
//  - TERMINATED: breaks the master endpoint loop.
//  - Leader: nothing to check.
//  - Non-NORMAL status: starts a view change to viewId_ + 1.
//  - Otherwise: starts a view change to viewId_ + 1 if no heartbeat has been
//    recorded for longer than replicaConfig_.heartbeatThresholdMs.
void Replica::CheckHeartBeat() {
  if (status_ == ReplicaStatus::TERMINATED) {
    masterContext_->endPoint_->LoopBreak();
    return;
  }
  if (AmLeader()) {
    return;
  }
  if (status_ != ReplicaStatus::NORMAL) {
    // A worker thread noticed the view change and flipped status_ to
    // VIEWCHANGE, but workers are not allowed to bump viewId_ or start the
    // view change process themselves; the master does it here.
    VLOG(2) << "InitiateViewChange-10";
    InitiateViewChange(viewId_ + 1);
    return;
  }

  const uint64_t nowUs = GetMicrosecondTimestamp();
  // Configured in milliseconds, compared in microseconds.
  const uint64_t silenceLimitUs = replicaConfig_.heartbeatThresholdMs * 1000;
  const bool leaderSilentTooLong = lastHeartBeatTime_ + silenceLimitUs < nowUs;
  if (leaderSilentTooLong) {
    // No heartbeat from the leader for too long: it has probably died.
    VLOG(2) << "InitiateViewChange-1";
    InitiateViewChange(viewId_ + 1);
  }
}

// Dispatches a message received on the master endpoint to the matching
// Process* handler, based on msgHdr->msgType.
//
// msgHdr    - header of the received message (type and payload length).
// msgBuffer - serialized protobuf payload of msgHdr->msgLen bytes.
//
// A payload that fails to parse is dropped silently. An unknown message type
// is reported with a warning.
void Replica::ReceiveMasterMessage(MessageHeader* msgHdr, char* msgBuffer) {
  VLOG(4) << "msgType " << (uint32_t)(msgHdr->msgType);

  if (msgHdr->msgType == MessageType::VIEWCHANGE_REQ) {
    ViewChangeRequest viewChangeReq;
    if (viewChangeReq.ParseFromArray(msgBuffer, msgHdr->msgLen)) {
      ProcessViewChangeReq(viewChangeReq);
    }
  } else if (msgHdr->msgType == MessageType::VIEWCHANGE_MSG) {
    ViewChange viewChangeMsg;
    if (viewChangeMsg.ParseFromArray(msgBuffer, msgHdr->msgLen)) {
      ProcessViewChange(viewChangeMsg);
    }
  } else if (msgHdr->msgType == MessageType::STATE_TRANSFER_REQUEST) {
    StateTransferRequest stateTransferReq;
    if (stateTransferReq.ParseFromArray(msgBuffer, msgHdr->msgLen)) {
      ProcessStateTransferRequest(stateTransferReq);
    }
  } else if (msgHdr->msgType == MessageType::STATE_TRANSFER_REPLY) {
    StateTransferReply stateTransferRep;
    if (stateTransferRep.ParseFromArray(msgBuffer, msgHdr->msgLen)) {
      ProcessStateTransferReply(stateTransferRep);
    }
  } else if (msgHdr->msgType == MessageType::START_VIEW) {
    StartView startView;
    if (startView.ParseFromArray(msgBuffer, msgHdr->msgLen)) {
      ProcessStartView(startView);
    }
  } else if (msgHdr->msgType == MessageType::CRASH_VECTOR_REQUEST) {
    CrashVectorRequest request;
    if (request.ParseFromArray(msgBuffer, msgHdr->msgLen)) {
      ProcessCrashVectorRequest(request);
    }
  } else if (msgHdr->msgType == MessageType::CRASH_VECTOR_REPLY) {
    CrashVectorReply reply;
    if (reply.ParseFromArray(msgBuffer, msgHdr->msgLen)) {
      VLOG(2) << "CrashVectorReply = " << reply.DebugString();
      ProcessCrashVectorReply(reply);
    }
  } else if (msgHdr->msgType == MessageType::RECOVERY_REQUEST) {
    RecoveryRequest request;
    if (request.ParseFromArray(msgBuffer, msgHdr->msgLen)) {
      ProcessRecoveryRequest(request);
    }
  } else if (msgHdr->msgType == MessageType::RECOVERY_REPLY) {
    RecoveryReply reply;
    if (reply.ParseFromArray(msgBuffer, msgHdr->msgLen)) {
      ProcessRecoveryReply(reply);
    }
  } else if (msgHdr->msgType == MessageType::SYNC_STATUS_REPORT) {
    SyncStatusReport report;
    if (report.ParseFromArray(msgBuffer, msgHdr->msgLen)) {
      ProcessSyncStatusReport(report);
    }
  } else if (msgHdr->msgType == MessageType::COMMIT_INSTRUCTION) {
    CommitInstruction commit;
    if (commit.ParseFromArray(msgBuffer, msgHdr->msgLen)) {
      ProcessCommitInstruction(commit);
    }
  } else {
    // Report the type from the header. The payload's first byte is unrelated
    // to the message type and may not even exist when msgLen is 0.
    LOG(WARNING) << "Unexpected message type " << (uint32_t)(msgHdr->msgType);
  }
}

void Replica::SendViewChangeRequest(const int toReplicaId) {
  ViewChangeRequest viewChangeReq;
  viewChangeReq.set_view(viewId_);
  viewChangeReq.set_replicaid(replicaId_);
  CrashVectorStruct* cv = crashVectorInUse_[0].load();
  viewChangeReq.mutable_cv()->Add(cv->cv_.begin(), cv->cv_.end());

  if (toReplicaId < 0) {
    // send to all
    for (uint32_t i = 0; i < replicaNum_; i++) {
      if (i != replicaId_) {
        // no need to send to myself
        masterContext_->endPoint_->SendMsgTo(
            *(masterReceiver_[i]), viewChangeReq, MessageType::VIEWCHANGE_REQ);
      }
    }
  } else {
    masterContext_->endPoint_->SendMsgTo(*(masterReceiver_[toReplicaId]),
                                         viewChangeReq,
                                         MessageType::VIEWCHANGE_REQ);
  }
}

void Replica::SendViewChange() {
  if (AmLeader()) {
    // I am the leader of this new view, no need to send to myself
    return;
  }

  ViewChange viewChangeMsg;
  viewChangeMsg.set_view(viewId_);
  viewChangeMsg.set_replicaid(replicaId_);
  CrashVectorStruct* cv = crashVectorInUse_[0].load();
  viewChangeMsg.mutable_cv()->Add(cv->cv_.begin(), cv->cv_.end());
  viewChangeMsg.set_syncpoint(maxSyncedLogEntry_.load()->logId);
  if (filteredUnSyncedEntries_.size() > 1) {
    viewChangeMsg.set_unsynclogbegin(1);
    viewChangeMsg.set_unsynclogend(filteredUnSyncedEntries_.size() - 1);
  } else {
    viewChangeMsg.set_unsynclogbegin(0);
    viewChangeMsg.set_unsynclogend(0);
  }

  viewChangeMsg.set_lastnormalview(lastNormalView_);
  masterContext_->endPoint_->SendMsgTo(
      *(masterReceiver_[viewId_ % replicaNum_]), viewChangeMsg,
      MessageType::VIEWCHANGE_MSG);
}

void Replica::InitiateViewChange(const uint32_t view) {
  if (viewId_ > view) {
    LOG(ERROR) << "Invalid view change initiation currentView=" << viewId_
               << "\ttargetView=" << view;
    return;
  }

  if (viewId_ == view && status_ == ReplicaStatus::VIEWCHANGE) {
    // Already in viewchange
    return;
  }

  status_ = ReplicaStatus::VIEWCHANGE;
  LOG(INFO) << "status =" << (int)status_ << "\t"
            << " view=" << viewId_ << "\t"
            << " targeting view=" << view;

  // Wait until every worker stop
  while (activeWorkerNum_ > 0) {
    usleep(1000);
  }

  /** Since the update of syncedLogEntryByReqKey_ and syncedLogEntryByLogId_
   * may have not been completed when they encounter view change, let's first
   * complete (flush) them */
  LogEntry* minTrackedEntry = trackedEntry_[0];
  for (uint32_t i = 0; i < trackedEntry_.size(); i++) {
    if (minTrackedEntry->logId > trackedEntry_[i]->logId) {
      minTrackedEntry = trackedEntry_[i];
    }
  }
  while (minTrackedEntry->next) {
    LogEntry* next = minTrackedEntry->next;
    if (syncedLogEntryByLogId_.get(next->logId) == NULL) {
      syncedLogEntryByLogId_.assign(next->logId, next);
      syncedLogEntryByReqKey_.assign(next->body.reqKey, next);
    }
    minTrackedEntry = next;
  }
  trackedEntry_.assign(trackedEntry_.size(), minTrackedEntry);

  LogEntry* entryStart = minUnSyncedLogEntry_;
  if (entryStart->logId < CONCURRENT_MAP_START_INDEX) {
    // This is dummy, move to its next;
    entryStart = entryStart->next;
  }
  filteredUnSyncedEntries_.clear();
  filteredUnSyncedEntries_.resize(
      1);  // Reserve 1 slot as dummy value [because 0 has special use]
  while (entryStart) {
    LogEntry* entry = syncedLogEntryByReqKey_.get(entryStart->body.reqKey);
    if (!entry) {
      // Has not been synced
      filteredUnSyncedEntries_.push_back(entryStart);
    }
    entryStart = entryStart->next;
  }

  viewId_ = view;
  // Unregister all timers, except the monitorTimer (so as the master thread
  // can break when status=Terminated)
  masterContext_->endPoint_->UnRegisterAllTimers();
  masterContext_->endPoint_->RegisterTimer(masterContext_->monitorTimer_);
  LOG(INFO) << "Monitor Timer Registered "
            << "viewId=" << viewId_ << "\t"
            << "maxSyncedLogId=" << maxSyncedLogEntry_.load()->logId << "\t"
            << "committedLogId=" << committedLogId_ << "\t"
            << "filteredUnSyncedEntries_.size()="
            << filteredUnSyncedEntries_.size() << "\t"
            << "currentTime=" << GetMicrosecondTimestamp() << "\t";
  // Launch viewChange timer
  masterContext_->endPoint_->RegisterTimer(viewChangeTimer_);
}

// Body of the view-change timer. While the replica is not NORMAL it
// broadcasts a VIEW-CHANGE-REQ to all replicas and sends its VIEW-CHANGE to
// the leader of the current view; once the replica is NORMAL the timer
// unregisters itself.
void Replica::BroadcastViewChange() {
  if (status_ != ReplicaStatus::NORMAL) {
    // -1 selects "all replicas".
    SendViewChangeRequest(-1);
    SendViewChange();
    return;
  }
  // Back to normal: this timer is no longer needed.
  masterContext_->endPoint_->UnRegisterTimer(viewChangeTimer_);
}

void Replica::SendStartView(const int toReplicaId) {
  StartView startView;
  startView.set_replicaid(replicaId_);
  startView.set_view(viewId_);
  CrashVectorStruct* cv = crashVectorInUse_[0];
  startView.mutable_cv()->Add(cv->cv_.begin(), cv->cv_.end());
  // startView.set_syncedlogid(maxSyncedLogId_);
  startView.set_syncedlogid(maxSyncedLogEntry_.load()->logId);
  if (toReplicaId >= 0) {
    // send to one
    masterContext_->endPoint_->SendMsgTo(*(masterReceiver_[toReplicaId]),
                                         startView, MessageType::START_VIEW);
  } else {
    // send to all
    for (uint32_t i = 0; i < replicaNum_; i++) {
      if (i == replicaId_) {
        // No need to send to self
        continue;
      }
      masterContext_->endPoint_->SendMsgTo(*(masterReceiver_[i]), startView,
                                           MessageType::START_VIEW);
      VLOG(2) << "Send StartView to " << i << "\t"
              << masterReceiver_[i]->GetIPAsString() << ":"
              << masterReceiver_[i]->GetPortAsInt();
    }
  }
}

void Replica::SendSyncStatusReport() {
  SyncStatusReport report;
  report.set_view(viewId_);
  report.set_replicaid(replicaId_);
  CrashVectorStruct* cv = crashVectorInUse_[0].load();
  report.mutable_cv()->Add(cv->cv_.begin(), cv->cv_.end());
  // report.set_syncedlogid(maxSyncedLogId_);
  report.set_syncedlogid(maxSyncedLogEntry_.load()->logId);
  if (AmLeader()) {
    // leader directly process its own report
    ProcessSyncStatusReport(report);
  } else {
    // send to leader
    masterContext_->endPoint_->SendMsgTo(
        *(masterReceiver_[viewId_ % replicaNum_]), report,
        MessageType::SYNC_STATUS_REPORT);
  }
}

void Replica::SendCommit() {
  CommitInstruction commit;
  commit.set_view(viewId_);
  commit.set_replicaid(replicaId_);
  CrashVectorStruct* cv = crashVectorInUse_[0].load();
  commit.mutable_cv()->Add(cv->cv_.begin(), cv->cv_.end());
  commit.set_committedlogid(committedLogId_);
  // LOG(INFO) << "commit " << commit.DebugString();
  for (uint32_t i = 0; i < replicaNum_; i++) {
    if (i != replicaId_) {
      masterContext_->endPoint_->SendMsgTo(*(masterReceiver_[i]), commit,
                                           MessageType::COMMIT_INSTRUCTION);
    }
  }
}

// Handles a VIEW-CHANGE-REQ from another replica.
// Ignored while RECOVERING or when the sender's crash vector fails CheckCV.
// Otherwise the crash vector is aggregated and then:
//  - a newer view triggers InitiateViewChange for that view;
//  - else a NORMAL replica answers with START_VIEW to the sender;
//  - else this replica (re)sends its own VIEW-CHANGE.
//
// viewChangeReq - the parsed request.
void Replica::ProcessViewChangeReq(const ViewChangeRequest& viewChangeReq) {
  if (status_ == ReplicaStatus::RECOVERING) {
    // Recovering replicas stay out of view changes.
    return;
  }
  if (!CheckCV(viewChangeReq.replicaid(), viewChangeReq.cv())) {
    // Stray message.
    return;
  }

  if (Aggregated(viewChangeReq.cv())) {
    // The crash vector moved forward, so messages already collected in
    // viewChangeSet_ may have become stray: purge those.
    for (uint32_t rid = 0; rid < replicaNum_; rid++) {
      auto found = viewChangeSet_.find(rid);
      if (found == viewChangeSet_.end()) {
        continue;
      }
      if (!CheckCV(rid, found->second.cv())) {
        viewChangeSet_.erase(rid);
      }
    }
  }

  if (viewChangeReq.view() > viewId_) {
    VLOG(2) << "InitiateViewChange-2";
    InitiateViewChange(viewChangeReq.view());
    return;
  }

  if (status_ == ReplicaStatus::NORMAL) {
    SendStartView(viewChangeReq.replicaid());
  } else {
    SendViewChange();
  }
}

// Handles a VIEW-CHANGE message from another replica.
//
// Ignored while RECOVERING or when the sender's crash vector fails CheckCV.
// After aggregating the crash vector:
//  - NORMAL status: a newer view starts a view change; otherwise the sender
//    is answered with START_VIEW.
//  - VIEWCHANGE status: a newer view restarts the view change for that view;
//    an older view is answered with a VIEW-CHANGE-REQ; a message for the
//    current view is stored in viewChangeSet_. Once replicaNum_ / 2 messages
//    from other replicas are held, this replica adds its own entry, stops the
//    view-change timer and calls TransferSyncedLog().
//  - Any other status is logged as unexpected.
//
// viewChange - the parsed message.
void Replica::ProcessViewChange(const ViewChange& viewChange) {
  // LOG(INFO) << "viewChange: " << viewChange.DebugString();
  if (status_ == ReplicaStatus::RECOVERING) {
    // Recovering replicas do not participate in view change
    return;
  }
  if (!CheckCV(viewChange.replicaid(), viewChange.cv())) {
    // stray message
    LOG(WARNING) << "Stray Message";
    return;
  }

  // Merge the sender's crash vector; the return value is not needed here.
  Aggregated(viewChange.cv());

  if (status_ == ReplicaStatus::NORMAL) {
    if (viewChange.view() > viewId_) {
      VLOG(2) << "InitiateViewChange-3";
      InitiateViewChange(viewChange.view());
    } else {
      // The sender lags behind
      SendStartView(viewChange.replicaid());
    }
  } else if (status_ == ReplicaStatus::VIEWCHANGE) {
    if (viewChange.view() > viewId_) {
      VLOG(2) << "InitiateViewChange-4";
      InitiateViewChange(viewChange.view());
    } else if (viewChange.view() < viewId_) {
      // The sender is in an older view: ask it to catch up.
      SendViewChangeRequest(viewChange.replicaid());
    }
    // viewChange.view() == viewId
    else if (viewChangeSet_.size() >= replicaNum_ / 2 + 1) {
      // We have got enough valid viewchange messages, no need for this one
      return;
    } else {
      // Only the leader of the target view is expected to collect these.
      ASSERT(AmLeader());
      viewChangeSet_[viewChange.replicaid()] = viewChange;
      VLOG(3) << "viewChangeSet Size=" << viewChangeSet_.size();
      // If cv is updated, then it is likely that some messages in
      // viewChangeSet_ become stray, so remove them
      for (uint32_t i = 0; i < replicaNum_; i++) {
        auto iter = viewChangeSet_.find(i);
        if (iter != viewChangeSet_.end() && (!CheckCV(i, iter->second.cv()))) {
          viewChangeSet_.erase(i);
        }
      }
      if (viewChangeSet_.size() >= replicaNum_ / 2) {
        ASSERT(viewChangeSet_.find(replicaId_) == viewChangeSet_.end());
        // Got f viewChange
        // Plus myself, got f+1 viewChange messages
        ViewChange myvc;
        CrashVectorStruct* masterCV = crashVectorInUse_[0].load();
        myvc.mutable_cv()->Add(masterCV->cv_.begin(), masterCV->cv_.end());
        myvc.set_view(viewId_);
        myvc.set_replicaid(replicaId_);
        // myvc.set_syncpoint(maxSyncedLogId_);
        // myvc.set_unsynclogbegin(minUnSyncedLogId_);
        // myvc.set_unsynclogend(maxUnSyncedLogId_);
        myvc.set_syncpoint(maxSyncedLogEntry_.load()->logId);
        // Slot 0 of filteredUnSyncedEntries_ is a dummy; (0, 0) means "no
        // unsynced entries".
        if (filteredUnSyncedEntries_.size() > 1) {
          myvc.set_unsynclogbegin(1);
          myvc.set_unsynclogend(filteredUnSyncedEntries_.size() - 1);
        } else {
          myvc.set_unsynclogbegin(0);
          myvc.set_unsynclogend(0);
        }

        myvc.set_lastnormalview(lastNormalView_);
        viewChangeSet_[replicaId_] = myvc;
        // Has got enough viewChange messages, stop viewChangeTimer
        masterContext_->endPoint_->UnRegisterTimer(viewChangeTimer_);
        TransferSyncedLog();
      }
    }
  } else {
    LOG(WARNING) << "Unexpected Status " << status_;
  }
}

void Replica::TransferSyncedLog() {
  uint32_t largestNormalView = lastNormalView_;
  uint32_t maxSyncedLogId = maxSyncedLogEntry_.load()->logId;
  uint32_t largestSyncPoint = maxSyncedLogId;
  uint32_t targetReplicaId = replicaId_;
  transferSyncedEntry_ = true;
  for (auto& kv : viewChangeSet_) {
    if (largestNormalView < kv.second.lastnormalview()) {
      largestNormalView = kv.second.lastnormalview();
    }
  }
  for (auto& kv : viewChangeSet_) {
    if (kv.second.lastnormalview() == largestNormalView &&
        largestSyncPoint < kv.second.syncpoint()) {
      largestSyncPoint = kv.second.syncpoint();
      targetReplicaId = kv.second.replicaid();
    }
  }

  stateTransferIndices_.clear();
  VLOG(3) << "maxSyncedLogId_=" << maxSyncedLogId << "\t"
          << "largestSyncPoint=" << largestSyncPoint << "\t"
          << "largestNormalView = " << largestNormalView << "\t"
          << "lastNormalView_=" << lastNormalView_;
  // Directly copy the synced entries
  if (largestNormalView == lastNormalView_) {
    if (maxSyncedLogId < largestSyncPoint) {
      stateTransferIndices_[targetReplicaId] = {maxSyncedLogId + 1,
                                                largestSyncPoint};
    }
    // Else: no need to do state transfer, because this replica has all synced
    // entries
  } else {
    stateTransferIndices_[targetReplicaId] = {committedLogId_ + 1,
                                              largestSyncPoint};
  }

  if (!stateTransferIndices_.empty()) {
    // Start state transfer
    // After this state transfer has been completed, continue to execute the
    // callback (MergeUnsyncedLog)

    stateTransferCallback_ = std::bind(&Replica::TransferUnSyncedLog, this);
    stateTransferTerminateTime_ =
        GetMicrosecondTimestamp() + stateTransferTimeout_ * 1000;
    stateTransferTerminateCallback_ =
        std::bind(&Replica::RollbackToViewChange, this);
    LOG(INFO) << "Start state transfer targetReplica " << targetReplicaId
              << "\t"
              << "seg=" << stateTransferIndices_[targetReplicaId].first << "\t"
              << stateTransferIndices_[targetReplicaId].second;
    // Start the state tranfer timer
    masterContext_->endPoint_->RegisterTimer(stateTransferTimer_);
  } else {
    // Directly go to the second stage: transfer unsynced log
    TransferUnSyncedLog();
  }
}

void Replica::TransferUnSyncedLog() {
  // Get the unsynced logs from the f+1 remaining replicas
  // If this process cannot be completed, rollback to view change
  uint32_t largestNormalView = lastNormalView_;
  transferSyncedEntry_ = false;
  for (auto& kv : viewChangeSet_) {
    if (largestNormalView < kv.second.lastnormalview()) {
      largestNormalView = kv.second.lastnormalview();
    }
  }
  VLOG(3) << "TransferUnSyncedLog largestNormalView=" << largestNormalView;

  stateTransferIndices_.clear();
  for (auto& kv : viewChangeSet_) {
    if (kv.second.lastnormalview() < largestNormalView) {
      // No need to transfer log, this guy's unsynced logs do not contribute
      // to committed logs
      continue;
    }
    if (kv.first == replicaId_) {
      // No need to transfer log entries from self
      continue;
    }
    if (kv.second.unsynclogbegin() == 0 && kv.second.unsynclogend() == 0) {
      // This replica has no unsynced logs
      continue;
    }
    // request transfer of the filteredUnSyncedRequests vec
    stateTransferIndices_[kv.first] = {kv.second.unsynclogbegin(),
                                       kv.second.unsynclogend()};
  }
  if (stateTransferIndices_.empty()) {
    // No need to do state transfer for unsynced logs
    // Directly go to new view
    EnterNewView();
    return;
  }
  // After this state transfer is completed, this replica will enter the new
  // view
  stateTransferCallback_ = std::bind(&Replica::MergeUnSyncedLog, this);
  stateTransferTerminateTime_ =
      GetMicrosecondTimestamp() + stateTransferTimeout_ * 1000;
  stateTransferTerminateCallback_ =
      std::bind(&Replica::RollbackToViewChange, this);
  masterContext_->endPoint_->RegisterTimer(stateTransferTimer_);
}

// Merges the unsynced requests collected in requestsToMerge_ into the synced
// log and then enters the new view.
//
// A request is appended (via ProcessRequest) and indexed by reqKey and log ID
// only if it was reported at least `quorum` times, where quorum is
// ceil(f / 2) + 1 and f = replicaNum_ / 2, and it is not already synced.
// Every other collected entry is freed here, because requestsToMerge_ is
// cleared afterwards and nothing else would release it.
void Replica::MergeUnSyncedLog() {
  int f = replicaNum_ / 2;
  int quorum = (f % 2 == 0) ? (f / 2 + 1) : (f / 2 + 2);
  for (auto& kv : requestsToMerge_) {
    uint64_t reqKey = kv.first.second;
    LogEntry* entry = kv.second.first;
    int count = kv.second.second;
    if (count < quorum) {
      // Not enough replicas reported this request: discard it instead of
      // leaking it when the map is cleared below.
      delete entry;
      continue;
    }
    if (syncedLogEntryByReqKey_.get(reqKey)) {
      // at-most once
      delete entry;
      continue;
    }
    ProcessRequest(entry, true, false, true);
    syncedLogEntryByReqKey_.assign(reqKey, entry);
    syncedLogEntryByLogId_.assign(entry->logId, entry);
  }
  requestsToMerge_.clear();
  EnterNewView();
}

// Completes a view change: the leader broadcasts START_VIEW, the replica
// switches to NORMAL, records viewId_ as its last normal view, aligns every
// crash-vector slot with slot 0, resets its context and keeps notifying the
// waiting workers until all totalWorkerNum_ of them are active again.
void Replica::EnterNewView() {
  LOG(INFO) << "Enter New View " << viewId_
            << " maxSyncedLog =" << maxSyncedLogEntry_.load()->logId << "\t"
            << GetMicrosecondTimestamp();
  if (AmLeader()) {
    // -1 selects "all other replicas"; followers simply start.
    SendStartView(-1);
  }

  status_ = ReplicaStatus::NORMAL;
  lastNormalView_.store(viewId_);

  // Bring every crash-vector slot in line with the master slot (index 0).
  CrashVectorStruct* masterCV = crashVectorInUse_[0].load();
  uint32_t slot = 1;
  while (slot < crashVectorVecSize_) {
    crashVectorInUse_[slot] = masterCV;
    slot++;
  }
  crashVector_.assign(masterCV->version_, masterCV);

  // Cheaper than rebuilding the context with CreateContext.
  ResetContext();

  // Keep waking the blocked workers until all of them are active.
  while (activeWorkerNum_ < totalWorkerNum_) {
    waitVar_.notify_all();
    usleep(1000);
  }

  LOG(INFO) << "View=" << viewId_
            << " Recovered worker number:" << activeWorkerNum_;
}

void Replica::SendStateTransferRequest() {
  if (GetMicrosecondTimestamp() >= stateTransferTerminateTime_) {
    // If statetransfer cannot be completed within a certain amount of time,
    // rollback to view change
    masterContext_->endPoint_->UnRegisterTimer(stateTransferTimer_);
    LOG(INFO)
        << "The state transfer takes too long, roll back to previous step ";
    stateTransferTerminateCallback_();
    return;
  }

  StateTransferRequest request;
  request.set_view(viewId_);
  request.set_issynced(transferSyncedEntry_);
  request.set_replicaid(replicaId_);
  for (auto& stateTransferInfo : stateTransferIndices_) {
    // Do not request too many entries at one time, otherwise, UDP packet
    // cannot handle that
    uint32_t targetReplica = stateTransferInfo.first;
    uint32_t logBegin = stateTransferInfo.second.first;
    uint32_t logEnd = stateTransferInfo.second.second;

    request.set_logbegin(logBegin);
    if (logBegin + requestTrasnferBatch_ <= logEnd) {
      request.set_logend(logBegin + requestTrasnferBatch_);
    } else {
      request.set_logend(logEnd);
    }

    VLOG(3) << "I am asking stateTransferRequest from " << targetReplica << "\t"
            << request.logbegin() << "\t" << request.logend() << "\t"
            << "\tisSynced=" << request.issynced();
    masterContext_->endPoint_->SendMsgTo(*(masterReceiver_[targetReplica]),
                                         request,
                                         MessageType::STATE_TRANSFER_REQUEST);
  }
}

void Replica::ProcessStateTransferRequest(
    const StateTransferRequest& stateTransferRequest) {
  VLOG(3) << "stateTransferRequest from Replica-"
          << stateTransferRequest.replicaid() << "\t||"
          << stateTransferRequest.logbegin() << "\t"
          << stateTransferRequest.logend() << "\tisSynced "
          << stateTransferRequest.issynced()
          << " view=" << stateTransferRequest.view();

  if (stateTransferRequest.view() != viewId_) {
    if (stateTransferRequest.view() > viewId_) {
      VLOG(2) << "InitiateViewChange-5";
      InitiateViewChange(stateTransferRequest.view());
    }
    return;
  }
  StateTransferReply reply;
  CrashVectorStruct* cv = crashVectorInUse_[0].load();
  const Address* requesterAddr =
      masterReceiver_[stateTransferRequest.replicaid()];
  reply.set_replicaid(replicaId_);
  reply.set_view(viewId_);
  reply.mutable_cv()->Add(cv->cv_.begin(), cv->cv_.end());
  reply.set_issynced(stateTransferRequest.issynced());

  if (reply.issynced()) {
    reply.set_logbegin(stateTransferRequest.logbegin());
    ASSERT(maxSyncedLogEntry_.load()->logId >= stateTransferRequest.logend());
    for (uint32_t j = stateTransferRequest.logbegin();
         j <= stateTransferRequest.logend(); j++) {
      LogEntry* entry = syncedLogEntryByLogId_.get(j);
      if (entry) {
        RequestBodyToMessage(entry->body, reply.add_reqs());
        reply.set_logend(j);
      } else {
        LOG(WARNING) << "Maybe just due to lag "
                     << stateTransferRequest.logend() << ">" << reply.logend();
        break;
      }
    }
    VLOG(3) << "State Reply " << reply.logbegin() << "--" << reply.logend();
  } else {
    reply.set_logbegin(stateTransferRequest.logbegin());
    reply.set_logend(stateTransferRequest.logend());
    ASSERT(filteredUnSyncedEntries_.size() > reply.logend());
    for (uint32_t j = reply.logbegin(); j <= reply.logend(); j++) {
      LogEntry* entry = filteredUnSyncedEntries_[j];
      ASSERT(entry != NULL);
      RequestBodyToMessage(entry->body, reply.add_reqs());
    }
    VLOG(3) << "Give " << reply.logbegin() << "-" << reply.logend();
  }
  if (reply.reqs_size() > 0) {
    masterContext_->endPoint_->SendMsgTo(*requesterAddr, reply,
                                         MessageType::STATE_TRANSFER_REPLY);
  }
}

void Replica::ProcessStateTransferReply(
    const StateTransferReply& stateTransferReply) {
  VLOG(3) << "Receive some state " << stateTransferReply.logbegin() << "--"
          << stateTransferReply.logend()
          << " view=" << stateTransferReply.view() << "--- "
          << transferSyncedEntry_ << "==" << stateTransferReply.issynced();
  if (status_ == ReplicaStatus::NORMAL) {
    // Normal replicas do not need state transfer
    return;
  }
  if (!CheckCV(stateTransferReply.replicaid(), stateTransferReply.cv())) {
    return;
  } else {
    Aggregated(stateTransferReply.cv());
  }

  if (!(masterContext_->endPoint_->isTimerRegistered(stateTransferTimer_))) {
    // We are not doing state transfer, so ignore this message
    return;
  }

  if (stateTransferReply.view() < viewId_) {
    // Old view: ignore
    return;
  } else if (stateTransferReply.view() > viewId_) {
    masterContext_->endPoint_->UnRegisterTimer(stateTransferTimer_);
    if (status_ == ReplicaStatus::RECOVERING) {
      // This state transfer is useless, stop it and restart recovery request
      masterContext_->endPoint_->RegisterTimer(recoveryRequestTimer_);
    } else if (status_ == ReplicaStatus::VIEWCHANGE) {
      VLOG(2) << "InitiateViewChange-6";
      InitiateViewChange(stateTransferReply.view());
    } else {
      LOG(ERROR) << "Unknown replica status " << (uint32_t)status_;
    }
    return;
  }

  // Else: Same view
  if (transferSyncedEntry_ != stateTransferReply.issynced()) {
    return;
  }

  const auto& iter = stateTransferIndices_.find(stateTransferReply.replicaid());
  if (iter == stateTransferIndices_.end() ||
      stateTransferReply.logend() < iter->second.first) {
    // We do not need these log entries
    return;
  }

  // So long as the state transfer is making progress, we should give it more
  // time instead of early termination
  // Only if the state transfer has not made progress within
  // stateTransferTimeout_. then we terminate it and rollback to some previous
  // function
  stateTransferTerminateTime_ =
      GetMicrosecondTimestamp() + +stateTransferTimeout_ * 1000;
  SHA_HASH dummy;
  if (stateTransferReply.issynced()) {
    // This is the state-transfer for synced requests
    for (uint32_t i = iter->second.first; i <= stateTransferReply.logend();
         i++) {
      const RequestBodyMsg& rbMsg =
          stateTransferReply.reqs(i - iter->second.first);
      LogEntry* entry = new LogEntry(
          rbMsg.deadline(), rbMsg.reqkey(), rbMsg.key(), rbMsg.proxyid(),
          rbMsg.command(), rbMsg.iswrite(), dummy, dummy);
      ProcessRequest(entry, true, false, false);
      // LOG(INFO) << "Processed " << entry->logId << "\t"
      //           << maxSyncedLogEntry_.load()->logId;
      // Register
      if (syncedLogEntryByReqKey_.get(entry->body.reqKey) == NULL) {
        syncedLogEntryByReqKey_.assign(entry->body.reqKey, entry);
        syncedLogEntryByLogId_.assign(entry->logId, entry);
        if (entry->logId > CONCURRENT_MAP_START_INDEX) {
          ASSERT(syncedLogEntryByLogId_.get(entry->logId - 1) != NULL);
          ASSERT(syncedLogEntryByLogId_.get(entry->logId - 1) == entry->prev);
        }
      }
    }
  } else {
    // This is the state-transfer for unsynced request (log merge)
    for (int i = 0; i < stateTransferReply.reqs_size(); i++) {
      const RequestBodyMsg& rbMsg = stateTransferReply.reqs(i);
      std::pair<uint64_t, uint64_t> key(rbMsg.deadline(), rbMsg.reqkey());
      if (requestsToMerge_.find(key) != requestsToMerge_.end()) {
        LogEntry* entry = new LogEntry(
            rbMsg.deadline(), rbMsg.reqkey(), rbMsg.key(), rbMsg.proxyid(),
            rbMsg.command(), rbMsg.iswrite(), dummy, dummy);

        requestsToMerge_[key] = {entry, 1};
      } else {
        requestsToMerge_[key].second++;
      }
    }
  }

  iter->second.first = stateTransferReply.logend() + 1;
  VLOG(2) << "Transfer Synced? " << stateTransferReply.issynced() << "\t"
          << " In Progress: " << iter->first << ":" << iter->second.first << "-"
          << iter->second.second;

  uint32_t remainingPercent =
      stateTransferIndicesRef_[stateTransferReply.replicaid()].second;
  if (remainingPercent > 10) {
    uint32_t previousGap =
        stateTransferIndicesRef_[stateTransferReply.replicaid()].first;
    uint32_t remainingGap = iter->second.second - iter->second.first;
    if (remainingGap * 100 / previousGap < remainingPercent) {
      LOG(INFO) << "State Tranfer from Replica "
                << stateTransferReply.replicaid() << "\t" << remainingPercent
                << "\% of progress (i.e., " << remainingGap
                << " logs) remaining\t"
                << "Current committedLogId_=" << committedLogId_
                << "\tmaxSyncedLogId=" << maxSyncedLogEntry_.load()->logId;
      ;
      stateTransferIndicesRef_[stateTransferReply.replicaid()].second -= 10;
    }
  }

  if (iter->second.first > iter->second.second) {
    // We have completed the state transfer for this target replica
    stateTransferIndices_.erase(iter->first);
  }

  if (stateTransferIndices_.empty()) {
    // This state transfer is completed, unregister the timer
    masterContext_->endPoint_->UnRegisterTimer(stateTransferTimer_);
    stateTransferIndices_.clear();
    stateTransferIndicesRef_.clear();
    // If we have a callback, then call it
    if (stateTransferCallback_) {
      stateTransferCallback_();
    }
  }
}

void Replica::RewindSyncedLogTo(uint32_t rewindPoint) {
  LOG(INFO) << "maxSyncedLogId_=" << maxSyncedLogEntry_.load()->logId << "\t"
            << "rewindPoint=" << rewindPoint;
  LogEntry* entryStart = maxSyncedLogEntry_;
  while (entryStart->logId > rewindPoint) {
    LogEntry* entryToDel = entryStart;
    if (entryToDel->prevNonCommutative) {
      entryToDel->prevNonCommutative->nextNonCommutative = NULL;
    }
    if (entryToDel->prev) {
      entryToDel->prev->next = NULL;
    }
    ASSERT(entryStart->prev != NULL);
    syncedLogEntryByReqKey_.erase(entryToDel->body.reqKey);
    syncedLogEntryByLogId_.erase(entryToDel->logId);
    entryStart = entryStart->prev;
    delete entryToDel;
  }
  entryStart->next = NULL;
  entryStart->nextNonCommutative = NULL;
  maxSyncedLogEntry_ = entryStart;
  trackedEntry_.assign(trackedEntry_.size(), maxSyncedLogEntry_);
}

void Replica::ProcessStartView(const StartView& startView) {
  VLOG(3) << startView.DebugString();

  if (!CheckCV(startView.replicaid(), startView.cv())) {
    return;
  } else {
    Aggregated(startView.cv());
  }

  if (status_ == ReplicaStatus::VIEWCHANGE) {
    if (startView.view() > viewId_) {
      VLOG(2) << "InitiateViewChange-7";
      InitiateViewChange(startView.view());
    } else if (startView.view() == viewId_) {
      if (committedLogId_ < startView.syncedlogid()) {
        // Start StateTransfer
        if (masterContext_->endPoint_->isTimerRegistered(stateTransferTimer_)) {
          // LOG(INFO) << "StateTransfer In Progress:"
          //           << stateTransferIndices_[startView.replicaid()].first
          //           << "--"
          //           << stateTransferIndices_[startView.replicaid()].second;
          return;
        }
        RewindSyncedLogTo(committedLogId_);
        stateTransferIndices_.clear();
        stateTransferIndicesRef_[startView.replicaid()] = {committedLogId_ + 1,
                                                           100};
        stateTransferIndices_[startView.replicaid()] = {
            committedLogId_ + 1, startView.syncedlogid()};
        stateTransferIndicesRef_[startView.replicaid()] = {committedLogId_ + 1,
                                                           100};
        stateTransferCallback_ = std::bind(&Replica::EnterNewView, this);
        stateTransferTerminateTime_ =
            GetMicrosecondTimestamp() + stateTransferTimeout_ * 1000;
        stateTransferTerminateCallback_ =
            std::bind(&Replica::RollbackToViewChange, this);

        transferSyncedEntry_ = true;
        masterContext_->endPoint_->RegisterTimer(stateTransferTimer_);
      } else {
        RewindSyncedLogTo(committedLogId_);
        EnterNewView();
      }

    }  // else: startView.view()<viewId_, old message, ignore it
  } else if (status_ == ReplicaStatus::NORMAL) {
    if (startView.view() > viewId_) {
      VLOG(2) << "InitiateViewChange-8";
      InitiateViewChange(startView.view());
    } else if (startView.view() < viewId_) {
      // My view is fresher
      SendStartView(startView.replicaid());
    }
    // Else: We are in the same view and this replica is normal, no need
    // startView
  }
  // If status == RECOVERING, it does not participate in view change
}

void Replica::BroadcastCrashVectorRequest() {
  CrashVectorRequest request;
  boost::uuids::random_generator generator;
  boost::uuids::uuid uuid = generator();
  nonce_ = boost::uuids::to_string(uuid);
  request.set_nonce(nonce_);
  request.set_replicaid(replicaId_);
  crashVectorReplySet_.clear();
  for (uint32_t i = 0; i < replicaNum_; i++) {
    if (i == replicaId_) {
      continue;
    }
    LOG(INFO) << "Ask CrashVector to Replica " << i;
    masterContext_->endPoint_->SendMsgTo(*(masterReceiver_[i]), request,
                                         MessageType::CRASH_VECTOR_REQUEST);
  }
}

void Replica::BroadcastRecoveryRequest() {
  RecoveryRequest request;
  CrashVectorStruct* cv = crashVectorInUse_[0].load();
  request.mutable_cv()->Add(cv->cv_.begin(), cv->cv_.end());
  request.set_replicaid(replicaId_);
  for (uint32_t i = 0; i < replicaNum_; i++) {
    if (i == replicaId_) {
      continue;
    }
    masterContext_->endPoint_->SendMsgTo(*(masterReceiver_[i]), request,
                                         MessageType::RECOVERY_REQUEST);
  }
}

void Replica::ProcessCrashVectorRequest(const CrashVectorRequest& request) {
  if (status_ != ReplicaStatus::NORMAL) {
    return;
  }

  CrashVectorReply reply;
  reply.set_nonce(request.nonce());
  reply.set_replicaid(replicaId_);
  CrashVectorStruct* cv = crashVectorInUse_[0].load();
  reply.mutable_cv()->Add(cv->cv_.begin(), cv->cv_.end());
  masterContext_->endPoint_->SendMsgTo(*(masterReceiver_[request.replicaid()]),
                                       reply, MessageType::CRASH_VECTOR_REPLY);
}

void Replica::ProcessCrashVectorReply(const CrashVectorReply& reply) {
  if (status_ != ReplicaStatus::RECOVERING) {
    LOG(INFO) << "nolong Recovering " << status_;
    return;
  }

  if (nonce_ != reply.nonce()) {
    LOG(INFO) << "nonce inconistent " << crashVectorReplySet_.size();
    return;
  }

  if (masterContext_->endPoint_->isTimerRegistered(crashVectorRequestTimer_) ==
      false) {
    // We no longer request crash vectors
    LOG(INFO) << "no longer register crashVectorRequest "
              << crashVectorReplySet_.size();
    return;
  }

  crashVectorReplySet_[reply.replicaid()] = reply;

  if (crashVectorReplySet_.size() >= replicaNum_ / 2 + 1) {
    // Got enough quorum
    CrashVectorStruct* oldCV = crashVectorInUse_[0].load();
    CrashVectorStruct* newCV = new CrashVectorStruct(*oldCV);
    newCV->version_++;
    for (const auto& kv : crashVectorReplySet_) {
      for (uint32_t i = 0; i < replicaNum_; i++) {
        if (kv.second.cv(i) > newCV->cv_[i]) {
          newCV->cv_[i] = kv.second.cv(i);
        }
      }
    }
    // Increment self counter
    newCV->cv_[replicaId_]++;
    crashVector_.assign(newCV->version_, newCV);
    for (uint32_t i = 0; i < crashVectorVecSize_; i++) {
      crashVectorInUse_[i] = newCV;
    }
    masterContext_->endPoint_->UnRegisterTimer(crashVectorRequestTimer_);
    crashVectorReplySet_.clear();

    // Start Recovery Request
    masterContext_->endPoint_->RegisterTimer(recoveryRequestTimer_);
  }
}

void Replica::ProcessRecoveryRequest(const RecoveryRequest& request) {
  if (status_ != ReplicaStatus::NORMAL) {
    return;
  }

  if (!CheckCV(request.replicaid(), request.cv())) {
    return;
  } else {
    Aggregated(request.cv());
  }

  RecoveryReply reply;
  CrashVectorStruct* cv = crashVectorInUse_[0].load();
  reply.set_replicaid(replicaId_);
  reply.set_view(viewId_);
  reply.mutable_cv()->Add(cv->cv_.begin(), cv->cv_.end());
  reply.set_syncedlogid(maxSyncedLogEntry_.load()->logId);
  masterContext_->endPoint_->SendMsgTo(*(masterReceiver_[request.replicaid()]),
                                       reply, MessageType::RECOVERY_REPLY);
}

/**
 * Collects RecoveryReply messages while recovering.
 *
 * Replies failing the crash vector check are dropped. If the reply advances
 * our crash vector, previously collected replies that no longer pass the
 * check are purged. Replies are only recorded while recoveryRequestTimer_ is
 * registered.
 *
 * Once replicaNum_ / 2 + 1 replies are collected, the highest view among
 * them (and the syncedlogid reported with it) is adopted. If this replica
 * would be the leader of that view, recovery is restarted; otherwise a
 * state transfer of the synced log from that view's leader is launched, or
 * the new view is entered directly when there is nothing to transfer.
 */
void Replica::ProcessRecoveryReply(const RecoveryReply& reply) {
  if (!CheckCV(reply.replicaid(), reply.cv())) {
    return;
  } else {
    if (Aggregated(reply.cv())) {
      // If cv is updated, then it is likely that some messages in
      // recoveryReplySet_ become stray, so remove them
      for (uint32_t i = 0; i < replicaNum_; i++) {
        auto iter = recoveryReplySet_.find(i);
        if (iter != recoveryReplySet_.end() &&
            (!CheckCV(i, iter->second.cv()))) {
          recoveryReplySet_.erase(i);
        }
      }
    }
  }

  if (masterContext_->endPoint_->isTimerRegistered(recoveryRequestTimer_) ==
      false) {
    // We no longer request recovery reply
    return;
  }
  recoveryReplySet_[reply.replicaid()] = reply;
  if (recoveryReplySet_.size() >= replicaNum_ / 2 + 1) {
    // Got enough quorum
    masterContext_->endPoint_->UnRegisterTimer(recoveryRequestTimer_);
    uint32_t maxView = 0;
    uint32_t syncedLogId = 0;
    // Seed from the first reply so that a quorum whose replies are all from
    // view 0 still yields that view's syncedlogid (a plain "> 0" comparison
    // would never pick it up and the recovery would skip state transfer).
    bool seeded = false;
    for (const auto& kv : recoveryReplySet_) {
      if (!seeded || kv.second.view() > maxView) {
        maxView = kv.second.view();
        syncedLogId = kv.second.syncedlogid();
        seeded = true;
      }
    }
    // Get the maxView, launch state transfer with the corresponding leader
    viewId_ = maxView;
    recoveryReplySet_.clear();
    LOG(INFO) << "Replica intends to enter View " << viewId_
              << " after recovery; the number of logs to recover is:"
              << syncedLogId;
    if (AmLeader()) {
      LOG(INFO) << "The recovered replica will become the leader in this view, "
                   "skip it!";
      // If the recovering replica happens to be the leader of the new view,
      // don't participate. Wait until the healthy replicas elect a new leader
      usleep(1000);  // sleep some time and restart the recovery process
      masterContext_->endPoint_->RegisterTimer(recoveryRequestTimer_);
    } else {
      // Launch state transfer for synced log entries
      stateTransferIndices_.clear();
      if (syncedLogId >= CONCURRENT_MAP_START_INDEX) {
        // There are some synced log entries that should be transferred
        transferSyncedEntry_ = true;
        stateTransferIndices_[maxView % replicaNum_] = {
            CONCURRENT_MAP_START_INDEX, syncedLogId};

        // Reference for progress reporting: <number of entries to transfer,
        // remaining percent>
        stateTransferIndicesRef_[maxView % replicaNum_] = {
            syncedLogId - CONCURRENT_MAP_START_INDEX + 1, 100};
        LOG(INFO) << "Recover Logs from " << CONCURRENT_MAP_START_INDEX
                  << "\t to\t" << syncedLogId;
        stateTransferCallback_ = std::bind(&Replica::EnterNewView, this);
        stateTransferTerminateTime_ =
            GetMicrosecondTimestamp() + stateTransferTimeout_ * 1000;
        stateTransferTerminateCallback_ =
            std::bind(&Replica::RollbackToRecovery, this);
        masterContext_->endPoint_->RegisterTimer(stateTransferTimer_);
      } else {
        // No log entries to recover, directly enter new view
        EnterNewView();
      }
    }
  }
}

/**
 * Records a SyncStatusReport and, once reports from replicaNum_ / 2 + 1
 * replicas are held, advances committedLogId_ to the smallest syncedlogid
 * among them (when it is not behind the current value) and sends a commit.
 *
 * Reports failing the crash vector check or the view check are dropped. If
 * the report advances our crash vector, previously stored reports that no
 * longer pass the check are purged.
 */
void Replica::ProcessSyncStatusReport(const SyncStatusReport& report) {
  if (!CheckCV(report.replicaid(), report.cv())) {
    // Stray message
    return;
  } else {
    if (Aggregated(report.cv())) {
      // Possibly make existing msg become stray. Every replica's stored
      // report must be re-checked, hence the bound is replicaNum_ (not this
      // replica's own id).
      for (uint32_t i = 0; i < replicaNum_; i++) {
        auto iter = syncStatusSet_.find(i);
        if (iter != syncStatusSet_.end() && (!CheckCV(i, iter->second.cv()))) {
          syncStatusSet_.erase(i);
        }
      }
    }
  }

  if (!CheckView(report.view())) {
    return;
  }

  // Only keep the report if it is the first or a fresher one from the sender
  auto iter = syncStatusSet_.find(report.replicaid());
  if (iter == syncStatusSet_.end() ||
      iter->second.syncedlogid() < report.syncedlogid()) {
    syncStatusSet_[report.replicaid()] = report;
  }

  if (syncStatusSet_.size() >= replicaNum_ / 2 + 1) {
    uint32_t minLogId = UINT32_MAX;
    for (const auto& kv : syncStatusSet_) {
      if (minLogId > kv.second.syncedlogid()) {
        minLogId = kv.second.syncedlogid();
      }
    }
    if (minLogId >= committedLogId_) {
      committedLogId_ = minLogId;
      SendCommit();
    }
  }
}

/**
 * Handles a CommitInstruction.
 *
 * After the crash vector and view checks pass, the heartbeat timestamp is
 * refreshed and toCommitLogId_ is raised if the instruction carries a larger
 * committed log id. The replica then executes and commits entries one by one
 * up to min(toCommitLogId_, logId of the last synced entry), stopping early
 * when an entry is not yet present in syncedLogEntryByLogId_.
 *
 * If an entry is missing although its id is not beyond the id read from
 * trackedEntry_[0], the state is considered inconsistent: diagnostics are
 * logged and the process terminates with a failure exit status.
 */
void Replica::ProcessCommitInstruction(const CommitInstruction& commit) {
  if (!CheckCV(commit.replicaid(), commit.cv())) {
    return;
  } else {
    Aggregated(commit.cv());
  }
  if (!CheckView(commit.view())) {
    return;
  }

  lastHeartBeatTime_ = GetMicrosecondTimestamp();
  // Buggy: should compare with syncedLogId, to see whether log is missing
  if (commit.committedlogid() > committedLogId_) {
    // Don't assign committedLogId_ directly, because this replica may have
    // not get enough synced logs
    toCommitLogId_ = commit.committedlogid();
  }

  // Never commit beyond what has been synced locally
  uint32_t nextCommitId = maxSyncedLogEntry_.load()->logId;
  if (toCommitLogId_ < nextCommitId) {
    nextCommitId = toCommitLogId_;
  }
  while (committedLogId_ < nextCommitId) {
    if (committedLogId_ < CONCURRENT_MAP_START_INDEX) {
      // Ids below CONCURRENT_MAP_START_INDEX are skipped without a look-up
      committedLogId_++;
      continue;
    }
    // Read the tracked id BEFORE the map look-up; the comparison below relies
    // on this order.
    uint32_t preFetchTrackedLogId = trackedEntry_[0]->logId;
    LogEntry* entry = syncedLogEntryByLogId_.get(committedLogId_);
    if (entry == NULL) {
      if (committedLogId_ <= preFetchTrackedLogId) {
        LOG(INFO) << "committedLogId_=" << committedLogId_ << "\t"
                  << "maxSyncedLogId_=" << maxSyncedLogEntry_.load()->logId
                  << "\ttrackedLogId =" << preFetchTrackedLogId;
        for (uint32_t i = CONCURRENT_MAP_START_INDEX;
             i <= trackedEntry_[0]->logId; i++) {
          if (syncedLogEntryByLogId_.get(i) == NULL) {
            LOG(INFO) << "log " << i << " not recorded";
          }
        }
        LOG(ERROR) << "abnormal exit";
        // Report the failure to the parent process instead of exiting with a
        // success status.
        exit(EXIT_FAILURE);
      }
      if (viewId_ == 1) {
        LOG(INFO) << "committedLogId_=" << committedLogId_ << "\t"
                  << "maxSyncedLogId_=" << maxSyncedLogEntry_.load()->logId
                  << "\t"
                  << "\ttrackedLogId =" << trackedEntry_[0]->logId;
      }

      // The entry is not available in the look-up map yet; retry on the next
      // commit instruction
      break;
    }
    ASSERT(entry != NULL);
    entry->result = ApplicationExecute(entry->body);
    committedLogId_++;
  }
}

/**
 * Checks whether a message's view matches the replica's current view.
 *
 * Returns true only when view equals viewId_. A message from an older view
 * is rejected. A message from a newer view is rejected as well, and
 * additionally: when called with isMaster set, a view change is initiated
 * unless the replica is RECOVERING; otherwise status_ is switched to
 * VIEWCHANGE.
 */
bool Replica::CheckView(const uint32_t view, const bool isMaster) {
  if (view < viewId_) {
    // old message
    return false;
  }
  const bool fromNewerView = (view > viewId_);
  if (!fromNewerView) {
    return true;
  }

  if (!isMaster) {
    // new view, update status and wait for master thread to handle the
    // situation
    status_ = ReplicaStatus::VIEWCHANGE;
    return false;
  }

  // Recovering replicas do not participate in view change
  if (status_ != ReplicaStatus::RECOVERING) {
    VLOG(2) << "InitiateViewChange-9: " << view
            << "\t currentView=" << viewId_ << "\t"
            << "td=" << pthread_self();
    InitiateViewChange(view);
  }
  return false;
}

/**
 * Returns true when the sender's own counter in the incoming crash vector is
 * not smaller than the counter recorded for that sender in the crash vector
 * currently used by the master thread (crashVectorInUse_[0]).
 */
bool Replica::CheckCV(const uint32_t senderId,
                      const google::protobuf::RepeatedField<uint32_t>& cv) {
  CrashVectorStruct* localCV = crashVectorInUse_[0].load();
  const uint32_t reportedCounter = cv.at(senderId);
  const uint32_t knownCounter = localCV->cv_[senderId];
  return reportedCounter >= knownCounter;
}

/**
 * Merges an incoming crash vector into the local one by taking the
 * element-wise maximum over the first replicaNum_ elements.
 *
 * If any element of cv is larger, a new CrashVectorStruct with the next
 * version number is stored in crashVector_ and published through
 * crashVectorInUse_[0]. While the replica is NORMAL, the call then polls
 * (sleeping 1000us between rounds) until slots 1..fastReplyQu_.size() of
 * crashVectorInUse_ have reached at least the new version.
 *
 * Returns true when the local crash vector was updated, false otherwise.
 */
bool Replica::Aggregated(const google::protobuf::RepeatedField<uint32_t>& cv) {
  CrashVectorStruct* currentCV = crashVectorInUse_[0].load();
  std::vector<uint32_t> mergedCV(currentCV->cv_);
  bool updated = false;
  for (uint32_t idx = 0; idx < replicaNum_; idx++) {
    if (mergedCV[idx] < cv.at(idx)) {
      // The incoming cv has fresher elements
      updated = true;
      mergedCV[idx] = cv.at(idx);
    }
  }
  if (!updated) {
    return false;
  }

  CrashVectorStruct* freshCV =
      new CrashVectorStruct(mergedCV, currentCV->version_ + 1);
  crashVector_.assign(freshCV->version_, freshCV);
  crashVectorInUse_[0] = freshCV;

  // When status_ is not NORMAL (e.g. ViewChange), only the master thread is
  // alive, so there is no reply thread to wait for
  if (status_ == ReplicaStatus::NORMAL) {
    // Wait until the reply threads have picked up the new cv
    for (;;) {
      bool allCaughtUp = true;
      for (uint32_t slot = 1; slot <= fastReplyQu_.size(); slot++) {
        if (crashVectorInUse_[slot].load()->version_ < freshCV->version_) {
          allCaughtUp = false;
        }
      }
      if (allCaughtUp) {
        break;
      }
      usleep(1000);
    }
  }
  return true;
}

/**
 * Puts the replica back into VIEWCHANGE status, discards the collected
 * ViewChange messages and makes sure viewChangeTimer_ is registered.
 */
void Replica::RollbackToViewChange() {
  LOG(INFO) << "Rollback to restart view change";
  status_ = ReplicaStatus::VIEWCHANGE;
  viewChangeSet_.clear();

  const bool timerActive =
      masterContext_->endPoint_->isTimerRegistered(viewChangeTimer_);
  if (!timerActive) {
    masterContext_->endPoint_->RegisterTimer(viewChangeTimer_);
  }
}

void Replica::RollbackToRecovery() {
  LOG(INFO) << "Rollback to restart recovery";
  status_ = ReplicaStatus::RECOVERING;
  recoveryReplySet_.clear();
  // Since we start a new round of recovery, the logs obtained from the
  // previous round (if any) will not count. Delete them (=clean state) and
  // restart
  LogEntry* entryStart = syncedLogEntryHead_->next;
  while (entryStart) {
    LogEntry* entryToDel = entryStart;
    entryStart = entryStart->next;
    delete entryToDel;
  }
  maxSyncedLogEntry_ = syncedLogEntryHead_;
  maxSyncedLogEntryByKey_.assign(keyNum_, NULL);

  if (false ==
      masterContext_->endPoint_->isTimerRegistered(recoveryRequestTimer_)) {
    masterContext_->endPoint_->RegisterTimer(recoveryRequestTimer_);
  }
}

/**
 * Placeholder for executing a request against the application: the request
 * is not inspected and the result is always an empty string.
 */
std::string Replica::ApplicationExecute(const RequestBody& request) {
  std::string emptyResult;
  return emptyResult;
}

/** Returns true when this replica is the leader of the current view, i.e.
 * when viewId_ modulo replicaNum_ equals replicaId_. */
bool Replica::AmLeader() {
  const uint32_t leaderId = viewId_ % replicaNum_;
  return leaderId == replicaId_;
}

}  // namespace nezha


================================================
FILE: replica/replica.h
================================================
#ifndef NEZHA_REPLICA_H
#define NEZHA_REPLICA_H

#include <yaml-cpp/yaml.h>
#include <boost/uuid/uuid.hpp>
#include <boost/uuid/uuid_generators.hpp>
#include <boost/uuid/uuid_io.hpp>
#include <condition_variable>
#include <fstream>
#include "lib/utils.h"
#include "proto/nezha_proto.pb.h"
#include "replica_config.h"

namespace nezha {
using namespace nezha::proto;
/** Receiver is more complex than sender. A sender only needs an endpoint.
 * But A Receiver needs an endpoint (endPoint_) to receive messages, and the
 * message should be handled bu an already-registered handler (msgHandlerFunc_).
 * Besides, in order to unblock the endpoint during view change, there is also a
 *  timer (monitorTimer_) needed, to keep monitor the status of the replica.
 *
 * We package all the necessary components into ReceiverContext for brievity
 */
struct ReceiverContext {
  Endpoint* endPoint_;
  void* context_;
  MessageHandlerFunc msgHandlerFunc_;
  Timer* monitorTimer_;
  ReceiverContext(Endpoint* ep = NULL, void* ctx = NULL,
                  MessageHandlerFunc msgFunc = nullptr, Timer* t = NULL)
      : endPoint_(ep),
        context_(ctx),
        msgHandlerFunc_(msgFunc),
        monitorTimer_(t) {}
  void Register(int endpointType = EndpointType::UDP_ENDPOINT) {
    if (endpointType == EndpointType::UDP_ENDPOINT) {
      // UDP Endpoint
      UDPMsgHandler* udpMsgHandler =
          new UDPMsgHandler(msgHandlerFunc_, context_);
      ((UDPSocketEndpoint*)endPoint_)->RegisterMsgHandler(udpMsgHandler);
      ((UDPSocketEndpoint*)endPoint_)->RegisterTimer(monitorTimer_);
    } else {
      // To support other types of endpoints later
      LOG(ERROR) << "unknown endpoint type " << (int)endpointType;
    }
  }
};

/**
 * Refer to replica_run.cc, the runnable program only needs to instantiate a
 * Replica object with a configuration file. Then it calls Run() method to run
 * and calls Terminate() method to stop
 */

class Replica {
 private:
  /** All the configuration parameters for the replica are included in
   * replicaConfig_*/
  ReplicaConfig replicaConfig_;
  /** 1 for UDP, 2 for GRPC (not supported yet) */
  int endPointType_;
  /** viewId_ starts from 0 */
  std::atomic<uint32_t> viewId_;
  std::atomic<uint32_t> lastNormalView_;
  /** replicaId_ starts from 0 */
  std::atomic<uint32_t> replicaId_;
  std::atomic<uint32_t> replicaNum_;

  /** Worker threads check status_ to decide whether they should be blocked (for
   * view change) */
  std::atomic<char> status_;

  /** Every unique request, sharded across several maps for concurrency.
   * Before a request is processed, it is addded to one of these maps by
   * recordThread. Map from reqKey -> logEntry */
  std::vector<ConcurrentMap<uint64_t, LogEntry*>> recordMap_;

  /** TrackThread traverses the synced log list and record in
   * syncedLogEntryByReqKey_ and syncedLogEntryByLogId_ */
  std::vector<LogEntry*> trackedEntry_;

  /** earlyBuffer_ uses the pair <deadline, reqKey> as key. std::map will sort
   * them in ascending order by default */
  std::map<std::pair<uint64_t, uint64_t>, LogEntry*> earlyBuffer_;

  /** lastReleasedEntryByKeys_ is used to support commutativity: we record the
   * last released entry for each key. When a new request comes, it is compared
   * with the last released entry of the same key */
  std::vector<std::pair<uint64_t, uint64_t>> lastReleasedEntryByKeys_;

  /**  keyNum_ indicates the number of keys that requests will work on (to
   * support commutativity optimization). We assume one request will only work
   * on one key */
  uint32_t keyNum_;

  /**
   * Log entries are organized as a list.
   * On the leader, it only needs to maintain one list, i.e., synced log list;
   * But on the follower, it maintains two lists, i.e., unsynced log list and
   * synced log list.
   *
   * syncedLogEntryHead_/unSyncedLogEntryHead_ are the starting points of the
   * two lists; we create a dummy node for each list to serve as the head
   *
   * maxSyncedLogEntry_ and maxUnSyncedLogEntry_ are the tails of the two lists
   * respectively
   */
  LogEntry* syncedLogEntryHead_;
  LogEntry* unSyncedLogEntryHead_;
  std::atomic<LogEntry*> maxSyncedLogEntry_;
  std::atomic<LogEntry*> maxUnSyncedLogEntry_;
  /**
   * minUnSyncedLogEntry_ is initialized as unSyncedLogEntryHead_, but our
   * garbage-collection thread can advance it (TODO). In this way, it can
   * accelerate the generation of filteredUnSyncedEntries_
   */
  LogEntry* minUnSyncedLogEntry_;

  /**
   * These three vecs can be considered as finer-grained versions of
   * maxSyncedLogEntry_, maxUnSyncedLogEntry_ and minUnSyncedLogEntry_.
   * They are mainly used to support commutativity optimization.
   *
   * maxSyncedLogEntryByKey_ and minUnSyncedLogEntryByKey_ combine to work as
   * the sync-point, as illustrated in Figure 5 of our paper.
   */
  std::vector<LogEntry*> maxSyncedLogEntryByKey_;
  std::vector<LogEntry*> maxUnSyncedLogEntryByKey_;
  std::vector<LogEntry*> minUnSyncedLogEntryByKey_;

  /** Index maps, which facilitate entry look-up */
  ConcurrentMap<uint64_t, LogEntry*> syncedLogEntryByReqKey_;
  ConcurrentMap<uint32_t, LogEntry*> syncedLogEntryByLogId_;

  /** Each thread is given a unique name (key) and stored in the pool */
  std::map<std::string, std::thread*> threadPool_;

  /** committedLogId_ and toCommitLogId_ are used for periodic synchronization
   * (to accelerate failure recovery) */
  std::atomic<uint32_t> committedLogId_;
  std::atomic<uint32_t> toCommitLogId_;

  /** Context (including a message handler and a monitor timer) */
  ReceiverContext* masterContext_;
  std::vector<ReceiverContext*> requestContext_;
  ReceiverContext* indexSyncContext_;
  ReceiverContext* missedIndexAckContext_;
  ReceiverContext* missedReqAckContext_;

  /** Timers
   *
   * Since message can be dropped after it is sent. For those messages which are
   * required to be eventually delivered, we register a timer to the endpoint,
   * which keeps sending the message, until the sender knows it is
   * delivered and unregister the timer
   */
  Timer* heartbeatCheckTimer_;
  Timer* indexAskTimer_;
  Timer* requestAskTimer_;
  Timer* viewChangeTimer_;
  Timer* stateTransferTimer_;
  Timer* periodicSyncTimer_;
  Timer* crashVectorRequestTimer_;
  Timer* recoveryRequestTimer_;

  /** Endpoints
   *
   * These endpoints are only used as senders, so they do not need the complex
   * context struct as receivers
   */
  std::vector<Endpoint*> indexSender_;  // send indices (Sec 5.4)
  std::vector<Endpoint*> fastReplySender_;
  std::vector<Endpoint*> slowReplySender_;
  Endpoint* indexRequester_; /** In the slow path, when indices are missing,
                                Follower uses this endpoint to send requests
                                asking for the missing indices */
  Endpoint* reqRequester_;   /** Follower uses this endpoint to send requests
                                asking for the missed requests */
  Endpoint* indexAcker_; /** Leader uses this endpoint to reply the indices to
                            the requested followers */

  /** Addresses */
  std::vector<Address*>
      indexReceiver_; /** Leader will send indices to these addresses (each
                         follower has such an address to receive index) */
  std::vector<Address*>
      indexAskReceiver_; /** Follower sends ask-requests to these addresses
                            when it is missing some indices */
  std::vector<Address*>
      requestAskReceiver_; /** Followers send ask-requests to these addresses
                              when it is missing some requests */
  std::vector<Address*>
      masterReceiver_; /** Each replica maintains a master thread, which
                          sends/receives/processes different types of control
                          messages, therefore, each replica matains such an
                          address vector (size of replicaNum) to know the
                          address of others' master thread */

  /* Round robin indices are used to achieve load balance among threads of the
   * same functionality (e.g., multiple reply threads) */
  uint32_t roundRobinProcessIdx_;
  uint32_t roundRobinIndexAskIdx_;
  uint32_t roundRobinRequestAskIdx_;

  /** Version-based CrashVector (version number as the key), to facilitate
   * garbage-collection */
  ConcurrentMap<uint32_t, CrashVectorStruct*> crashVector_;
  /** Each related thread (i.e. fast reply threads + index recv thread + index
   * ack thread) will hold an atomic pointer, pointing to the crash vector they
   * are currently using.
   *
   * The garbage collect thread will check crashVectorInUse_ to decide which
   * CrashVectorStruct can be safely reclaimed.
   *  */
  std::atomic<CrashVectorStruct*>* crashVectorInUse_;
  /** The number of threads using crash vectors (i.e., the length of
   * crashVectorInUse_) */
  uint32_t crashVectorVecSize_;

  /** The sync messages (for index sync process) which have not been processed
   */
  std::map<std::pair<uint32_t, uint32_t>, IndexSync> pendingIndexSync_;
  /** Each key in missedReqKeys_ indicating a request is missing on this replica
   */
  std::set<uint64_t> missedReqKeys_;

  /** Each pair indicates a segment of indices is missing during index sync
   * process */
  std::pair<uint32_t, uint32_t> missedIndices_;

  /** The max number of indices/reqKeys/requests that can be carried in one
   * stateTransfer message */
  uint32_t indexTransferBatch_;
  uint32_t requestKeyTransferBatch_;
  uint32_t requestTrasnferBatch_;

  /* State-Transfer related variables **/
  uint64_t stateTransferTimeout_;
  bool transferSyncedEntry_;
  /** key: the target replica to ask for requests; value: the segment <begin,
   * end> of requests that will be transferred */
  std::map<uint32_t, std::pair<uint32_t, uint32_t>> stateTransferIndices_;
  std::map<uint32_t, std::pair<uint32_t, uint32_t>>
      stateTransferIndicesRef_;  // Only serves as the references
  std::function<void(void)> stateTransferCallback_;
  /** The max amount of time that the state transfer can last */
  std::uint64_t stateTransferTerminateTime_;
  /** If the state transfer cannot be completed within
   * stateTransferTerminateTime_, execute the following callback and terminate
   * the state transfer */
  std::function<void(void)> stateTransferTerminateCallback_;

  /** Before transfer unsynced logs, the replica needs to first filter all the
   * unsynced logs, because most of them overlap with synced logs, which has
   * already been transferred, so we only need to transfer a small portion of
   * unsynced logs after filtering out those overlapped ones  */
  std::vector<LogEntry*> filteredUnSyncedEntries_;

  /** During leader election, the new leader use requestsToMerge_ to merge logs
   * collected from the quorum of replicas.
   *
   * Key: <deadline, reqKey>; Value: <request, the number of remaining replicas
   * containing this request>  */
  std::map<std::pair<uint64_t, uint64_t>, std::pair<LogEntry*, uint32_t>>
      requestsToMerge_;

  // Recovery related variables
  std::string nonce_;
  /** Key: replicaId. These structures are used to check whether a quorum has
   * been formed */
  std::map<uint32_t, CrashVectorReply> crashVectorReplySet_;
  std::map<uint32_t, RecoveryReply> recoveryReplySet_;
  std::map<uint32_t, ViewChange> viewChangeSet_;
  std::map<uint32_t, SyncStatusReport> syncStatusSet_;

  /** Inserted by ReceiveThread, and looked up by
   * FastReplyThread/SlowReplyThread */
  ConcurrentMap<uint64_t, Address*> proxyAddressMap_;

  /** Followers periodically check lastHeartBeatTime_ to decide whether it
   * should issue view change
   *
   * lastHeartBeatTime_ is updated every time the follower receives a heartbeat
   * message (i.e. IndexSync and CommitInstruction)
   *  */
  std::atomic<uint64_t> lastHeartBeatTime_;

  /** Tentative-- TODO: Add more explanation */
  uint64_t lastAskMissedIndexTime_;
  uint64_t lastAskMissedRequestTime_;
  std::unordered_map<uint64_t, uint64_t> askTimebyReqKey_;
  std::vector<uint64_t> fetchTime_;

  /** Replicas use it to check whether every worker thread has stopped */
  std::atomic<uint32_t> activeWorkerNum_;
  /** The total number of worker threads. When terminating, replicas use this
   * variable to detect whether every thread has been terminated and exited */
  uint32_t totalWorkerNum_;
  /** To implement blocking mechanism, see BlockWhenStatusIsNot function */
  std::condition_variable waitVar_;
  std::mutex waitMutext_;

  ConcurrentQueue<uint64_t> tagQu_;  // For Debug, will be deleted
  /** To communicate between ReceiveThread and ProcessThread */
  ConcurrentQueue<LogEntry*> processQu_;
  /** To communicate between ReceiveThread and RecordThread */
  std::vector<ConcurrentQueue<RequestBody*>> recordQu_;
  /** To communicate between IndexRecvThread and IndexProcessThread */
  ConcurrentQueue<std::pair<MessageHeader*, char*>> indexQu_;

  /** To communicate between ProcessThread and FastReplyThread */
  std::vector<ConcurrentQueue<LogEntry*>> fastReplyQu_;
  /** To communicate between ProcessThread and SlowReplyThread */
  std::vector<ConcurrentQueue<LogEntry*>> slowReplyQu_;
  /** To communicate between ReceiveThread and OWDCalcThread (Transmit <proxyId,
   * owd>)
   */
  ConcurrentQueue<std::pair<uint64_t, uint32_t>> owdQu_;
  /** Record the one-way delay for each proxy. Updated by OWDCalcThread, read by
   * FastReplyThread/SlowReplyThread */
  ConcurrentMap<uint64_t, uint32_t> owdMap_;
  /** The window size used to estimate one-way delay */
  uint32_t slidingWindowLen_;
  double movingPercentile_;
  std::map<uint64_t, std::vector<uint32_t>> slidingWindow_;  // <proxyid, vec>
  std::map<uint64_t, uint64_t> owdSampleNum_;

  /** Garbage-Collection related variables */
  uint32_t reclaimTimeout_;
  /** The old versions of crash vectors in crashVector_ that can be reclaimed */
  uint32_t cvVersionToClear_;
  /**  GarbageCollectThread use prepareToClearLateBufferLogId_ to tell
   * IndexSyncThread that it intends to clear the requests before this point
   * [included]  */
  std::atomic<uint32_t> prepareToClearLateBufferLogId_;
  /**  GarbageCollectThread uses prepareToClearUnSyncedLogId_ to tell
   * IndexSyncThread and FastReplyThread that it intends to clear the log
   * entries before this point [included]  */
  std::atomic<uint32_t> prepareToClearUnSyncedLogId_;
  /** IndexSyncThread use safeToClearLateBufferLogId_ to tell
   * GarbageCollectThread that it can safely clear the requests in late buffer
   * up to this point [included]
   */
  std::atomic<uint32_t> safeToClearLateBufferLogId_;
  /** FastReplyThread(s) and IndexSyncThread use these atomic variables to tell
   * GarbageCollectThread, telling that it can safely clear unsynced log entries
   * up to this point [included] */
  std::atomic<uint32_t>* safeToClearUnSyncedLogId_;

  /** Create/Initialize all the necessary variables, it is only called once
   * during the lifetime of the replica */
  void CreateContext();
  /** Launch all the threads, only called once during the lifetime of the
   * replica */
  void LaunchThreads();
  /** After a view change or recovery is completed, the replica enters a new
   * view*/
  void EnterNewView();

  /** View Change (recovery) related */
  /** Reset the necessary variables. It is called every time when we initiate a
   * view change, and this function is  much more lightweight than CreateContext
   */
  void ResetContext();
  void InitiateViewChange(const uint32_t view);
  /** Send ViewChangeRequest to every replica and send ViewChange to the
   * leader. Used to instantiate viewChangeTimer_*/
  void BroadcastViewChange();
  /** Send ViewChangeRequest to a specific replica */
  void SendViewChangeRequest(const int toReplicaId);
  /** Send ViewChange to the leader(i.e., whose replicaId = view % replicaNum)
   */
  void SendViewChange();
  /** A crashed replica needs to first call InitiateRecovery in order to join
   * the system */
  void InitiateRecovery();
  /** The RECOVERING replica asks every healthy replica for crash vector */
  void BroadcastCrashVectorRequest();
  /** The RECOVERING replica asks every healthy replica for necessary recovery
   * information (e.g., the current view, the synced logs on that replica) */
  void BroadcastRecoveryRequest();
  /** The new leader, after fully recovery, send StartView to others */
  void SendStartView(const int toReplicaId);
  /** Replicas use state transfer to retrieve (large number of) log entries from
   * others. Used to instantiate stateTransferTimer_ */
  void SendStateTransferRequest();
  /** If the view change process takes too long and cannot be completed (this
   * can happen when the leader in the new view also fails), the replica will
   * terminate the current view change process and starts a new view change with
   * higher viewId */
  void RollbackToViewChange();
  /** If the recovery process takes too long and cannot be completed (this can
   * happen when the RECOVERING replica happens to be the leader in the new
   * view), this replica will terminate the in-progress recovery and starts a
   * new round of recovery, after the healthy replicas have elected a new leader
   * among themselves */
  void RollbackToRecovery();
  /** During view change, replicas may have some uncommitted requests, which
   * will not show in the new view, so replicas will rewind log list and
   * eliminate those uncommitted ones, and then append the committed entries
   * from the leader
   */
  void RewindSyncedLogTo(uint32_t rewindPoint);

  /** Periodic Sync related */
  /** Followers periodically report their sync-point to the leader, so the
   * leader can decide the commit-point.
   * Used to instantiate periodicSyncTimer_ */
  void SendSyncStatusReport();
  /** Leader send commit-point to followers, so followers can safely execute the
  log entries up to commit-point. This is very useful to accelerate view change
  after the leader fails (details in  para. ``Acceleration of Recovery'' of Sec
  6 of our paper) */
  void SendCommit();

  /** Garbage-Collect related */
  /** If the logs (on the followers) have not been added into synced log list
   * and has been stayed on the replica for too long, then the garbage-collect
   * (gc) thread will reclaim it and free its memory */
  void ReclaimStaleLogs();
  void PrepareNextReclaim();
  /** If the crashVectorStruct is no longer used by any thread on this replica,
   * the gc-thread collects it */
  void ReclaimStaleCrashVector();

  /** Message handler */
  bool ProcessIndexSync(const IndexSync& idxSyncMsg);
  void ProcessViewChangeReq(const ViewChangeRequest& viewChangeReq);
  void ProcessViewChange(const ViewChange& viewChange);
  void ProcessStateTransferRequest(
      const StateTransferRequest& stateTransferReq);
  void ProcessStateTransferReply(const StateTransferReply& stateTransferRep);
  void ProcessStartView(const StartView& startView);
  void ProcessCrashVectorRequest(const CrashVectorRequest& request);
  void ProcessCrashVectorReply(const CrashVectorReply& reply);
  void ProcessRecoveryRequest(const RecoveryRequest& request);
  void ProcessRecoveryReply(const RecoveryReply& reply);
  void ProcessSyncStatusReport(const SyncStatusReport& report);
  void ProcessCommitInstruction(const CommitInstruction& commit);
  void ProcessRequest(LogEntry* rb, const bool isSyncedReq = true,
                      const bool sendReply = true,
                      const bool canExecute = false);

  /** The interfaces to bridge specific applications with Nezha */
  std::string ApplicationExecute(const RequestBody& request);

  /** Tools */
  /** Check whether this replica is leader, return true if it is */
  bool AmLeader();
  /** During view change, BlockWhenStatusIsNot uses the conditional variable
   * (waitVar_) to block the worker threads. Finally only the master thread is
   * alive, so that it can run the related procedure without risks of data race
   */
  void BlockWhenStatusIsNot(char targetStatus);

  /**
   * CheckView returns true if the message's view (Parameter-1) is consistent
   * with the replica's current view
   *
   * Master thread (isMaster) can initiate view change, non-master threads only
   * switch status to ViewChange  */
  bool CheckView(const uint32_t view, const bool isMaster = true);

  /** CheckCV checks the crashVector to decide whether the incoming message is
   * stray message. It returns true if the cv is valid (i.e., the message is not
   * stray message) */
  bool CheckCV(const uint32_t senderId,
               const google::protobuf::RepeatedField<uint32_t>& cv);

  /** Check whether the incoming message's crash vector (the passed-in cv) will
   * lead to the update of replica's crashVector (i.e., crashVector_[0]).
   * If it needs aggregation, this function will aggregate it and return true */
  bool Aggregated(const google::protobuf::RepeatedField<uint32_t>& cv);

  /**
   * During state transfer, the log transfer are divided into two parts, synced
   * log transfer and unsynced log transfer. Which are undertaken by the
   * following two functions
   */
  void TransferSyncedLog();
  void TransferUnSyncedLog();

  /**
   * After enough unsynced logs have been collected by the leader, the leader
   * merges the unsynced logs to decide which logs can be included in the
   * newly-built log list (details in Sec 6 and Appendix A.3 of our paper)
   */
  void MergeUnSyncedLog();

  /** Convert our self-defined message to proto message */
  void RequestBodyToMessage(const RequestBody& rb, RequestBodyMsg* rbMsg);

  /** Threads
   *
   * Functions whose names end with ``Thread`` will be instantiated with a
   * thread. Some functions are heavy and need to be parallelized, so the
   * parallelized threads with the same functionality are distinguished by the
   * first parameter, id.
   *
   * Some functions will also use crash vector, to distinguish the crash vectors
   * used by them, the functions also accept the second parameter, cvId.  */
  void ReceiveThread(int id = -1);
  void ProcessThread(int id = -1);
  void RecordThread(int id = -1);
  void TrackThread(int id = -1);
  void FastReplyThread(int id = -1, int cvId = -1);
  void SlowReplyThread(int id = -1);
  void IndexSendThread(int id = -1, int cvId = -1);
  void IndexRecvThread();
  void IndexProcessThread();
  void MissedIndexAckThread();
  void MissedReqAckThread();
  void OWDCalcThread();
  void GarbageCollectThread();

  /** Message handler functions
   * These message handler functions will be used to instantiate MessageHandlers
   * and attached to their related endpoints.
   */
  void ReceiveClientRequest(MessageHeader* msgHdr, char* msgBuffer,
                            Address* sender);
  void ReceiveIndexSyncMessage(MessageHeader* msgHdr, char* msgBuffer);
  void ReceiveAskMissedReq(MessageHeader* msgHdr, char* msgBuffer);
  void ReceiveAskMissedIdx(MessageHeader* msgHdr, char* msgBuffer);
  void ReceiveMasterMessage(MessageHeader* msgHdr, char* msgBuffer);

  /** Used to instantiate indexAskTimer_ */
  void AskMissedIndex();
  /** Used to instantiate requestAskTimer_*/
  void AskMissedRequest();
  /** Used to instantiate heartbeatCheckTimer_ */
  void CheckHeartBeat();

 public:
  /** Replica accepts a config file, which contains all the necessary
   * information to instantiate the object, then it can call Run method
   *
   * Specifically, if this replica has crashed before, it will receive
   * isRecovering=true, then it first completes the recovery procedure before it
   * can join the system
   *  */
  Replica(
      const std::string& configFile = "../configs/nezha-replica-config.yaml",
      bool isRecovering = false);
  ~Replica();

  void Run();
  void Terminate();

  /** Tentative */
  std::atomic<uint32_t>* repliedSyncPoint_;
  uint32_t maxProxyNum_ = 16;
};

}  // namespace nezha

#endif

================================================
FILE: replica/replica_config.h
================================================
#include <glog/logging.h>
#include <stdint.h>
#include <yaml-cpp/yaml.h>
#include <string>
#include <vector>

struct ReplicaConfig {
  uint32_t endpointType;
  std::vector<std::string> replicaIps;
  int replicaId;
  int receiverShards;
  int recordShards;
  int replyShards;
  int trackShards;
  int receiverPort;
  int indexSyncPort;
  int requestAskPort;
  int indexAskPort;
  int masterPort;
  int monitorPeriodMs;
  int heartbeatThresholdMs;
  int indexAskPeriodMs;
  int viewChangePeriodMs;
  int stateTransferPeriodMs;
  int stateTransferTimeoutMs;
  int indexTransferBatch;
  int requestKeyTransferBatch;
  int requestTransferBatch;
  int requestAskPeriodMs;
  int crashVectorRequestPeriodMs;
  int recoveryRequestPeriodMs;
  int syncReportPeriodMs;
  int indexSyncPeriodUs;
  double movingPercentile;
  int keyNum;
  uint32_t owdEstimationWindow;
  uint32_t reclaimTimeoutMs;
  int indexSyncShards;

  // The number of threads to process requests. For now process-shards
  // is fixed to 1, because the early-buffer enque/deque is hard to
  // parallelize. Maybe later we can find a high-performant **concurrent
  // priority queue** for early-buffer, then process-shards may be
  // parallelized for higher performance
  int processShards = 1;

  // Parses yaml file configFilename and fills in fields of ReplicaConfig
  // accordingly. Returns an error message or "" if there are no errors.
  std::string parseConfig(std::string configFilename) {
    YAML::Node config;
    try {
      config = YAML::LoadFile(configFilename);
    } catch (const YAML::BadFile& e) {
      return "Error loading config file:" + e.msg + ".";
    }
    LOG(INFO) << "Using config:\n " << config;

    std::string key;  // Keep track of current key for better error messages
    try {
      key = "endpoint-type";
      endpointType = config[key].as<uint32_t>();
      key = "replica-ips";
      for (uint32_t i = 0; i < config[key].size(); i++) {
        replicaIps.push_back(config[key][i].as<std::string>());
      }
      key = "replica-id";
      replicaId = config[key].as<int>();
      key = "receiver-shards";
      receiverShards = config[key].as<int>();
      key = "record-shards";
      recordShards = config[key].as<int>();
      key = "reply-shards";
      replyShards = config[key].as<int>();
      key = "index-sync-shards";
      indexSyncShards = config[key].as<int>();
      key = "track-shards";
      trackShards = config[key].as<int>();
      key = "receiver-port";
      receiverPort = config[key].as<int>();
      key = "index-sync-port";
      indexSyncPort = config[key].as<int>();
      key = "request-ask-port";
      requestAskPort = config[key].as<int>();
      key = "index-ask-port";
      indexAskPort = config[key].as<int>();
      key = "master-port";
      masterPort = config[key].as<int>();
      key = "monitor-period-ms";
      monitorPeriodMs = config[key].as<int>();
      key = "heartbeat-threshold-ms";
      heartbeatThresholdMs = config[key].as<int>();
      key = "index-ask-period-ms";
      indexAskPeriodMs = config[key].as<int>();
      key = "view-change-period-ms";
      viewChangePeriodMs = config[key].as<int>();
      key = "request-ask-period-ms";
      requestAskPeriodMs = config[key].as<int>();
      key = "state-transfer-period-ms";
      stateTransferPeriodMs = config[key].as<int>();
      key = "state-transfer-timeout-ms";
      stateTransferTimeoutMs = config[key].as<int>();
      key = "index-transfer-batch";
      indexTransferBatch = config[key].as<int>();
      key = "request-key-transfer-batch";
      requestKeyTransferBatch = config[key].as<int>();
      key = "request-transfer-batch";
      requestTransferBatch = config[key].as<int>();
      key = "crash-vector-request-period-ms";
      crashVectorRequestPeriodMs = config[key].as<int>();
      key = "recovery-request-period-ms";
      recoveryRequestPeriodMs = config[key].as<int>();
      key = "sync-report-period-ms";
      syncReportPeriodMs = config[key].as<int>();
      key = "key-num";
      keyNum = config[key].as<int>();
      key = "owd-estimation-window";
      owdEstimationWindow = config[key].as<uint32_t>();
      key = "index-sync-period-us";
      indexSyncPeriodUs = config[key].as<uint32_t>();
      key = "moving-percentile";
      movingPercentile = config[key].as<double>();
      key = "reclaim-timeout-ms";
      reclaimTimeoutMs = config[key].as<uint32_t>();
      return "";
    } catch (const YAML::BadConversion& e) {
      if (config[key]) {
        return "Error parsing config field " + key + ": " + e.msg + ".";
      } else {
        return "Error parsing config field " + key + ": " + key + " not found.";
      }
    } catch (const std::exception& e) {
      return "Error parsing config field " + key + ": " + e.what() + ".";
    }
  }
};

================================================
FILE: replica/replica_run.cc
================================================
#include "replica/replica.h"
// Command-line flags (gflags). --config selects the replica's config file;
// --isRecovering is passed to the Replica constructor as its second argument.
DEFINE_string(config, "nezhav2/config/nezha-replica-config-0.yaml", "The config file for the replica");
DEFINE_bool(isRecovering, false, "If this flag is true, then the replica will start recovery process once it is launched");

// Process-wide replica instance. It is global so that the SIGINT handler
// (Terminate) can reach it; it stays NULL until main() has constructed it.
nezha::Replica* replica = NULL;
void Terminate(int para) {
    LOG(INFO) << "Terminating...";
    replica->Terminate();
}
// Entry point of the replica process: parse flags, set up logging, install
// the SIGINT handler, then build the replica from --config / --isRecovering
// and run it until Run() returns.
int main(int argc, char* argv[]) {
    gflags::ParseCommandLineFlags(&argc, &argv, true);
    google::InitGoogleLogging(argv[0]);
    // Send log output to stderr.
    FLAGS_logtostderr = 1;
    // Note: the handler is registered before `replica` is assigned below.
    signal(SIGINT, Terminate);
    replica = new nezha::Replica(FLAGS_config, FLAGS_isRecovering);
    replica->Run();
    LOG(INFO) << " Run Completed";
    delete replica;
    return 0;
}

================================================
FILE: scripts/analysis.py
================================================
import pandas as pd
from IPython import embed; 
import argparse
import datetime

# Base directory under which the "stats" folder with the CSV files is expected.
LOGIN_PATH = "/home/steam1994"
# Numeric codes compared against the 'CommitType' column of the stats files.
# FAST_REPLY is the only one used in this script; SLOW_REPLY and COMMIT_REPLY
# presumably mirror the other reply types of the protocol -- TODO confirm.
FAST_REPLY = 6
SLOW_REPLY = 7
COMMIT_REPLY = 8


def throughput_apply_func(group):
    """Summarise one time bin as its row count.

    Returns a one-element Series ``{'AvgThroughput': <number of rows>}`` for a
    non-empty group, and ``None`` for an empty one.
    """
    row_count = len(group)
    if not row_count:
        return None
    return pd.Series({'AvgThroughput': row_count})

def ThroughputAnalysis(merge_df):
    """Return the mean number of rows per second, ignoring warm-up/cool-down.

    'CommitTime' is scaled by 1e-6 and converted with
    ``datetime.fromtimestamp`` (i.e. treated as microseconds since the epoch);
    the result is stored in a 'time' column of ``merge_df`` itself, so the
    caller's frame is modified. Rows are then counted per one-second bin,
    empty bins are dropped, the first five and last five remaining bins are
    discarded, and the mean of what is left is returned.
    """
    bin_interval_s = 1

    def to_datetime(us_ts):
        return datetime.datetime.fromtimestamp(us_ts * 1e-6)

    merge_df.loc[:, "time"] = merge_df['CommitTime'].apply(to_datetime)

    time_bins = pd.Grouper(key='time', freq='{}s'.format(bin_interval_s))
    per_bin = merge_df.groupby(time_bins).apply(throughput_apply_func)

    # Empty bins come back as missing values; drop them, then trim both ends.
    steady = per_bin.dropna()[5:-5]
    # print(steady['AvgThroughput'])
    return (steady['AvgThroughput'] / bin_interval_s).mean()


if __name__ == '__main__':
    # Script entry point: load the per-client and per-proxy stats files from
    # {LOGIN_PATH}/stats, print latency / throughput / fast-commit summaries,
    # then drop into an IPython shell for interactive inspection.
    parser = argparse.ArgumentParser(description='Process some integers.')
    parser.add_argument('--num_replicas',  type=int, default = 3,
                        help='Specify the number of replicas ')
    parser.add_argument('--num_proxies',  type=int, default = 2,
                        help='Specify the number of proxies ')
    parser.add_argument('--num_clients',  type=int, default = 10,
                        help='Specify the number of clients ')
    args = parser.parse_args()

    num_replicas = args.num_replicas
    num_proxies = args.num_proxies
    num_clients = args.num_clients

    print("replicas: ", num_replicas)
    print("proxies: ", num_proxies)
    print("clients: ", num_clients)


    folder_name = "stats"
    stats_folder = "{login_path}/{folder_name}".format(
        login_path = LOGIN_PATH,
        folder_name = folder_name
    )

    # ---- Client-side statistics (files are numbered from 1) ----
    # NOTE(review): client files are opened without a ".csv" suffix while the
    # proxy files below use one -- confirm this matches what the client writes.
    client_df_list = []
    for i in range(num_clients):
        file_name = "Client-Stats-"+str(i+1)
        client_df = pd.read_csv(stats_folder+"/"+file_name)
        client_df_list.append(client_df)
    client_df = pd.concat(client_df_list)
    client_df['Latency'] = client_df['CommitTime']-client_df['SendTime']

    stats = ""
    stats += "Num:"+str(len(client_df))+"\n"
    stats += "50p:\t"+str(client_df['Latency'].quantile(.5))+"\n"
    stats += "75p:\t"+str(client_df['Latency'].quantile(.75))+"\n"
    stats += "90p:\t"+str(client_df['Latency'].quantile(.9))+"\n"
    fast_num = len(client_df[client_df['CommitType']== FAST_REPLY])
    stats += "Fast:\t"+str(fast_num/ len(client_df))+"\n"
    print(stats)

    throughput_stats = ThroughputAnalysis(client_df)
    print("Throughput ", throughput_stats)


    # ---- Proxy-side statistics (files are numbered from 1) ----
    proxy_df_list = []
    for i in range(num_proxies):
        file_name = "Proxy-Stats-"+str(i+1)+".csv"
        proxy_df = pd.read_csv(stats_folder+"/"+file_name)
        proxy_df_list.append(proxy_df)
        print("Proxy ", len(proxy_df))
    proxy_df = pd.concat(proxy_df_list)

    proxy_df = proxy_df.sort_values(by=['ClientTime'])
    proxy_df["E2E"] = proxy_df["ProxyRecvTime"]-proxy_df["ProxyTime"]
    proxy_df["Bound"] = proxy_df["Deadline"]-proxy_df["ProxyTime"]
    # Use the named constant (same value as the former literal 6) so the proxy
    # and client sections agree on what counts as a fast commit.
    fast_num = len(proxy_df[proxy_df["CommitType"]==FAST_REPLY])
    print("fast commit ratio ", fast_num/len(proxy_df))
    print("Bound ", proxy_df["Bound"].quantile(.5))
    print("Proxy-E2E  50p ", proxy_df["E2E"].quantile(.5), 
            "\t75p:", proxy_df["E2E"].quantile(.75),
            "\t90p:", proxy_df["E2E"].quantile(.9),
            "\t95p:", proxy_df["E2E"].quantile(.95))


    # fast_df = proxy_df[proxy_df["SlowReplyTime"]==0].copy()
    # slow_df = proxy_df[proxy_df["SlowReplyTime"]>0].copy()
    # proxy_df['H1']=proxy_df['ProxyTime']-proxy_df["ClientTime"]
    # proxy_df['H2']=proxy_df['RecvTime']-proxy_df["ProxyTime"]
    # fast_df['F1']=fast_df['FastReplyTime']-fast_df["RecvTime"]
    # slow_df['HF1']=slow_df['SlowReplyTime']-slow_df["RecvTime"]
    # slow_df['HF3']=slow_df['SlowReplyTime']-slow_df["FastReplyTime"]
    # fast_df['H3']=fast_df['ProxyRecvTime']-fast_df["FastReplyTime"]
    # slow_df['H3']=slow_df['ProxyRecvTime']-slow_df["SlowReplyTime"]
    # fast_df['total'] = fast_df["ProxyRecvTime"] - fast_df["ClientTime"]
    # slow_df['total'] = slow_df["ProxyRecvTime"] - slow_df["ClientTime"]


    # Leave the data frames available in an interactive IPython session.
    embed()


================================================
FILE: scripts/launch.py
================================================
import os
import subprocess
from subprocess import PIPE, Popen
import time
import ruamel.yaml
from termcolor import colored
import argparse


# Deployment-specific settings; edit these for your own environment.
# NOTE(review): LOGIN_PATH and TAG are not referenced in the part of the file
# reviewed here -- presumably used further down; verify.
LOGIN_PATH = "/home/steam1994"
TAG = "opensource-test"
# Private key handed to ssh/scp via "-i"; an empty value omits the option.
SSH_KEY = "/home/steam1994/.ssh/id_rsa"
ssh_identity = '-i {}'.format(SSH_KEY) if SSH_KEY else ''
# Prefix for SSH and SCP.
SSH = 'ssh {} -q -o ConnectTimeout=2 -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no '.format(
    ssh_identity)
SCP = 'scp -r {} -q -o ConnectTimeout=2 -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no '.format(
    ssh_identity)
# Remote account name used in every "user@host" target.
USERNAME = "steam1994"
# Upper bound used by the retry loops in scp_files / run_command.
CMD_RETRY_TIMES = 3


def generate_ttcs_cfg_file(internal_ip, is_reference=False, use_ntp=False):
    """Write ``ttcs-agent.cfg`` into the current directory and return its name.

    The file content is a fixed template in which every ``InternalIP``
    placeholder is replaced by ``internal_ip``. Only the last two settings
    depend on the flags:

    * ``is_reference`` truthy: ``clock_quality: 10``, ``correct_clock: false``
      (``use_ntp`` is ignored in this case);
    * otherwise, ``use_ntp`` truthy: ``clock_quality: 1``,
      ``correct_clock: false``;
    * otherwise: ``clock_quality: 1``, ``correct_clock: true``.
    """
    if is_reference:
        clock_quality, correct_clock = "10", "false"
    elif use_ntp:
        clock_quality, correct_clock = "1", "false"
    else:
        clock_quality, correct_clock = "1", "true"

    template_lines = [
        'management_address: "InternalIP"',
        'log_dir: "/var/opt/ttcs/log"',
        'subscription_mode: true',
        'coordinator_address: "c-gjk1994gjk1994-c89e.gcp.clockwork.io"',
        'coordinator_subscription_service_port: 6176',
        'probe_address: "InternalIP"',
        'clock_quality: ' + clock_quality,
        'correct_clock: ' + correct_clock,
    ]
    # No trailing newline, exactly like the original template.
    cfg_text = "\n".join(template_lines).replace("InternalIP", internal_ip)

    cfg_file_name = "ttcs-agent.cfg"
    with open(cfg_file_name, "w") as cfg_out:
        cfg_out.write(cfg_text)
    return cfg_file_name


def retry_proc_error(procs_list):
    """Wait for each process and relaunch the ones that failed.

    ``procs_list`` holds ``(server, proc, cmd)`` tuples. Every ``proc`` is
    waited on in order via ``communicate()``; when its return code is non-zero
    the same ``cmd`` (split on whitespace) is started again right away.

    Returns the list of ``(server, new_proc, cmd)`` tuples for the relaunched
    commands; the new processes have not been waited on yet.
    """
    relaunched = []
    for server, proc, cmd in procs_list:
        proc.communicate()
        if proc.returncode == 0:
            continue
        retry_proc = Popen(cmd.split(), stdout=PIPE, stderr=PIPE)
        relaunched.append((server, retry_proc, cmd))
    return relaunched


def start_ttcs_node(internal_ip, is_reference, use_ntp=False):
    """(Re)install and start the ttcs-agent service on one node.

    Steps, each run over ssh in the foreground on ``internal_ip``:
    purge any previous ttcs-agent package, install the .deb, generate the
    agent config locally and copy it to /etc/opt/ttcs, stop/disable ntp and
    chronyd (or enable/start chronyd for the reference node or when
    ``use_ntp`` is set), and finally start the ttcs-agent service.
    """
    def run_blocking(cmd):
        run_command([internal_ip], cmd, in_background=False)

    # Replace whatever agent package is currently installed.
    run_blocking("sudo dpkg -P ttcs-agent")
    # (previously: "sudo dpkg -i /root/ttcs-agent_1.0.12_amd64.deb")
    run_blocking("sudo dpkg -i /home/steam1994/ttcs-agent_1.0.21_amd64.deb")

    # The generated file lands in the current directory as ttcs-agent.cfg.
    generate_ttcs_cfg_file(internal_ip, is_reference, use_ntp)
    local_file_path = "./ttcs-agent.cfg"
    remote_dir = "/etc/opt/ttcs"
    remote_path = remote_dir + "/ttcs-agent.cfg"

    run_blocking("sudo chmod -R 777 {remote_dir}".format(remote_dir=remote_dir))
    run_blocking("sudo rm -f {remote_path}".format(remote_path=remote_path))
    scp_files([internal_ip], local_file_path, remote_path, to_remote=True)

    if is_reference is True or use_ntp is not False:
        time_service_cmds = [
            "sudo systemctl enable chronyd",
            "sudo systemctl start chronyd",
        ]
    else:
        time_service_cmds = [
            "sudo systemctl stop ntp",
            "sudo systemctl disable ntp",
            "sudo systemctl stop chronyd",
            "sudo systemctl disable chronyd",
        ]
    for cmd in time_service_cmds:
        run_blocking(cmd)

    run_blocking("sudo systemctl start ttcs-agent")


def launch_ttcs(server_ip_list):
    """Stop and disable chronyd and ntp on every server, then start ttcs-agent.

    Each command is run over ssh in the foreground on all of
    ``server_ip_list`` before the next one is issued.
    """
    commands = (
        "sudo systemctl stop chronyd",
        "sudo systemctl disable chronyd",
        "sudo systemctl stop ntp",
        "sudo systemctl disable ntp",
        "sudo systemctl start ttcs-agent",
    )
    for cmd in commands:
        run_command(server_ip_list, cmd, in_background=False)


def scp_files(server_ip_list, local_path_to_file, remote_dir, to_remote):
    '''
    copies the file in 'local_path_to_file' to the 'remote_dir' in all servers
    whose external ip addresses are in 'server_ip_list'

    One scp process is started per server and they run concurrently. A failed
    copy is relaunched up to CMD_RETRY_TIMES times; the operation only counts
    as failed if the last attempt for some server still exits non-zero.

    args
        server_ip_list: list of external IP addresses to communicate with
        local_path_to_file: e.g. ./script.py
        remote_dir: e.g. ~
        to_remote: whether to copy to remote (true) or vice versa (false)
    returns
        boolean whether operation was successful on all servers or not
    '''
    src = remote_dir if not to_remote else local_path_to_file
    src_loc = 'remote' if not to_remote else 'local'
    dst = remote_dir if to_remote else local_path_to_file
    dst_loc = 'remote' if to_remote else 'local'

    message = 'from ({src_loc}) {src} to ({dst_loc}) {dst}'.format(
        src_loc=src_loc, src=src, dst_loc=dst_loc, dst=dst)
    print('---- started scp {}'.format(message))

    procs = []
    for server in server_ip_list:
        if to_remote:
            cmd = '{} {} {}@{}:{}'.format(SCP, local_path_to_file,
                                          USERNAME, server, remote_dir)
        else:
            cmd = '{} {}@{}:{} {}'.format(SCP, USERNAME, server,
                                          remote_dir, local_path_to_file)
        proc = Popen(cmd.split(), stdout=PIPE, stderr=PIPE)
        # print("scp cmd ", cmd)
        procs.append((server, proc, cmd))

    # Each round waits for the outstanding attempts and relaunches only the
    # ones that failed. The next round must be fed the relaunched processes
    # (procs_error), not the original list, otherwise the outcome of every
    # retry is thrown away and the same first-attempt failures are retried.
    procs_error = retry_proc_error(procs)
    retries = 1
    while retries < CMD_RETRY_TIMES and procs_error:
        procs_error = retry_proc_error(procs_error)
        retries += 1

    # Whatever is left in procs_error is the final attempt, still running or
    # finished; the copy failed only if that attempt exits non-zero.
    success = True
    for server, proc, cmd in procs_error:
        output, err = proc.communicate()
        if proc.returncode != 0:
            success = False
            print(
                colored('[{}]: FAIL SCP - [{}]'.format(server, cmd),
                        'yellow'))
            print(colored('Error Response:', 'blue', attrs=['bold']),
                  proc.returncode, output, err)

    if success:
        print(
            colored('---- SUCCESS SCP {} on {}'.format(message,
                                                       str(server_ip_list)),
                    'green',
                    attrs=['bold']))
    else:
        print(
            colored('---- FAIL SCP {}'.format(message), 'red', attrs=['bold']))
    return success


def run_command(server_ip_list, cmd, in_background=True):
    '''
    runs the command 'cmd' over ssh in all servers whose external ip
    addresses are in 'server_ip_list'

    cfg
        server_ip_list: list of external IP addresses to communicate with
        cmd: command to run
        in_background: when True (the default) the ssh processes are only
            started; their result is never checked and the call reports
            success. When False, retry_proc_error is consulted up to
            CMD_RETRY_TIMES times and remaining failures are printed.
    returns
        (success, output) tuple. 'success' is a boolean telling whether the
        operation was successful on all servers (always True when
        in_background is set). 'output' is '' unless a failure was reported,
        in which case it is the stdout of the last failing process.
    '''
    if not in_background:
        print('---- started to run command - [{}] on {}'.format(
            cmd, str(server_ip_list)))
    else:
        print(
            colored('---- started to run [IN BACKGROUND] command - [{}] on {}'.
                    format(cmd, str(server_ip_list)),
                    'blue',
                    attrs=['bold']))

    # One ssh process per server; the ssh command is kept for error reporting.
    procs = []
    for server in server_ip_list:
        ssh_cmd = '{} {}@{} {}'.format(SSH, USERNAME, server, cmd)
        proc = Popen(ssh_cmd.split(), stdout=PIPE, stderr=PIPE)
        procs.append((server, proc, ssh_cmd))

    success = True
    output = ''
    if not in_background:
        # A non-empty result of retry_proc_error is treated as the list of
        # (server, proc, cmd) entries that are still failing.
        procs_error = retry_proc_error(procs)
        retries = 1
        while retries < CMD_RETRY_TIMES and procs_error:
            procs_error = retry_proc_error(procs)
            retries += 1

        if retries >= CMD_RETRY_TIMES and procs_error:
            success = False
            # Use a dedicated loop name: reusing 'cmd' here would overwrite
            # the caller's command and make the summary line below report
            # the ssh command of the last failing server instead.
            for server, proc, failed_cmd in procs_error:
                output, err = proc.communicate()
                if proc.returncode != 0:
                    print(
                        colored(
                            '[{}]: FAIL run command - [{}]'.format(
                                server, failed_cmd), 'yellow'))
                    print(colored('Error Response:', 'blue', attrs=['bold']),
                          proc.returncode, output, err)

        if success:
            print(
                colored('---- SUCCESS run command - [{}] on {}'.format(
                    cmd, str(server_ip_list)),
                        'green',
                        attrs=['bold']))
        else:
            print(
                colored('---- FAIL run command - [{}]'.format(cmd),
                        'red',
                        attrs=['bold']))

    return success, output


def create_instance(instance_name,
                    image=None,
                    machine_type = "n1-standard-4",
                    customzedZone = "us-central1-a",
                    customzedIp = None,
                    require_external_ip=False,
                    second_ip = False
                    ):
    '''
    creates one GCE instance by shelling out to
    'gcloud beta compute instances create'

    cfg
        instance_name: name of the instance to create
        image: value passed to --image-family (callers are expected to
            supply it; None is formatted literally as "None")
        machine_type: value passed to --machine-type
        customzedZone: value passed to --zone
        customzedIp: optional private IP for the primary network interface
        require_external_ip: when False the primary interface is created
            with 'no-address'
        second_ip: when True a second interface on 'subnet-1' (without an
            external address) is added
    returns
        nothing; on a non-zero gcloud exit status the error is printed
    '''
    # Collect the properties of the primary --network-interface flag first,
    # so the flag prefix is emitted whenever at least one property is set.
    # (Previously require_external_ip=True together with customzedIp produced
    # a dangling ",private-network-ip=..." token.)
    primary_nic_props = []
    if not require_external_ip:
        primary_nic_props.append("no-address")
    if customzedIp is not None:
        primary_nic_props.append("private-network-ip=" + customzedIp)

    network_address_config = ""
    if primary_nic_props:
        network_address_config = ("--network-interface " +
                                  ",".join(primary_nic_props))

    if second_ip:
        network_address_config += " --network-interface subnet=subnet-1,no-address"
    # scopes = "--scopes storage-full,https://www.googleapis.com/auth/bigtable.admin,https://www.googleapis.com/auth/bigtable.data,https://www.googleapis.com/auth/bigquery"
    # if full_access_to_cloud_apis:
    scopes = "--scopes=https://www.googleapis.com/auth/cloud-platform"

    # Construct gcloud command to create instance.
    create_instance_cmd = """gcloud beta compute instances create {inst} --zone {zone} --image-family {source_image} --machine-type {machine_type} {network} {scopes} --boot-disk-size 50GB""".format(
        inst=instance_name,
        zone=customzedZone,
        source_image=image,
        machine_type=machine_type,
        network=network_address_config,
        scopes=scopes,
    )

    # print(create_instance_cmd)
    # Run gcloud command to create machine.
    proc = Popen(create_instance_cmd, stdout=PIPE, stderr=PIPE, shell=True)
    # Wait for the process end and print error in case of failure
    output, error = proc.communicate()
    if proc.returncode != 0:
        print(colored("Failed to create instance", color="red",
                      attrs=["bold"]))
        print(colored("Error Response: ", color="blue", attrs=["bold"]),
              output, error)


def del_instance_list(instance_list, zone="us-central1-a"):
    '''
    starts one 'gcloud -q compute instances delete' process per instance

    cfg
        instance_list: names of the instances to delete
        zone: zone passed to --zone
    returns
        nothing; the gcloud processes are started and not waited for
    '''
    delete_template = 'gcloud -q compute instances delete {inst} --zone {zone}'
    for machine in instance_list:
        print(colored("Deleting "+machine, "red", attrs=['bold']))
        delete_argv = delete_template.format(inst=machine, zone=zone).split()
        subprocess.Popen(delete_argv)

def stop_instance_list(instance_list, zone="us-central1-a"):
    '''
    stops all instances of 'instance_list' with a single gcloud invocation

    cfg
        instance_list: names of the instances to stop
        zone: zone passed to --zone
    '''
    # All names go into one command, separated by spaces.
    joined_names = ' '.join(instance_list)
    stop_cmd = 'gcloud compute instances stop {inst} --zone {zone}'.format(
        inst=joined_names, zone=zone)
    print(stop_cmd)
    os.system(stop_cmd)


def start_instance_list(instance_list, zone="us-central1-a"):
    '''
    starts all instances of 'instance_list' with a single gcloud invocation

    cfg
        instance_list: names of the instances to start
        zone: zone passed to --zone
    '''
    # All names go into one command, separated by spaces.
    joined_names = ' '.join(instance_list)
    start_cmd = 'gcloud compute instances start {inst} --zone {zone}'.format(
        inst=joined_names, zone=zone)
    print(start_cmd)
    os.system(start_cmd)

if __name__ == '__main__':
    # ---- Command line: how many replicas / proxies / clients to drive ----
    parser = argparse.ArgumentParser(description='Process some integers.')
    parser.add_argument('--num_replicas',  type=int, default = 3,
                        help='Specify the number of replicas ')
    parser.add_argument('--num_proxies',  type=int, default = 2,
                        help='Specify the number of proxies ')
    parser.add_argument('--num_clients',  type=int, default = 10,
                        help='Specify the number of clients ')
    args = parser.parse_args()

    num_replicas = args.num_replicas
    num_proxies = args.num_proxies
    num_clients = args.num_clients
    print("replicas: ", num_replicas)
    print("proxies: ", num_proxies)
    print("clients: ", num_clients)

    # cfg_file_name = generate_ttcs_cfg_file("10.128.3.79", is_reference=True, use_ntp=False)

    # Hard-coded pools of private IPs, one pool per role.
    replica_ips = ["10.128.2."+str(i+10) for i in range(3)]
    proxy_ips = ["10.128.2."+str(i+20) for i in range(3, 5) ]
    client_ips = ["10.128.2."+str(i+30) for i in range(5, 15) ]

    # The pools are finite. Reject larger requests up front instead of
    # failing later with an IndexError after configs were already written
    # or commands were already sent to some machines.
    for option, requested, pool in (("--num_replicas", num_replicas, replica_ips),
                                    ("--num_proxies", num_proxies, proxy_ips),
                                    ("--num_clients", num_clients, client_ips)):
        if requested > len(pool):
            parser.error("{} is {} but only {} IP addresses are configured".format(
                option, requested, len(pool)))

    replica_ips = replica_ips[0:num_replicas]
    proxy_ips = proxy_ips[0:num_proxies]
    client_ips = client_ips[0:num_clients]

    replica_name_list = [TAG+"-replica-"+str(i) for i in range(num_replicas) ]
    proxy_name_list = [ TAG+"-proxy-"+str(i) for i in range(num_proxies) ]
    client_name_list = [ TAG+"-client-"+str(i) for i in range(num_clients) ]

    vm_ips = replica_ips + proxy_ips + client_ips
    vm_name_list = replica_name_list + proxy_name_list + client_name_list

    replica_vm_type = "n1-standard-16"
    proxy_vm_type = "n1-standard-32"
    client_vm_type = "n1-standard-4"

    binary_path = "{login_path}/nezhav2/bazel-bin/".format(login_path = LOGIN_PATH)

    config_path = "{login_path}/nezhav2/configs".format(login_path = LOGIN_PATH)

    yaml = ruamel.yaml.YAML()
    # The same indentation is used for every config file that is dumped.
    yaml.indent(sequence=4, offset=2)


    # for i in range(num_replicas):
    #     create_instance(instance_name = replica_name_list[i],
    #                     image= "opensource-nezha",
    #                     machine_type =  replica_vm_type,
    #                     customzedZone="us-central1-a",
    #                     customzedIp = replica_ips[i] )
    #     print(colored("Created "+replica_name_list[i], "green", attrs=['bold']))


    # for i in range(num_proxies):
    #     create_instance(instance_name = proxy_name_list[i],
    #                     image= "opensource-nezha",
    #                     machine_type =  proxy_vm_type,
    #                     customzedZone="us-central1-a",
    #                     customzedIp = proxy_ips[i] )
    #     print(colored("Created "+proxy_name_list[i], "green", attrs=['bold']))


    # for i in range(num_clients):
    #     create_instance(instance_name = client_name_list[i],
    #                     image= "opensource-nezha",
    #                     machine_type =  client_vm_type,
    #                     customzedZone="us-central1-a",
    #                     customzedIp = client_ips[i] )
    #     print(colored("Created "+client_name_list[i], "green", attrs=['bold']))


    # time.sleep(120)
    # for i in range(len(vm_ips)):
    #     start_ttcs_node(vm_ips[i],False)
    # exit(0)

    #### del_instance_list(instance_list=vm_name_list)


    # stop_instance_list(instance_list = vm_name_list)
    # exit(0)


    # start_instance_list(instance_list = vm_name_list)
    # time.sleep(60)
    # print(vm_ips)
    # launch_ttcs(vm_ips)
    # exit(0)


    # Generate configs
    # Every file is handled through 'with' so that it is flushed and closed
    # before the copy step below reads it from disk.
    for i in range(num_replicas):
        config_template = "{config_path}/nezha-replica-config-template.yaml".format(config_path = config_path)
        config_file =  "{config_path}/nezha-replica-config-{idx}.yaml".format(config_path=config_path, idx =i)
        with open(config_template, "r") as f:
            yaml_data = yaml.load(f)
        yaml_data["replica-id"] = i
        yaml_data["replica-ips"] = replica_ips
        with open(config_file, "w") as out_file:
            yaml.dump(yaml_data, out_file)


    for i in range(num_proxies):
        config_template = "{config_path}/nezha-proxy-config-template.yaml".format(config_path = config_path)
        config_file =  "{config_path}/nezha-proxy-config-{idx}.yaml".format(config_path=config_path, idx =i+1)
        with open(config_template, "r") as f:
            yaml_data = yaml.load(f)
        yaml_data["proxy-info"]["proxy-id"] = i + 1
        yaml_data["proxy-info"]["proxy-ip"] = proxy_ips[i]
        yaml_data["replica-info"]["replica-ips"] = replica_ips
        with open(config_file, "w") as out_file:
            yaml.dump(yaml_data, out_file)


    for i in range(num_clients):
        config_template = "{config_path}/nezha-client-config-template.yaml".format(config_path = config_path)
        config_file =  "{config_path}/nezha-client-config-{idx}.yaml".format(config_path = config_path, idx= i+1)

        with open(config_template, "r") as f:
            yaml_data = yaml.load(f)
        yaml_data["proxy-info"]["proxy-ips"] = proxy_ips
        yaml_data["client-info"]["client-id"] = i+1
        yaml_data["client-info"]["client-ip"] = client_ips[i]
        with open(config_file, "w") as out_file:
            yaml.dump(yaml_data, out_file)


    # Copy config
    # Each machine only receives its own config, at the same path as locally.
    for i in range(num_replicas):
        config_file =  "{config_path}/nezha-replica-config-{idx}.yaml".format(config_path=config_path, idx =i)
        scp_files([replica_ips[i]], config_file, config_file, to_remote = True)

    for i in range(num_proxies):
        config_file =  "{config_path}/nezha-proxy-config-{idx}.yaml".format(config_path=config_path, idx =i+1)
        scp_files([proxy_ips[i]], config_file, config_file, to_remote = True)

    for i in range(num_clients):
        config_file =  "{config_path}/nezha-client-config-{idx}.yaml".format(config_path = config_path, idx= i+1)
        scp_files([client_ips[i]], config_file, config_file, to_remote = True)

    # exit(0)

    # Wipe the remote bazel-bin contents and recreate the per-role folders
    # before the binaries are copied over.
    remote_path = "{login_path}/nezhav2/bazel-bin/*".format(login_path = LOGIN_PATH)
    rm_cmd = "sudo rm -rf {remote_path}".format(remote_path=remote_path)
    run_command(vm_ips, rm_cmd, in_background=False)

    mkdir_cmd = "mkdir -p {binary_path}/replica".format(binary_path = binary_path)
    run_command(vm_ips, mkdir_cmd, in_background=False)

    mkdir_cmd = "mkdir -p {binary_path}/proxy".format(binary_path = binary_path)
    run_command(vm_ips, mkdir_cmd, in_background=False)

    mkdir_cmd = "mkdir -p {binary_path}/client".format(binary_path = binary_path)
    run_command(vm_ips, mkdir_cmd, in_background=False)

    # All three binaries are copied to every machine, regardless of role.
    binary_file = "{binary_path}/client/nezha_client".format(binary_path=binary_path)
    scp_files(vm_ips, binary_file, binary_file, to_remote = True)

    binary_file = "{binary_path}/replica/nezha_replica".format(binary_path=binary_path)
    scp_files(vm_ips, binary_file, binary_file, to_remote = True)

    binary_file = "{binary_path}/proxy/nezha_proxy".format(binary_path=binary_path)
    scp_files(vm_ips, binary_file, binary_file, to_remote = True)


    # Kill existing procs
    kill_cmd = "sudo pkill -9 replica"
    run_command(vm_ips, kill_cmd, in_background=False)
    kill_cmd = "sudo pkill -9 proxy"
    run_command(vm_ips, kill_cmd, in_background=False)
    kill_cmd = "sudo pkill -9 client"
    run_command(vm_ips, kill_cmd, in_background=False)


    ## Launch replicas (id starts from 0)
    for i in range(num_replicas):
        replica_cmd = "{binary_path}/replica/nezha_replica --config {config_path}/nezha-replica-config-{idx}.yaml > {log_file} 2>&1 &".format(
            binary_path = binary_path,
            config_path = config_path,
            idx  =i,
            log_file = "replica-log-"+str(i)
        )
        print(colored(replica_cmd, "yellow", attrs=['bold']))
        run_command([replica_ips[i]], replica_cmd, in_background=False)

    # input("stop...")
    # Launch proxies (id starts from 1)
    for i in range(num_proxies):
        proxy_cmd = "{binary_path}/proxy/nezha_proxy --config {config_path}/nezha-proxy-config-{idx}.yaml  > {log_file} 2>&1 &".format(
            binary_path = binary_path,
            config_path = config_path,
            idx = i+1,
            log_file = "proxy-log-"+str(i+1)
        )
        print(colored(proxy_cmd, "yellow", attrs=['bold']))
        run_command([proxy_ips[i]], proxy_cmd, in_background = False)


    # Launch clients (id starts from 1)
    for i in range(num_clients):
        client_cmd = "{binary_path}/client/nezha_client --config {config_path}/nezha-client-config-{idx}.yaml >{log_file} 2>&1 &".format(
            binary_path = binary_path,
            config_path = config_path,
            idx = i+1,
            log_file = "client-log-"+str(i+1)
        )
        print(colored(client_cmd, "yellow", attrs=['bold']))
        run_command([client_ips[i]], client_cmd, in_background = True)


    # Fixed pause before the stats files are collected.
    print("Sleep...")
    time.sleep(90)

    # Copy Stats File
    folder_name = "stats"
    stats_folder = "{login_path}/{folder_name}".format(
        login_path = LOGIN_PATH,
        folder_name = folder_name
    )
    # Local folder that receives the stats files fetched below.
    mkdir_cmd = "sudo mkdir -p -m 777 {stats_folder}".format(stats_folder = stats_folder)
    os.system(mkdir_cmd)

    for i in range(num_clients):
        file_name = "Client-Stats-"+str(i+1)
        local_file_path = "{stats_folder}/{file_name}".format(
            stats_folder = stats_folder,
            file_name = file_name
        )
        remote_path = "{stats_folder}/{file_name}".format(
            stats_folder = LOGIN_PATH,
            file_name = file_name
        )
        scp_files([client_ips[i]], local_file_path, remote_path, to_remote=False)


    for i in range(num_proxies):
        file_name = "Proxy-Stats-"+str(i+1)+".csv"
        local_file_path = "{stats_folder}/{file_name}".format(
            stats_folder = stats_folder,
            file_name = file_name
        )
        remote_path = "{stats_folder}/{file_name}".format(
            stats_folder = LOGIN_PATH,
            file_name = file_name
        )
        scp_files([proxy_ips[i]], local_file_path, remote_path, to_remote=False)

================================================
FILE: scripts/local_test.sh
================================================
#!/bin/bash
# Local smoke test: starts three replicas and one proxy in the background,
# runs one client in the foreground, then checks the client's stats file.
#
# Usage: scripts/local_test.sh [--github]
#   --github   exit 0 right after a successful check and leave the
#              background processes running.
export FLAGS_alsologtostderr=1

echo "Launching replica 0..."
(./bazel-bin/replica/nezha_replica --config ./configs/local/nezha-replica-config-0.yaml &)

echo "Launching replica 1..."
(./bazel-bin/replica/nezha_replica --config ./configs/local/nezha-replica-config-1.yaml  &)

echo "Launching replica 2..."
(./bazel-bin/replica/nezha_replica --config ./configs/local/nezha-replica-config-2.yaml &)

echo "Launching proxy..."
(./bazel-bin/proxy/nezha_proxy --config ./configs/local/nezha-proxy-config.yaml &)

# The client runs in the foreground; the checks below start once it returns.
echo "Launching client..."
./bazel-bin/client/nezha_client --config ./configs/local/nezha-client-config.yaml


# TODO(Katie): This is currently only checking if at least one request succeeded. 
# It does not check if the client/replica/proxy failed for some reason
file="Client-Stats-1"
if [ -e "$file" ]; then
    line_count=$(wc -l < "$file")
    # A file with at most one line is treated as "no successful requests".
    if [ "$line_count" -le 1 ]; then
        echo "File '$file' exists but has only one line."
        echo "No successful requests."
        exit 1 
    else
        echo "Success. File '$file' exists and has more than one line."
    fi
else
    echo "File '$file' does not exist."
    exit 1
fi


# Exit gracefully for github actions. It's okay if there are stray replica processes.
if [[ "$1" == "--github" ]]; then
    exit 0 
fi

# Kill replicas
# The trap fires on the script's own EXIT right below: 'kill 0' signals the
# whole process group, and the SIGTERM handler is reset first so the signal
# does not re-trigger this trap.
trap 'trap - SIGTERM && kill 0' SIGINT SIGTERM EXIT

================================================
FILE: scripts/ttcs-agent.cfg
================================================
management_address: "10.128.2.15"
log_dir: "/var/opt/ttcs/log"
subscription_mode: true
coordinator_address: "c-gjk1994gjk1994-c89e.gcp.clockwork.io"
coordinator_subscription_service_port: 6176
probe_address: "10.128.2.15"
clock_quality: 1
correct_clock: true

================================================
FILE: third_party/concurrentqueue/BUILD.bazel
================================================
# Header-only library: the header is listed in `hdrs` so that it is published
# to dependent targets (headers in `srcs` are private to the rule itself).
cc_library(
    name = "concurrentqueue",
    hdrs = ["concurrentqueue.h"],
    visibility = [
        "//visibility:public",
    ],
)


================================================
FILE: third_party/glog/BUILD.bazel
================================================


================================================
FILE: third_party/glog/BUILD.glog
================================================
load("@//third_party/glog:glog.bzl", "glog_library")

licenses(["notice"])

# All targets of this package are declared by the glog_library macro; the
# positional argument is the macro's `name` parameter.
glog_library("")


================================================
FILE: third_party/glog/glog.bzl
================================================
"""glog library build rule."""

load("@rules_cc//cc:defs.bzl", "cc_library")

def glog_library(name, namespace = "google", with_gflags = 1):
    """Implement a macro glog_library() that the BUILD file can load.

    By default, glog is built with gflags support.  You can change this behavior
    by using glog_library(with_gflags=0)

    This file is inspired by the following sample BUILD files:
        https://github.com/google/glog/issues/61
        https://github.com/google/glog/files/393474/BUILD.txt

    Args:
        name: The name of the rule (this is not used; it only exists to silence
            the linter).
        namespace: Namespace to use.
        with_gflags: Build with gflags support.
    """

    # Generated files live under $(GENDIR)/external/<repo> unless this macro is
    # evaluated in the main repository (whose repository name is "@").
    if native.repository_name() != "@":
        gendir = "$(GENDIR)/external/" + native.repository_name().lstrip("@")
    else:
        gendir = "$(GENDIR)"

    # The library itself: checked-in sources plus the headers produced by the
    # genrules declared further down (":config_h", ":logging_h", ...).
    cc_library(
        name = "glog",
        visibility = ["//visibility:public"],
        srcs = [
            ":config_h",
            "src/base/commandlineflags.h",
            "src/base/googleinit.h",
            "src/base/mutex.h",
            "src/demangle.cc",
            "src/demangle.h",
            "src/logging.cc",
            "src/raw_logging.cc",
            "src/signalhandler.cc",
            "src/stacktrace.h",
            "src/stacktrace_generic-inl.h",
            "src/stacktrace_libunwind-inl.h",
            "src/stacktrace_powerpc-inl.h",
            "src/stacktrace_windows-inl.h",
            "src/stacktrace_x86-inl.h",
            "src/stacktrace_x86_64-inl.h",
            "src/symbolize.cc",
            "src/symbolize.h",
            "src/utilities.cc",
            "src/utilities.h",
            "src/vlog_is_on.cc",
        ],
        hdrs = [
            ":logging_h",
            ":raw_logging_h",
            ":stl_logging_h",
            ":vlog_is_on_h",
            "src/glog/log_severity.h",
        ],
        # Public headers are included without the leading "src/".
        strip_include_prefix = "src",
        copts = [
            # Disable warnings that exists in glog.
            "-Wno-sign-compare",
            "-Wno-unused-function",
            "-Wno-unused-local-typedefs",
            "-Wno-unused-variable",
            "-DGLOG_BAZEL_BUILD",
            # Inject a C++ namespace.
            "-DGOOGLE_NAMESPACE='%s'" % namespace,
            # Allows src/base/mutex.h to include pthread.h.
            "-DHAVE_PTHREAD",
            # Allows src/logging.cc to determine the host name.
            "-DHAVE_SYS_UTSNAME_H",
            # For src/utilities.cc.
            "-DHAVE_SYS_SYSCALL_H",
            "-DHAVE_SYS_TIME_H",
            "-DHAVE_STDINT_H",
            "-DHAVE_STRING_H",
            # Enable dumping stacktrace upon sigaction.
            "-DHAVE_SIGACTION",
            # For logging.cc.
            "-DHAVE_PREAD",
            "-DHAVE___ATTRIBUTE__",

            # Enable UNISTD_H for symlinking.
            "-DHAVE_UNISTD_H",

            # For stacktrace dumping.
            "-DHAVE_UNWIND_H",

            # Include generated header files.
            "-I%s/glog_internal" % gendir,
        ] + ([
            # Use gflags to parse CLI arguments.
            "-DHAVE_LIB_GFLAGS",
        ] if with_gflags else []),
        # gflags is only a dependency when gflags support is requested.
        deps = [
            "@com_github_gflags_gflags//:gflags",
        ] if with_gflags else [],
    )

    # Writes gen.sh: a sed filter that replaces the @ac_...@ placeholders of
    # the *.h.in templates. It is used by the per-header genrules below.
    # NOTE(review): the sed script hard-codes the namespace "google" and sets
    # @ac_cv_have_libgflags_h@ to 1; it does not look at the `namespace` or
    # `with_gflags` arguments - confirm this is intended for non-default use.
    native.genrule(
        name = "gen_sh",
        outs = [
            "gen.sh",
        ],
        cmd = r'''\
#!/bin/sh
cat > $@ <<"EOF"
sed -e 's/@ac_cv_cxx_using_operator@/1/g' \
    -e 's/@ac_cv_have_unistd_h@/1/g' \
    -e 's/@ac_cv_have_stdint_h@/1/g' \
    -e 's/@ac_cv_have_systypes_h@/1/g' \
    -e 's/@ac_cv_have_libgflags_h@/1/g' \
    -e 's/@ac_cv_have_uint16_t@/1/g' \
    -e 's/@ac_cv_have___builtin_expect@/1/g' \
    -e 's/@ac_cv_have_.*@/0/g' \
    -e 's/@ac_google_start_namespace@/namespace google {/g' \
    -e 's/@ac_google_end_namespace@/}/g' \
    -e 's/@ac_google_namespace@/google/g' \
    -e 's/@ac_cv___attribute___noinline@/__attribute__((noinline))/g' \
    -e 's/@ac_cv___attribute___noreturn@/__attribute__((noreturn))/g' \
    -e 's/@ac_cv___attribute___printf_4_5@/__attribute__((__format__ (__printf__, 4, 5)))/g'
EOF
''',
    )

    # Produces glog_internal/config.h from the CMake template by turning every
    # line that starts with "#cmakedefine" into a "//cmakedefine" comment.
    native.genrule(
        name = "config_h",
        srcs = [
            "src/config.h.cmake.in",
        ],
        outs = [
            "glog_internal/config.h",
        ],
        cmd = "awk '{ gsub(/^#cmakedefine/, \"//cmakedefine\"); print; }' $< > $@",
    )

    # One genrule per public header: src/glog/<f>.h.in is piped through gen.sh
    # to produce src/glog/<f>.h (targets "<f>_h", referenced in hdrs above).
    for f in ["vlog_is_on", "stl_logging", "raw_logging", "logging"]:
        native.genrule(
            name = "%s_h" % f,
            srcs = [
                "src/glog/%s.h.in" % f,
            ],
            outs = [
                "src/glog/%s.h" % f,
            ],
            cmd = "$(location :gen_sh) < $< > $@",
            tools = [":gen_sh"],
        )


================================================
FILE: third_party/junction/BUILD.bazel
================================================
load("@rules_foreign_cc//foreign_cc:cmake.bzl", "cmake")

# Every file of this package, bundled as the source tree handed to the
# cmake rule below via lib_source.
filegroup(
    name = "all_srcs",
    srcs = glob(
        ["**"],
    ),
    visibility = [
        "//visibility:public",
    ],
)

# CMake build of junction. The turf sources are supplied as `data`, and the
# build is declared to produce both static archives listed below.
cmake(
    name = "libjunction",
    data = [
        "@com_github_preshing_turf//:all_srcs",
    ],
    lib_source = ":all_srcs",
    out_static_libs = [
        "libjunction.a",
        "libturf.a",
    ],
    visibility = [
        "//visibility:public",
    ],
)


================================================
FILE: third_party/junction/junction.patch
================================================
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 93cf495..686aa50 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -30,8 +30,9 @@ list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules")
 
 # Add turf targets and import its macros since we use them below
 get_filename_component(outerPath "${CMAKE_CURRENT_LIST_DIR}/.." ABSOLUTE)
-set(TURF_ROOT "${outerPath}/turf" CACHE STRING "Path to Turf")
+set(TURF_ROOT "${outerPath}/com_github_preshing_turf" CACHE STRING "Path to Turf")
 include("${TURF_ROOT}/cmake/Macros.cmake")
+message(TURF_ROOT="${TURF_ROOT}")
 if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
     # If this is the root project, apply build settings here so that
     # they're applied to all targets

================================================
FILE: third_party/libev/BUILD.bazel
================================================
load("@rules_foreign_cc//foreign_cc:configure.bzl", "configure_make")

# Every file of this package, bundled as the source tree handed to the
# configure_make rule below via lib_source.
filegroup(
    name = "all_srcs",
    srcs = glob(
        ["**"],
    ),
    visibility = [
        "//visibility:public",
    ],
)

# configure/make build of libev from the sources collected in :all_srcs.
configure_make(
    name = "libev",
    lib_source = ":all_srcs",
    visibility = [
        "//visibility:public",
    ],
)


================================================
FILE: third_party/openssl/BUILD.bazel
================================================
load("@rules_foreign_cc//foreign_cc:configure.bzl", "configure_make")

# Every file of this package, bundled as the source tree handed to the
# configure_make rule below via lib_source.
filegroup(
    name = "all_srcs",
    srcs = glob(
        ["**"],
    ),
    visibility = [
        "//visibility:public",
    ],
)

# configure/make build of OpenSSL: uses the "config" script with the
# "no-shared" option and declares the two static archives as outputs.
configure_make(
    name = "openssl",
    configure_command = "config",
    configure_options = ["no-shared"],
    lib_source = ":all_srcs",
    out_static_libs = [
        "libssl.a",
        "libcrypto.a",
    ],
    visibility = [
        "//visibility:public",
    ],
)


================================================
FILE: third_party/turf/BUILD.bazel
================================================
load("@rules_foreign_cc//foreign_cc:cmake.bzl", "cmake")

# Every file of this package, exposed publicly as a single target.
filegroup(
    name = "all_srcs",
    srcs = glob(
        ["**"],
    ),
    visibility = [
        "//visibility:public",
    ],
)


================================================
FILE: ttcs-agent.cfg
================================================
management_address: "10.128.2.13"
log_dir: "/var/opt/ttcs/log"
subscription_mode: true
coordinator_address: "c-gjk1994gjk1994-c89e.gcp.clockwork.io"
coordinator_subscription_service_port: 6176
probe_address: "10.128.2.13"
clock_quality: 1
correct_clock: true