Repository: ease-lab/Hermes
Branch: master
Commit: 949229c23881
Files: 56
Total size: 412.5 KB

Directory structure:
gitextract_es11wv71/

├── .clang-format
├── .gitignore
├── AUTHORS
├── CMakeLists.txt
├── LICENSE
├── README.md
├── bin/
│   ├── copy-exec-files.sh
│   ├── copy-n-exec-hermesKV.sh
│   ├── copy-n-exec-rCRAQ.sh
│   ├── copy-traces.sh
│   ├── csv_latency_parser.py
│   ├── exec-derecho.sh
│   ├── format.sh
│   ├── get-system-xput-files.sh
│   ├── setup.sh
│   └── trace-spliter.sh
├── exec/
│   ├── Makefile
│   ├── hosts.sh
│   ├── results/
│   │   ├── latency/
│   │   │   └── .gitinclude
│   │   └── xput/
│   │       ├── all-nodes/
│   │       │   └── .gitkeep
│   │       └── per-node/
│   │           └── .gitkeep
│   ├── run-hades.sh
│   ├── run-hermesKV.sh
│   ├── run-rCRAQ.sh
│   └── run.sh
├── include/
│   ├── hades/
│   │   └── hades.h
│   ├── hermes/
│   │   ├── config.h
│   │   ├── inline-util.h
│   │   ├── spacetime.h
│   │   └── util.h
│   ├── mica-herd/
│   │   ├── city.h
│   │   ├── hrd.h
│   │   ├── mica.h
│   │   └── sizes.h
│   ├── utils/
│   │   ├── bit_vector.h
│   │   ├── concur_ctrl.h
│   │   └── time_rdtsc.h
│   └── wings/
│       ├── wings.h
│       └── wings_api.h
├── src/
│   ├── CR/
│   │   ├── crKV.c
│   │   └── cr_worker.c
│   ├── hades/
│   │   ├── hades.c
│   │   └── test.c
│   ├── hermes/
│   │   ├── hermesKV.c
│   │   ├── hermes_worker.c
│   │   ├── main.c
│   │   ├── spacetime.c
│   │   ├── stats.c
│   │   └── util.c
│   ├── mica-herd/
│   │   ├── city.c
│   │   ├── herd.c
│   │   └── mica.c
│   └── wings/
│       └── wings.c
└── tla/
    ├── Hermes.tla
    ├── HermesRMWs.tla
    └── README.md

================================================
FILE CONTENTS
================================================

================================================
FILE: .clang-format
================================================
﻿---
# clang-format configuration for the C sources of this repository.
# Every option below overrides (or restates) the Chromium preset it is based on.
BasedOnStyle: Chromium
# --- Alignment ---
AlignAfterOpenBracket: Align
AlignConsecutiveDeclarations: 'false'
AlignEscapedNewlines: Left
AlignOperands: 'true'
# --- Short constructs allowed on a single line ---
AllowShortFunctionsOnASingleLine: All
AllowShortIfStatementsOnASingleLine: WithoutElse
# --- Line breaking and argument packing ---
AlwaysBreakAfterDefinitionReturnType: TopLevel
AlwaysBreakTemplateDeclarations: 'Yes'
BinPackArguments: 'true'
BinPackParameters: 'true'
BreakBeforeBraces: WebKit
CompactNamespaces: 'false'
Cpp11BracedListStyle: 'true'
IndentWrappedFunctionNames: 'false'
Language: Cpp
NamespaceIndentation: None
# --- Spacing ---
SpaceAfterTemplateKeyword: 'true'
SpaceBeforeAssignmentOperators: 'true'
SpaceBeforeCpp11BracedList: 'true'
SpaceBeforeParens: ControlStatements
SpaceInEmptyParentheses: 'false'
SpacesInAngles: 'false'
SpacesInParentheses: 'false'
SpacesInSquareBrackets: 'false'
# Indent with spaces only.
UseTab: Never

...


================================================
FILE: .gitignore
================================================
# ignore temporary files
.*.swp
\#*#
*.pyc
*.o
*.hi
*.dump
*.log
*.rej
*.orig
*.patch
*.diff
.tags*

# ignore executables
/src/mica/test
/src/libhrd/main
/src/herd-hybrid/main
/src/herd-UD/main
src/Armonia/main
/src/CR/cr
/src/hermes/hermes
/src/hades/hades
/src/hermes/hermes-wings

# ignore debug files
/debug/*.txt
# ignore traces
/traces/*.txt
# ignore ide files
/.idea/
/cmake-build-debug/
/src/cmake-build-debug/
/src/.idea/
/src/cache/cmake-build-debug/
/src/cache/.idea/
/src/Armonia/armonia-ec
/src/Armonia/armonia-sc
/src/Armonia/throughput.txt
/src/herd-UD/throughput.txt
/bin/traces
#/results/*
/exec/results/*.txt
/exec/results/xput/*.txt
/exec/results/xput/*.csv
/exec/results/xput/per-node/*.csv
/exec/results/xput/per-node/*.txt
/exec/results/xput/all-nodes/*.txt
/exec/results/latency/*.txt
/exec/results/latency/*.csv
/results/*.txt
/results/xput/*.txt
/results/xput/*.csv
/results/xput/per-node/*.csv
/results/xput/per-node/*.txt
/results/xput/all-nodes/*.txt
/results/latency/*.txt
/results/latency/*.csv
traces/trace-parts/*
/results/scattered-results/*
/results/aggregated-system-results/*.csv
/traces/system-traces/*.txt
/traces/current-splited-traces/*.txt
/traces/*.txt
traces/
# Executables built in exec/. A pattern must not start with "./": git never
# matches such a pattern, whereas a leading "/" anchors it to this directory.
/exec/hermesKV
/exec/rCRAQ
/exec/hades


================================================
FILE: AUTHORS
================================================
Run `git shortlog -se` for an up-to-date list of contributors.
---

Principal authors: Antonios Katsarakis  <antonios.katsarakis AT ed.ac.uk>
		   Vasilis  Gavrielatos <vasilis.gavrielatos AT ed.ac.uk>


================================================
FILE: CMakeLists.txt
================================================
######################################################################################
# WARNING: DO NOT BUILD through cmake; use the Makefile in /exec/ to compile instead!
######################################################################################

# CMAKE_C_STANDARD and the Threads::Threads imported target were both introduced in
# CMake 3.1; under the previous 2.8.12 floor the C11 request was silently ignored.
cmake_minimum_required(VERSION 3.1)

# Every source in this project is C, so do not require a C++ compiler.
project(hermes C)

set(Hermes_VERSION_MAJOR 1)
set(Hermes_VERSION_MINOR 0)

set(CMAKE_C_STANDARD 11)

# Provides Threads::Threads, the portable replacement for linking "pthread" by name.
find_package(Threads REQUIRED)

# Header search path shared by all executables. Only directories that exist under
# include/ are listed; the system include directory is searched by the compiler
# already and must not be hard-coded.
set(HERMES_INCLUDE_DIRS
        "${CMAKE_CURRENT_SOURCE_DIR}/include/hermes"
        "${CMAKE_CURRENT_SOURCE_DIR}/include/mica-herd"
        "${CMAKE_CURRENT_SOURCE_DIR}/include/wings"
        "${CMAKE_CURRENT_SOURCE_DIR}/include/utils"
        "${CMAKE_CURRENT_SOURCE_DIR}/include/hades")

# Libraries every executable links against (RDMA verbs, memcached client, NUMA, ...).
set(HERMES_LINK_LIBS
        Threads::Threads
        ibverbs
        rt
        memcached
        numa
        rdmacm)

# Chain-replication baseline (target "cr").
# Headers are listed next to the sources so that IDEs index them with the target.
set(SOURCE_FILES_cr
        ##### source files ####
        src/CR/crKV.c
        src/CR/cr_worker.c

        src/wings/wings.c

        src/hermes/main.c
        src/hermes/stats.c
        src/hermes/spacetime.c

        src/mica-herd/mica.c
        src/mica-herd/city.c
        src/mica-herd/herd.c

        ##### header files ####
        include/wings/wings.h
        include/wings/wings_api.h

        include/mica-herd/city.h
        include/mica-herd/hrd.h
        include/mica-herd/sizes.h

        include/hermes/util.h
        include/hermes/config.h
        include/utils/bit_vector.h
        include/utils/concur_ctrl.h)

# Hades (target "hades").
set(SOURCE_FILES_hades
        ##### source files ####
        src/wings/wings.c
        src/hades/hades.c
        src/hades/test.c

        ##### header files ####
        include/wings/wings_api.h
        include/wings/wings.h
        include/hades/hades.h)

# HermesKV (target "hermes").
set(SOURCE_FILES_hermes
        ##### source files ####
        src/hermes/main.c
        src/hermes/util.c
        src/hermes/hermesKV.c
        src/hermes/hermes_worker.c
        src/hermes/stats.c
        src/hermes/spacetime.c
        src/mica-herd/herd.c
        src/mica-herd/mica.c
        src/mica-herd/city.c

        src/wings/wings.c

        ##### header files ####
        include/mica-herd/hrd.h
        include/mica-herd/city.h
        include/mica-herd/sizes.h

        include/hermes/util.h
        include/hermes/config.h
        include/utils/concur_ctrl.h
        include/utils/bit_vector.h
        include/hades/hades.h
        include/wings/wings.h
        include/wings/wings_api.h)

add_executable(cr ${SOURCE_FILES_cr})
add_executable(hades ${SOURCE_FILES_hades})
add_executable(hermes ${SOURCE_FILES_hermes})

# Apply the shared usage requirements per target instead of mutating
# directory-wide state (include_directories / CMAKE_C_FLAGS).
foreach(hermes_target cr hades hermes)
    target_include_directories(${hermes_target} PRIVATE ${HERMES_INCLUDE_DIRS})
    target_compile_options(${hermes_target} PRIVATE -Wall)
    target_link_libraries(${hermes_target} PRIVATE ${HERMES_LINK_LIBS})
endforeach()


================================================
FILE: LICENSE
================================================

                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.


================================================
FILE: README.md
================================================
# Hermes Reliable Replication Protocol

<img align="left" height="160" src="https://github.com/akatsarakis/Hermes/blob/master/hermes-logo.png">

This is the publicly available artifact repository supporting the ASPLOS'20 paper [_"Hermes: A Fast, Fault-Tolerant and Linearizable Replication Protocol"_](http://hermes-protocol.com "Hermes Arxiv version"). The repository contains both code to experimentally evaluate Hermes(KV) and complete Hermes TLA+ specifications which can be used to verify Hermes correctness via model-checking.

[![top picks](https://badgen.net/badge/honorable%20mention/top%20picks%20'20/d99e14)](https://www.sigarch.org/call-contributions/ieee-micro-top-picks/)
[![available](https://badgen.net/badge/acm%20badge/available/117c00)](https://www.acm.org/publications/policies/artifact-review-badging#available)
[![functional](https://badgen.net/badge/acm%20badge/functional/FB1f44)](https://www.acm.org/publications/policies/artifact-review-badging#functional)
[![stars](https://badgen.net/github/stars/ease-lab/Hermes)]()

[![license](https://badgen.net/badge/webpage/Hermes/blue)](http://hermes-protocol.com/)
[![license](https://badgen.net/badge/license/Apache%202.0/blue)](https://github.com/ease-lab/Hermes/blob/master/LICENSE)
[![last commit](https://badgen.net/github/last-commit/ease-lab/Hermes)]()
<a href="https://twitter.com/intent/follow?screen_name=ease_lab" target="_blank">
<img src="https://img.shields.io/twitter/follow/ease_lab?style=social&logo=twitter" alt="follow on Twitter"></a>


## Citation
```
@inbook{Katsarakis:20,
author = {Katsarakis, Antonios and Gavrielatos, Vasilis and Katebzadeh, M.R. Siavash and Joshi, Arpit and Dragojevic, Aleksandar and Grot, Boris and Nagarajan, Vijay},
title = {Hermes: A Fast, Fault-Tolerant and Linearizable Replication Protocol},
year = {2020},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
booktitle = {Proceedings of the Twenty-Fifth International Conference on Architectural Support for Programming Languages and Operating Systems},
pages = {201–217},
numpages = {17}
}
```

----
## High Performance Features
- _Reads_: i) Local ii) Load-balanced (served by any replica)
- _Updates (Writes and RMWs)_: i) Inter-key concurrent ii) Decentralized iii) Fast (1rtt commit -- any replica)
- _Writes_: iv) Non-conflicting (i.e., never abort)

## Consistency and Properties
Linearizable reads, writes and RMWs with the following properties:
1. _Writes_: from a live replica _always commit_ after Invalidating (and getting acknowledgments from) the rest of the live replicas.
1. _RMWs_: at most one of possible concurrent RMWs to a key can commit, and this only once all acknowledgments from live replicas are gathered.
1. _Reads_: return the local value if the targeted keys are found in the Valid state and the coordinator was considered live at the time of reading. The latter can be ensured locally if the coordinator has a lease for (and is part of) the membership.

## Fault Tolerance
By coupling Invalidations with per-key logical timestamps (i.e., Lamport clocks) and propagating the value to be updated with the invalidation message (_early value propagation_), Hermes allows any replica blocked by an update (write or RMW) to safely replay the update and unblock itself and the rest of the followers.

----

## Hardware dependencies

A homogeneous cluster of x86_64 nodes interconnected via RDMA network cards and switches
(tested on "Mellanox ConnectX-4" Infiniband infrastructure).


## Software requirements

Linux OS (tested on Ubuntu 18.04 4.15.0-55-generic) with root access.

The software is tested using the following version of Mellanox OFED RDMA drivers
`MLNX_OFED_LINUX-4.4-2.0.7.0`.

Third-party libraries that you will require to run the experiments include:
1. _parallel_ (Cluster management scripts only)
1. _libmemcached-dev_ (used to exchange QP information for the setup of RDMA connections)
1. _libnuma-dev_	(for mbind)


## Setup

On every node:
1. Install Mellanox OFED ibverbs drivers
1. `./hermes/bin/setup.sh`

On manager (just pick one node in the cluster):
1. Fill variables in `/hermes/exec/hosts.sh`
1. Configure setup and default parameters in `/hermes/include/hermes/config.h`
1. From `/hermes/exec/` compile _hermesKV_ through make
1. scp  _hermesKV_ and the configured hosts.sh in the `/hermes/exec/` directory of all other nodes in the cluster. 


## Compilation

`cd hermes/exec; make`

_Warning_: Do not compile through cmake; instead use the Makefile in exec/ directory.


## Run

Run first on manager:
`./run-hermesKV.sh <experiment_parameters>`

Then run on all other member nodes 
`./run-hermesKV.sh <experiment_parameters>`

> Note that some members will eagerly terminate if the experiment
  uses a smaller number of nodes than specified in hosts.sh.
  
An experiment example for three nodes 12 worker threads and 35% write ratio would be as follows:
`./run-hermesKV.sh -W 12 -w 350 -M 3`
Supported command-line arguments for the experiments are detailed in the run-hermesKV.sh script.


---
## Acknowledgments
 Hermes is based on [HERD/MICA](https://github.com/efficient/HERD "Apache 2.0") design as an underlying KVS, the code of which we have adapted to implement HermesKV.

## Other Implementations of Hermes

- [Odyssey](https://github.com/vasigavr1/Odyssey) - Hermes is also implemented in the Odyssey framework by [Vasilis Gavrielatos](https://github.com/vasigavr1)
- [Olympus](https://github.com/sadraskol/olympus) - in Rust by [Thomas Bracher](https://twitter.com/sadraskol)


## Contact
 Antonios Katsarakis: <a href="http://antonis.io/" title="Personal webpage" target="_blank">`antonis.io`</a> |  [`antoniskatsarakis@yahoo.com`](mailto:antoniskatsarakis@yahoo.com?subject=[GitHub]%20Zeus%20Specification "Email")


================================================
FILE: bin/copy-exec-files.sh
================================================
#!/usr/bin/env bash
#
# Rebuilds the executables in ${HOME}/hermes/exec and copies them, together with
# the run/host scripts, to the same directory on every host in REMOTE_HOSTS
# (REMOTE_HOSTS is defined by the sourced hosts.sh).

FILES_TO_CPY=(
        "hosts.sh"
        "run.sh"
        "run-hermesKV.sh"
        "hermesKV"
        "run-rCRAQ.sh"
        "rCRAQ"
#        "hades"
#        "run-hades.sh"
      )

EXEC_FOLDER="${HOME}/hermes/exec"

# Stop here if the folder is missing: otherwise "make clean" below would run in
# whatever directory the caller happened to be in.
cd "${EXEC_FOLDER}" || { echo "Cannot cd into ${EXEC_FOLDER}" >&2; exit 1; }
# get Hosts
source ../exec/hosts.sh
make clean
if ! make; then
	# Keep going (the default target may build more than what is copied below),
	# but make the failure visible instead of silently shipping old binaries.
	echo "WARNING: make failed; the executables copied below may be stale or missing" >&2
fi
cd - || exit 1

for FILE in "${FILES_TO_CPY[@]}"
do
	# One scp per remote host, run concurrently by GNU parallel ({} is the host).
	parallel scp "${EXEC_FOLDER}/${FILE}" "{}:${EXEC_FOLDER}/${FILE}" ::: $(echo ${REMOTE_HOSTS[@]})
	echo "${FILE} copied to {${REMOTE_HOSTS[@]}}"
done


================================================
FILE: bin/copy-n-exec-hermesKV.sh
================================================
#!/usr/bin/env bash
#
# Distributes the executables and runs HermesKV on this node and on every host in
# REMOTE_HOSTS (from hosts.sh), once per combination of the parameter arrays below.
#
# Usage: copy-n-exec-hermesKV.sh <sudo-password-for-remotes>

### Runs to make
#declare -a write_ratios=(0 10 50 200 500 1000)
declare -a write_ratios=(1000)
declare -a rmw_ratios=(0)
#declare -a num_workers=(5 10 15 20 25 30 36)
declare -a num_workers=(1)
#declare -a batch_sizes=(25 50 75 100 125 150 200 250)
declare -a batch_sizes=(50)
declare -a credits=(50)
#declare -a coalesce=(1 5 10 15)
declare -a coalesce=(15)
#declare -a num_machines=(2 3 5 7)
declare -a num_machines=(5)

# Set LAT_WORKER to -1 to disable latency measurement or to worker id (i.e., from 0 up to [num-worker - 1])
LAT_WORKER="-1"
#LAT_WORKER="0"

EXEC_FOLDER="${HOME}/hermes/exec"

REMOTE_COMMAND="cd ${EXEC_FOLDER}; bash run-hermesKV.sh"

PASS="${1}"
if [ -z "$PASS" ]
then
  echo "\$PASS is empty! --> sudo pass for remotes is expected to be the first arg" >&2
  # Report the usage error through the exit status as well.
  exit 1
fi

echo "\$PASS is OK!"
# Everything below uses paths relative to the exec folder, so a failed cd is fatal.
cd "${EXEC_FOLDER}" || { echo "Cannot cd into ${EXEC_FOLDER}" >&2; exit 1; }

# get Hosts
source ./hosts.sh

../bin/copy-exec-files.sh

# Execute locally and remotely
for M in "${num_machines[@]}"; do
  for RMW in "${rmw_ratios[@]}"; do
    for WR in "${write_ratios[@]}"; do
      for W in "${num_workers[@]}"; do
        for BA in "${batch_sizes[@]}"; do
          for CRD in "${credits[@]}"; do
            for COAL in "${coalesce[@]}"; do
              args=" -M ${M} -R ${RMW} -w ${WR} -W ${W} -b ${BA} -c ${CRD} -C ${COAL} -l ${LAT_WORKER}"
              # printf with a quoted argument passes the password through unmodified
              # (an unquoted echo would word-split and glob-expand it).
              printf '%s\n' "${PASS}" | ./run-hermesKV.sh ${args} &
              sleep 2 # give some leeway so that manager starts before executing the members
              parallel "echo ${PASS} | ssh -tt {} $'${REMOTE_COMMAND} ${args}'" ::: $(echo ${REMOTE_HOSTS[@]}) >/dev/null
            done
          done
        done
      done
    done
  done
done

cd - >/dev/null

../bin/get-system-xput-files.sh


================================================
FILE: bin/copy-n-exec-rCRAQ.sh
================================================
#!/usr/bin/env bash
#
# Distributes the executables and runs rCRAQ on this node and on every host in
# REMOTE_HOSTS (from hosts.sh), once per combination of the parameter arrays below.
#
# Usage: copy-n-exec-rCRAQ.sh <sudo-password-for-remotes>

# 0: take the -c (credits) value from the credits array;
# otherwise: reuse the current batch size as the credits value.
USE_SAME_BATCH_N_CREDITS=0

### Runs to make
declare -a write_ratios=(1000)
#declare -a num_workers=(5 10 15 20 25 30 36)
declare -a num_workers=(1)
#declare -a batch_sizes=(25 50 75 100 125 150 200 250)
declare -a batch_sizes=(50)
declare -a credits=(15) # WARNING credits for CR must be divided by the num_machines (i.e., credits % num_machines == 0)
#declare -a coalesce=(1 5 10 15)
declare -a coalesce=(10)
#declare -a num_machines=(2 3 5 7)
declare -a num_machines=(3)

# Set LAT_WORKER to -1 to disable latency measurement or to worker id (i.e., from 0 up to [num-worker - 1])
LAT_WORKER="-1"
#LAT_WORKER="0"

#LOCAL_HOST=`hostname`
EXEC_FOLDER="${HOME}/hermes/exec"
REMOTE_COMMAND="cd ${EXEC_FOLDER}; bash run-rCRAQ.sh"

PASS="${1}"
if [ -z "$PASS" ]
then
  echo "\$PASS is empty! --> sudo pass for remotes is expected to be the first arg" >&2
  # Report the usage error through the exit status as well.
  exit 1
fi

echo "\$PASS is OK!"
# Everything below uses paths relative to the exec folder, so a failed cd is fatal.
cd "${EXEC_FOLDER}" || { echo "Cannot cd into ${EXEC_FOLDER}" >&2; exit 1; }

# get Hosts
source ./hosts.sh

../bin/copy-exec-files.sh

# Execute locally and remotely
for M in "${num_machines[@]}"; do
  for WR in "${write_ratios[@]}"; do
    for W in "${num_workers[@]}"; do
      for BA in "${batch_sizes[@]}"; do
        # Single loop nest for both modes: only the source of the credits differs.
        if [ ${USE_SAME_BATCH_N_CREDITS} -eq 0 ]
        then
          credit_values=("${credits[@]}")
        else
          credit_values=("${BA}")
        fi
        for CRD in "${credit_values[@]}"; do
          for COAL in "${coalesce[@]}"; do
            args=" -M ${M} -w ${WR} -W ${W} -b ${BA} -c ${CRD} -C ${COAL} -l ${LAT_WORKER}"
            # printf with a quoted argument passes the password through unmodified
            # (an unquoted echo would word-split and glob-expand it).
            printf '%s\n' "${PASS}" | ./run-rCRAQ.sh ${args} &
            sleep 2 # let the local instance start before launching the remote ones
            parallel "echo ${PASS} | ssh -tt {} $'${REMOTE_COMMAND} ${args}'" ::: $(echo ${REMOTE_HOSTS[@]}) >/dev/null
          done
        done
      done
    done
  done
done

cd - >/dev/null

../bin/get-system-xput-files.sh


================================================
FILE: bin/copy-traces.sh
================================================
#!/usr/bin/env bash
#
# Copies the per-thread trace folder(s) from this node to the same location on
# every host in REMOTE_HOSTS (REMOTE_HOSTS is defined by the sourced hosts.sh).

# Copy (per-thread splitted) trace folder
# NOTE(review): .gitignore spells this folder "current-splited-traces" (one "t");
# confirm which spelling the trace-splitting script actually produces.
FOLDERS_TO_CPY=( "traces/current-splitted-traces" )
HOME_FOLDER="${HOME}/hermes"

# hosts.sh is sourced relative to the repository root, so a failed cd is fatal
# (otherwise the wrong file could be sourced and "cd -" would jump elsewhere).
cd "${HOME_FOLDER}" >/dev/null || { echo "Cannot cd into ${HOME_FOLDER}" >&2; exit 1; }
# get Hosts
source ./exec/hosts.sh
cd - >/dev/null || exit 1

for FOLDER in "${FOLDERS_TO_CPY[@]}"
do
	# One recursive scp per remote host, run concurrently by GNU parallel ({} is the host).
	parallel scp -r "${HOME_FOLDER}/${FOLDER}" "{}:${HOME_FOLDER}/${FOLDER}" ::: $(echo ${REMOTE_HOSTS[@]})
	echo "${FOLDER} copied to {${REMOTE_HOSTS[@]}}"
done


================================================
FILE: bin/csv_latency_parser.py
================================================
#!/usr/bin/python

import sys, os, ntpath, getopt

"""
========
Parser for aggregated over time results
========
"""
class LatencyParser:
    """Parse a latency histogram from stdin and print summary statistics.

    Recognised input lines (anything else is ignored):
      reads: <latency>, <count>     one histogram bucket of read requests
      writes: <latency>, <count>    write count of the next bucket (the k-th
                                    'writes' line is added to the k-th bucket;
                                    its latency field is not used)
      reads-hl: <latency>           maximum observed read latency
      writes-hl: <latency>          maximum observed write latency
    Lines starting with '#', blank lines and lines without ':' are skipped.

    A bucket latency of -1 is reported as "> <latency of previous bucket>".

    The code runs unchanged on Python 2 and Python 3 and produces the same
    text as the original Python 2 print statements.
    """

    def __init__(self):
        self.latency_values = []    # latency of every bucket (from 'reads' lines)
        self.reads = []             # read requests per bucket
        self.max_read_latency = 0
        self.max_write_latency = 0
        self.writes = []            # write requests per bucket
        self.all_reqs = []          # reads + writes per bucket
        self.parseInputStats()
        self.printAllStats()

    def printStats(self, array, max_latency):
        """Print average, 50/90/95/99th percentile and max latency of `array`.

        array       -- per-bucket request counts (aligned with latency_values)
        max_latency -- maximum latency to report, in us
        """
        self.avgLatency(array)
        # Any other percentile (including 100) can be added here.
        for percentage in (50, 90, 95, 99):
            self.percentileLatency(array, percentage)
        print("Max Latency:  %s us" % (max_latency,))

    def printAllStats(self):
        """Print the statistics of writes, reads and of all requests."""
        print("~~~~~~ Write Stats ~~~~~~~")
        self.printStats(self.writes, self.max_write_latency)
        print("\n~~~~~~ Read Stats ~~~~~~~~")
        self.printStats(self.reads, self.max_read_latency)
        print("\n~~~~~~ Overall Stats ~~~~~~~~~")
        self.printStats(self.all_reqs, max(self.max_read_latency, self.max_write_latency))

    def avgLatency(self, array):
        """Print the number of requests in `array` and their average latency.

        Buckets are paired with zip(), so a counter array that is shorter than
        latency_values (e.g. no 'writes' lines in the input) is reported as
        "No reqs measured" instead of raising IndexError.
        """
        cumulative = 0
        total_reqs = 0
        # NOTE(review): a -1 (overflow) bucket contributes -1 * count to the
        # sum, exactly as before; confirm whether it should be excluded.
        for latency, count in zip(self.latency_values, array):
            cumulative += latency * count
            total_reqs += count
        if total_reqs > 0:
            # '//' keeps the integer (floor) average Python 2 printed with '/'
            print("Reqs measured:  %s | Avg Latency:  %s" % (total_reqs, cumulative // total_reqs))
        else:
            print("No reqs measured")

    def percentileLatency(self, array, percentage):
        """Print the latency of the bucket at which `percentage` % is reached.

        For percentage == 100 the highest non-empty bucket is reported; the
        search now walks down until it finds one (it used to stop after
        looking at the very last bucket only).
        """
        buckets = list(zip(self.latency_values, array))
        total_reqs = sum(count for _, count in buckets)
        if total_reqs <= 0:
            print("No reqs measured")
            return

        if percentage == 100:
            for x in reversed(range(len(buckets))):
                latency, count = buckets[x]
                if count > 0:
                    if latency == -1:
                        print("%s %%: > %s us" % (percentage, self.latency_values[x - 1]))
                    else:
                        print("%s %%:  %s us" % (percentage, latency))
                    return
        else:
            sum_reqs = 0
            for x, (latency, count) in enumerate(buckets):
                sum_reqs += count
                if ((100.0 * sum_reqs) / total_reqs) >= percentage:
                    if latency == -1:
                        print("%s %%: > %s us" % (percentage, self.latency_values[x - 1]))
                    else:
                        print("%s %% :  %s us" % (percentage, latency))
                    return

    def parseInputStats(self):
        """Fill the histogram arrays and max latencies from standard input."""
        lr_lines = 0  # index of the bucket the next 'writes' line belongs to
        for line in sys.stdin:                  # input from standard input
            line = line.strip()
            # skip blank lines, comments and anything that is not "key: value"
            if not line or line[0] == '#' or ':' not in line:
                continue
            command, words = line.split(":", 1)
            command = command.strip()
            words = [word.strip() for word in words.split(",")]
            if command == 'reads':
                count = int(words[1])
                self.latency_values.append(int(words[0]))
                self.reads.append(count)
                self.all_reqs.append(count)
            elif command == 'writes':
                self.writes.append(int(words[1]))
                self.all_reqs[lr_lines] += self.writes[-1]
                lr_lines += 1
            elif command == 'reads-hl':
                self.max_read_latency = int(words[0])
            elif command == 'writes-hl':
                self.max_write_latency = int(words[0])

if __name__ == "__main__":
    # Script entry point: the constructor parses stdin and prints all stats.
    LatencyParser()


================================================
FILE: bin/exec-derecho.sh
================================================
#!/usr/bin/env bash
# Runs the Derecho bandwidth_test locally and (over ssh) on every other host
# of HOSTS, once for each combination of delivery mode, object size, window
# size and iteration. At the end the last ${total_iters} lines of the
# data_derecho_bw file are printed.
HOSTS=( ##### network  cluster #####
         "houston"
         "sanantonio"
         "austin"
         "indianapolis"
         "philly"
#         "atlanta"
         ##### compute cluster #####
#         "baltimore"
#         "chicago"
#         "detroit"
        )

NUM_NODES=5
NUM_SENDERS=0 #0 - all senders, 1 - half senders, 2 - one sender
REQS_PER_SENDER=10000000

### Runs to make
#declare -a delivery_mode=(0 1) #0 - ordered mode, 1 - unordered mode
#declare -a object_size=(40 1024)
#declare -a window_size=(128 256)
declare -a delivery_mode=(0) #0 - ordered mode, 1 - unordered mode
declare -a object_size=(256 1024)
declare -a window_size=(128 256)
declare -a iterations=(1 2 3 4) #(1 2 3) for 3 iterations

# NUM_NODES is passed to the benchmark, so it must agree with the host list
if [[ $NUM_NODES -ne ${#HOSTS[@]} ]] ; then
    echo "Num_nodes($NUM_NODES) !=  #Hosts(${#HOSTS[@]})"
    exit 1
fi

LOCAL_HOST=$(hostname)
HOME_FOLDER="${HOME}/derecho-unified/Release/applications/tests/performance_tests/"
#pin derecho threads to cores (w/o using hyperthreads) of numa node 0
COMMAND_NO_ARGS="taskset -c 0,2,4,6,8,10,12,14,16,18 ./bandwidth_test "

total_iters=0
# The benchmark is started through a relative path (./bandwidth_test): stop
# here if the folder is missing instead of launching the remote nodes while
# the local one cannot start.
cd "${HOME_FOLDER}" >/dev/null || { echo "Error: cannot cd to ${HOME_FOLDER}" >&2; exit 1; }
# Execute locally and remotely
for del_mode in "${delivery_mode[@]}"; do
  for obj_size in "${object_size[@]}"; do
    for win_size in "${window_size[@]}"; do
      for iter in "${iterations[@]}"; do
        total_iters=$((total_iters + 1))

        args="--DERECHO/max_payload_size=${obj_size} --DERECHO/window_size=${win_size} -- ${NUM_NODES} ${NUM_SENDERS} ${REQS_PER_SENDER} ${del_mode}"
        COMMAND=" ${COMMAND_NO_ARGS} ${args}"

        echo "Running Derecho with: delivery_mode:${del_mode} obj size: $obj_size, window_size: $win_size nodes: $NUM_NODES "
        # local node runs in the background ...
        ${COMMAND} >/dev/null &
        sleep 1

        # ... while the remaining hosts (HOSTS with the local host name removed) run in the foreground
        parallel "ssh -tt {} $'cd ${HOME_FOLDER}; ${COMMAND}'" ::: $(echo ${HOSTS[@]/$LOCAL_HOST}) >/dev/null
        sleep 9 # give local node some leeway to log the results into a file
      done
    done
  done
done
tail -n "${total_iters}" data_derecho_bw
cd - >/dev/null


================================================
FILE: bin/format.sh
================================================
#!/bin/bash
#
# Usage: bin/format.sh [check]
#   (no argument)  apply clang-format in place to every source file
#   check          do not modify anything; exit with 1 if any file would be
#                  changed by clang-format

SCRIPT_DIR="$(dirname "$0")"
# the directories below are relative to the folder of this script
cd "${SCRIPT_DIR}" || exit 1

FORMAT_FILES_IN_DIRECTORIES="../src/ ../include/"
# One pattern for both modes. The check mode used to leave out .c and .h, so
# C sources and headers were formatted by the apply mode but never checked.
FORMAT_FILES_REGEX='.*\.\(c\|h\|cpp\|hpp\|cc\|cxx\)'

clang-format --version > /dev/null || exit 1

if [ "$1" = "check" ]; then # Check clang-format has been applied!
  # -output-replacements-xml prints the pending replacements instead of
  # rewriting the file, hence no -i here.
  find ${FORMAT_FILES_IN_DIRECTORIES} \
    -regex "${FORMAT_FILES_REGEX}" \
    -exec clang-format -style=file -output-replacements-xml {} \; |
    grep -c "<replacement " >/dev/null

  # grep exits with 1 only when no "<replacement " line was found; a match (0)
  # or a grep error (2) both count as a failed check.
  if [ $? -ne 1 ]; then
    echo "Format check: Failed!"
    echo " -- Files do not match clang-format. Run bin/format.sh before adding files to git!"
    exit 1
  else
    echo "Format check: Passed!"
  fi

else # Apply clang-format to all files

  find ${FORMAT_FILES_IN_DIRECTORIES} \
    -regex "${FORMAT_FILES_REGEX}" \
    -exec clang-format -style=file -i {} \;
fi


================================================
FILE: bin/get-system-xput-files.sh
================================================
#!/usr/bin/env bash
# Fetches the per-node throughput (xPut) result files of every remote host and
# sums them into one system-wide result file per file-name prefix.

EXEC_FOLDER="${HOME}/hermes/exec"
RESULTS_FOLDER="${EXEC_FOLDER}/results"

RESULT_FOLDER="${RESULTS_FOLDER}/xput/per-node/"            # where remote nodes keep their files
RESULT_OUT_FOLDER="${RESULTS_FOLDER}/xput/per-node/"        # where they are gathered locally
RESULT_OUT_FOLDER_MERGE="${RESULTS_FOLDER}/xput/all-nodes/" # where merged results are written

# hosts.sh is sourced from within its own folder; it provides REMOTE_HOSTS
cd ${EXEC_FOLDER} >/dev/null
source ./hosts.sh
cd - >/dev/null

# Step 1: pull the result files of all remote hosts
parallel "scp {}:${RESULT_FOLDER}* ${RESULT_OUT_FOLDER} " ::: $(echo ${REMOTE_HOSTS[@]})
echo "xPut result files copied from: {${REMOTE_HOSTS}}"

# Step 2: merge. The awk filter splits each file name on '-' and emits the
# first field whenever a not-yet-seen second field shows up.
ls ${RESULT_OUT_FOLDER} | awk -F '-' '!x[$2]++{print $1}' | while read -r prefix; do
    inter_file=${RESULT_OUT_FOLDER_MERGE}/${prefix}-inter.txt
    merged_file=${RESULT_OUT_FOLDER_MERGE}/${prefix}.txt

    # 3rd line of every file sharing the prefix goes to an intermediate file
    awk 'FNR==3' ${RESULT_OUT_FOLDER}/${prefix}* > ${inter_file}
    # the value after ':' on each of those lines is summed into the final file
    awk -F ':' '{sum += $2} END {print sum}' ${inter_file} > ${merged_file}
    rm -rf  ${inter_file}
done

echo "System-wide xPut results produced in ${RESULT_OUT_FOLDER_MERGE} directory!"


================================================
FILE: bin/setup.sh
================================================
#!/usr/bin/env bash
# Per-node setup; run it on every cluster node once the (Infiniband) Verbs
# drivers have been installed through Mellanox OFED:
# 1. Download the MLNX_OFED (tested on --> MLNX_OFED_LINUX-4.4-2.0.7.0-ubuntu18.04-x86_64)
#    https://www.mellanox.com/page/products_dyn?product_family=26
# 2. tar -xvf the tar file
# 3. install through --> sudo ./mlnxofedinstall

if [ -x "$(command -v ofed_info)" ]; then
    # Driver found: report its version (first line of ofed_info)
    MLNX_OFED_VERSION=$(ofed_info | head -1)
    echo "Running OFED driver version: ${MLNX_OFED_VERSION}" >&2
else
    # Driver missing: print the install instructions and give up
    echo "Error: mellanox ofed is not installed." >&2
    printf '%s\n' \
        " Please install the (Infiniband) Verbs drivers through Mellanox OFED by:" \
        "  1. Download the MLNX_OFED (tested on --> MLNX_OFED_LINUX-4.4-2.0.7.0-ubuntu18.04-x86_64)" \
        "     https://www.mellanox.com/page/products_dyn?product_family=26" \
        "  2. tar -xvf the tar file" \
        "  3. install through --> sudo ./mlnxofedinstall"
    exit 1
fi

# Required libraries: memcached is used to setup the RDMA connections, numa for mbind
sudo apt --yes install libmemcached-dev libnuma-dev memcached

# A subnet manager first (an infiniband subnet needs at least one), then the driver
sudo /etc/init.d/opensmd start
sudo /etc/init.d/openibd start

# (2MB) huge-pages for the KVS, on every numa node.
# This allocation is not permanent: it must be re-applied after a node reboot.
# (8192 instead of 4096 is the alternative that used to be listed here.)
echo 4096 | sudo tee /sys/devices/system/node/node*/hugepages/hugepages-2048kB/nr_hugepages
# Raise the shared memory limits
echo 10000000001 | sudo tee /proc/sys/kernel/shmmax
echo 10000000001 | sudo tee /proc/sys/kernel/shmall


================================================
FILE: bin/trace-spliter.sh
================================================
#!/usr/bin/env bash
# Splits one system-wide trace file into per-thread chunks with split(1):
# MAX_NUM_NODES * MAX_THREADS_PER_NODE chunks of (lines / chunks) lines each,
# named ${OUTPUT_PREFIX}<4-digit number>${OUTPUT_SUFFIX} inside OUTPUT_DIR.
# If the line count is not a multiple of the chunk count, split emits one
# extra (shorter) file holding the remainder.

INPUT_DIR="${HOME}/hermes/traces/system-traces/"
INPUT_FILENAME="simple_trace_w_100000000_k_1000000_a_0.99.txt"
OUTPUT_DIR="${HOME}/hermes/traces/current-splited-traces/"
OUTPUT_PREFIX="t_"
OUTPUT_SUFFIX="_a_0.99.txt"

MAX_NUM_NODES=10
MAX_THREADS_PER_NODE=40

INPUT_FILE="${INPUT_DIR}/${INPUT_FILENAME}"
if [ ! -r "${INPUT_FILE}" ]; then
    echo "Error: cannot read trace file ${INPUT_FILE}" >&2
    exit 1
fi

CHUNKS=$(( MAX_NUM_NODES * MAX_THREADS_PER_NODE ))
# Not called LINES: bash treats LINES as a special variable (terminal rows)
TRACE_LINES=$(wc -l < "${INPUT_FILE}")
LINES_PER_CHUNK=$(( TRACE_LINES / CHUNKS ))

# split rejects a line count of 0, so fail early with a readable message
if [ "${LINES_PER_CHUNK}" -lt 1 ]; then
    echo "Error: trace has only ${TRACE_LINES} lines, fewer than the ${CHUNKS} chunks requested" >&2
    exit 1
fi

# split does not create the output folder on its own
mkdir -p "${OUTPUT_DIR}" || exit 1

echo "Splitting trace with $TRACE_LINES lines into $CHUNKS (per-thread) chunks ..."

split -l "${LINES_PER_CHUNK}" \
      -a 4 -d \
      --additional-suffix="${OUTPUT_SUFFIX}" \
      "${INPUT_FILE}" \
      "${OUTPUT_DIR}/${OUTPUT_PREFIX}"


================================================
FILE: exec/Makefile
================================================
# Flags. No compile rule is written in this file, so the object files come
# from make's built-in rules (which use CPPFLAGS and CFLAGS).
# PROF is defined but not referenced by any rule below.
CPPFLAGS  := -O3 #-Wno-unused-result -Wall -Werror
LD      := gcc -O3 -flto
LDFLAGS := ${LDFLAGS} -libverbs -lrt -lpthread -lmemcached -lnuma # -lrdmacm --> TODO we do not use hw multicast because it helps only on master-based patterns
CFLAGS   =  -I../include/mica-herd -I../include/hermes -I../include/wings -I../include/hades
APPS    := hermesKV rCRAQ
PROF    := -g -fno-omit-frame-pointer

# Default goal: link both applications, then delete the object files
all: ${APPS} clean-o

hermesKV: ../src/wings/wings.o ../src/hades/hades.o \
          ../src/mica-herd/herd.o ../src/mica-herd/mica.o ../src/mica-herd/city.o \
          ../src/hermes/main.o ../src/hermes/hermes_worker.o ../src/hermes/util.o \
          ../src/hermes/stats.o ../src/hermes/spacetime.o ../src/hermes/hermesKV.o
	${LD} -o $@ $^ ${LDFLAGS}


rCRAQ: ../src/mica-herd/herd.o ../src/mica-herd/mica.o \
       ../src/mica-herd/city.o ../src/hermes/main.o ../src/CR/cr_worker.o ../src/CR/crKV.o \
       ../src/hermes/spacetime.o ../src/hermes/util.o ../src/hermes/stats.o  ../src/wings/wings.o
	${LD} -o $@ $^ ${LDFLAGS}


# Links the stand-alone hades test binary (the output file is named "hades")
hades-exec: ../src/hades/hades.o ../src/hades/test.o ../src/wings/wings.o ../src/mica-herd/herd.o
	${LD} -o hades $^ ${LDFLAGS}

hades: hades-exec clean-o

# The special target is ".PHONY" (with the leading dot); a plain "PHONY" is
# just an ordinary target. These goals are commands rather than files to be
# kept up to date, so they must run even if a file of the same name exists.
.PHONY: all clean clean-o hades hades-exec
clean:
	@rm -f ../src/hermes/*.o ../src/mica-herd/*.o ../src/wings/*.o \
	      ../src/CR/*.o ../src/hades/*.o ${APPS} hades

# Removes only the object files, keeping the linked binaries
clean-o:
	@rm -f ../src/hermes/*.o ../src/mica-herd/*.o ../src/wings/*.o \
	      ../src/CR/*.o ../src/hades/*.o

================================================
FILE: exec/hosts.sh
================================================
#!/usr/bin/env bash
# Cluster description, meant to be sourced by the other scripts. It defines
# ALL_IPS, LOCAL_IP, NET_DEVICE_NAME, REMOTE_IPS / REMOTE_HOSTS (space
# separated strings) and NODE_ID (index of LOCAL_IP inside ALL_IPS).
# Because it is sourced, the `exit` below terminates the calling script.


ALL_IPS=(
### TO BE FILLED: Please provide all cluster IPs
    # Node w/ first IP (i.e., "manager") must run script before the rest of the nodes
    # (instantiates a memcached to setup RDMA connections)
    #
        10.0.3.1
        10.0.3.2
        10.0.3.3
        10.0.3.4
        10.0.3.5
        )

### TO BE FILLED: Modify to get the local IP of the node running the script (must be one of the cluster nodes)
LOCAL_IP=$(ip addr | grep 'state UP' -A2 | grep 'inet 10.0.3'| awk '{print $2}' | cut -f1  -d'/')
#LOCAL_IP="129.215.164.2"

### Fill the RDMA device name (the "hca_id" of the device when executing ibv_devinfo)
#NET_DEVICE_NAME="mlx5_0"
NET_DEVICE_NAME="mlx4_0"

##########################################
### NO NEED TO CHANGE BELOW THIS POINT ###
##########################################

# Find the local node id and collect every other IP in one pass.
# Entries are compared as whole strings: the former ${ALL_IPS[@]/$LOCAL_IP}
# removed LOCAL_IP as a substring, so e.g. 10.0.3.1 also mangled 10.0.3.10.
NODE_ID=-1
REMOTE_IPS=""

for i in "${!ALL_IPS[@]}"; do
	if [  "${ALL_IPS[i]}" ==  "$LOCAL_IP" ]; then
		NODE_ID=$i
	else
		REMOTE_IPS="${REMOTE_IPS} ${ALL_IPS[i]}"
	fi
done

REMOTE_HOSTS=${REMOTE_IPS}


if [[ ${NODE_ID} == -1 ]]; then
    echo "Error: Local IP (${LOCAL_IP}) is not in ALL_IPS:" >&2
    echo "    {${ALL_IPS[@]}}" >&2
    # non-zero status, so that callers can tell the setup failed
    exit 1
fi

echo "Local node id:" ${NODE_ID}


================================================
FILE: exec/results/latency/.gitinclude
================================================


================================================
FILE: exec/results/xput/all-nodes/.gitkeep
================================================


================================================
FILE: exec/results/xput/per-node/.gitkeep
================================================


================================================
FILE: exec/run-hades.sh
================================================
#!/usr/bin/env bash

# Common setup (sources hosts.sh, defines blue, restarts memcached)
source run.sh

blue "Running hades"

# Command-line of the hades binary
HADES_ARGS=(
	--machine-id ${NODE_ID}
	--dev-name ${NET_DEVICE_NAME}
)

sudo LD_LIBRARY_PATH=/usr/local/lib/ -E ./hades "${HADES_ARGS[@]}" 2>&1


================================================
FILE: exec/run-hermesKV.sh
================================================
#!/usr/bin/env bash

source run.sh

#### CLI arguments
# Every knob defaults to -1, which selects the default value (#define in
# config.h) whenever the matching option is not given.
NUM_MACHINES="-1"
NUM_WORKERS="-1"
LAT_WORKER="-1"
RMW_RATIO="-1"
WRITE_RATIO="-1"
CREDITS="-1"
MAX_COALESCE="-1"
MAX_BATCH_SIZE="-1"

# getopts string: a letter followed by a colon takes an argument; the leading
# colon selects silent error reporting, i.e. the '\?' (unknown option) and ':'
# (missing argument) cases below are handled by this script.
while getopts ":W:w:l:R:C:c:b:M:h" opt; do
  case $opt in
    # Number of threads: this must be smaller than MAX_WORKERS_PER_MACHINE of config.h
    W) NUM_WORKERS=$OPTARG ;;
    # given number is divided by 10 to give write rate % (i.e., 55 means 5.5 % writes)
    w) WRITE_RATIO=$OPTARG ;;
    # percentage of writes to be rmws (i.e., -w 500 -R 500 means 25 % of RMWs and 25% of writes)
    # RMW is disabled by default (no usage through the artifact); can be enabled through config.h
    R) RMW_RATIO=$OPTARG ;;
    # maximum number of readily-available messages to be "batched" in a network packet;
    # must be smaller than MTU and it is capped by MAX_REQ_COALESCE in config.h
    C) MAX_COALESCE=$OPTARG ;;
    # maximum number of credits per node per thread; credits correspond to messages and
    # not packets; it is capped by MAX_CREDITS_PER_REMOTE_WORKER in config.h
    c) CREDITS=$OPTARG ;;
    # amount of requests and protocol messages that can be batched to the KVS;
    # it is capped by MAX_BATCH_KVS_OPS_SIZE in config.h
    b) MAX_BATCH_SIZE=$OPTARG ;;
    # it is capped by MAX_MACHINE_NUM in config.h and the number of IPS as indicated in hosts.sh
    M) NUM_MACHINES=$OPTARG ;;
    # id of the worker who is measuring the latency; if -1 latency is disabled,
    # otherwise it is capped by running worker threads (NUM_WORKERS-1)
    l) LAT_WORKER=$OPTARG ;;
    h)
      printf '%s\n' \
        "Usage: -W <# workers> -w <write ratio>  (x1000 --> 10 for 1%)" \
        "       -c <# credits> -b <max batch size> -C <max coalescing>" \
        "       -M <# nodes>   -l <latency worker> -R <rmw ratio>"
      exit 1
      ;;
    \?)
      echo "Invalid option: -$OPTARG use -h to get info for arguments" >&2
      exit 1
      ;;
    :)
      echo "Option -$OPTARG requires an argument." >&2
      exit 1
      ;;
  esac
done


blue "Running hermes threads"

# Command-line of the hermesKV binary
HERMES_ARGS=(
    --machine-id ${NODE_ID}
    --is-roce 0
    --dev-name ${NET_DEVICE_NAME}
    --num-machines ${NUM_MACHINES}
    --num-workers  ${NUM_WORKERS}
    --lat-worker   ${LAT_WORKER}
    --rmw-ratio    ${RMW_RATIO}
    --write-ratio  ${WRITE_RATIO}
    --credits      ${CREDITS}
    --max-coalesce ${MAX_COALESCE}
    --max-batch-size ${MAX_BATCH_SIZE}
    --hermes
)

sudo LD_LIBRARY_PATH=/usr/local/lib/ -E ./hermesKV "${HERMES_ARGS[@]}" 2>&1


================================================
FILE: exec/run-rCRAQ.sh
================================================
#!/usr/bin/env bash

source run.sh


#### Get CLI arguments
# Use -1 for the default (#define in config.h) values if no argument is passed
CREDITS="-1"
NUM_WORKERS="-1"
WRITE_RATIO="-1"
MAX_COALESCE="-1"
MAX_BATCH_SIZE="-1"
RMW_RATIO="-1"     # no CLI option sets it here: always passed on as -1
NUM_MACHINES="-1"
LAT_WORKER="-1"

# Each letter is an option; a letter followed by a colon requires an
# argument. The leading colon selects silent error reporting, so that the
# '\?' (unknown option) and ':' (missing argument) cases below are reached.
while getopts ":W:w:C:c:b:M:l:h" opt; do
  case $opt in
     W)
       NUM_WORKERS=$OPTARG
       ;;
     w)
       WRITE_RATIO=$OPTARG
       ;;
     C)
       MAX_COALESCE=$OPTARG
       ;;
     c)
       CREDITS=$OPTARG
       ;;
     b)
       MAX_BATCH_SIZE=$OPTARG
       ;;
     M)
       NUM_MACHINES=$OPTARG
       ;;
     l)
       LAT_WORKER=$OPTARG
       ;;
     h)
      echo "Usage: -W <# workers> -w <write ratio>  (x1000 --> 10 for 1%)"
      echo "       -c <# credits> -b <max batch size> -C <max coalescing>"
      echo "       -M <# nodes>   -l <latency worker> "
      exit 1
      ;;
    \?)
      echo "Invalid option: -$OPTARG use -h to get info for arguments" >&2
      exit 1
      ;;
    :)
      echo "Option -$OPTARG requires an argument." >&2
      exit 1
      ;;
  esac
done

# The banner names the binary that is started below (it used to say "hermes")
blue "Running rCRAQ threads"

sudo LD_LIBRARY_PATH=/usr/local/lib/ -E \
	./rCRAQ                             \
	--machine-id ${NODE_ID}             \
	--is-roce 0                         \
	--dev-name ${NET_DEVICE_NAME}       \
	--num-machines ${NUM_MACHINES}      \
	--num-workers  ${NUM_WORKERS}       \
	--lat-worker   ${LAT_WORKER}        \
	--rmw-ratio    ${RMW_RATIO}         \
	--write-ratio  ${WRITE_RATIO}       \
	--credits      ${CREDITS}           \
	--max-coalesce ${MAX_COALESCE}      \
	--max-batch-size ${MAX_BATCH_SIZE}  \
	2>&1


================================================
FILE: exec/run.sh
================================================
#!/usr/bin/env bash
# Common preamble sourced by the run-*.sh scripts: loads the cluster
# description, stops earlier runs, frees their SHM keys and (re)starts the
# memcached instance used as QP registry.

source ./hosts.sh

# The node with the first IP hosts the memcached server (used to initialize RDMA QPs)
export HRD_REGISTRY_IP="${ALL_IPS[0]}"
export MLX5_SINGLE_THREADED=1
export MLX5_SCATTER_TO_CQE=1

# Stop whatever is left over from a previous run
for i in memcached hades rCRAQ hermesKV; do
	sudo killall $i
done

# Echoes its first argument in blue
blue() {
	es=$(tput setaf 4)
	ee=$(tput sgr0)
	echo "${es}$1${ee}"
}


#### free the pages workers use
blue "Removing SHM keys used by HermesKV/rCRAQ"
for i in $(seq 0 28); do
	# two key ranges: 3185..3213 and 4185..4213
	for key in $((3185 + i)) $((4185 + i)); do
		sudo ipcrm -M $key 2>/dev/null
	done
done
: ${HRD_REGISTRY_IP:?"Need to set HRD_REGISTRY_IP non-empty"}


blue "Reset server QP registry"
memcached -l ${HRD_REGISTRY_IP} 1>/dev/null 2>/dev/null &
sleep 1


================================================
FILE: include/hades/hades.h
================================================
//
// Created by akatsarakis on 17/01/19.
//

#ifndef HADES_H
#define HADES_H

#include "../../include/wings/wings.h"
#include "../utils/bit_vector.h"
#include "../utils/time_rdtsc.h"
// Send heartbeats
// Recv heartbeats
// Change View
// Update local membership

// (Ostracism)
// arbitration --> a node provides an obolus

// all nodes are able to communicate w/ each other

// fd provides a view as a membership change
// only as long as it differs with the current view
// and agrees with a majority of other node views.

// The update granularity of the local view works as a lease on
// membership changes, which prevents merely sequentially
// consistent reads in the presence of network partitions.
//       I.e., a node in a minority partition is able to detect
//       that it cannot reach the majority of nodes and stops serving
//       local reads, maintaining linearizability (instead of
//       degrading to sequential consistency).

// Epochs

// Guarantees Nodes in the same EPOCH id have the same group view

// Arbitration switch (1 = enabled).
// NOTE(review): not referenced in this header; presumably it gates the
// arbitration / ostracism logic of hades.c -- confirm against its users.
#define ENABLE_ARBITRATION 1

// Hades debug Tests
// NOTE(review): none of these knobs is used in this header. Judging by the
// names they fake a link failure between NODE_A and NODE_B that starts and
// stops after the given number of seconds -- confirm in hades.c.
#define FAKE_LINK_FAILURE 0
#define FAKE_LINK_FAILURE_AFTER_SEC 15
#define STOP_FAKE_LINK_FAILURE_AFTER_SEC 20
#define FAKE_ONE_WAY_LINK_FAILURE 0
#define FAKE_LINK_FAILURE_NODE_A 2
#define FAKE_LINK_FAILURE_NODE_B 1
// The two endpoints of the faked link must be different nodes
static_assert(FAKE_LINK_FAILURE_NODE_A != FAKE_LINK_FAILURE_NODE_B, "");

// A membership view of one node. The struct is packed and, as enforced by the
// static_assert below, must not exceed 4 bytes.
typedef struct {
  uint8_t node_id : 8;   // hades_ctx_init stores the local node id here
  uint8_t epoch_id : 8;  // starts at 0 (see hades_ctx_init)
  uint8_t same_w_local_membership : 1;       // 1-bit flag
  uint8_t have_ostracised_for_dst_node : 7;  // 7-bit field
  bit_vector_t view;  // hades_ctx_init sets the bit of the local node
} __attribute__((packed)) hades_view_t;
static_assert(sizeof(hades_view_t) <= 4,
              "Currently send using a 4B header only field (RDMA immediate)");

// Per-instance Hades state; set up by hades_ctx_init, which also
// heap-allocates every pointer member.
typedef struct {
  hades_view_t last_local_view;          // initialised as a copy of intermediate_local_view
  hades_view_t intermediate_local_view;  // initially: epoch 0, only the local node's bit set

  bit_vector_t curr_g_membership;  // initially contains only the local node
  uint8_t nodes_in_membership;     // initially 1

  uint8_t max_num_nodes;              // length of the per-node arrays below
  uint8_t* recved_views_flag;         // max_num_nodes entries, zeroed at init
  hades_view_t* remote_recved_views;  // max_num_nodes entries

  // Polling
  uint16_t max_views_to_poll;  // capacity of poll_buff (must be > 0)
  hades_view_t* poll_buff;  // used for polling remote views

  // Timing
  uint32_t send_view_every_us;
  uint32_t update_local_view_every_ms;
  struct timespec* ts_last_send;  // issues views to remotes iff have not send a
                                  // view within the predefined timeout
                                  // (max_num_nodes entries)
  struct timespec
      ts_last_view_change;  // update views and possible changes membership iff
                            // pre-defined timeout is exceed

  // Ostracism
  uint8_t*
      have_ostracized_for;  // an array storing info whether or not in a view
                            // the sender ostracized someone for this node
                            // (max_num_nodes entries, zeroed at init)
} hades_ctx_t;

// Hades state bundled with the two wings UD channels that
// hades_wings_ctx_init attaches to it.
typedef struct {
  hades_ctx_t ctx;
  ud_channel_t* hviews_c;      // channel initialised by hades_wings_ctx_init
  ud_channel_t* hviews_crd_c;  // handed to wings_ud_channel_init together with
                               // hviews_c; presumably its credit channel --
                               // TODO confirm
} hades_wings_ctx_t;

// Implemented outside this header.
// NOTE(review): semantics below are inferred from the names only -- confirm
// against the definitions.
void* hades_full_thread(void* node_id);  // thread entry point (pthread-style signature)
uint16_t poll_for_remote_views(hades_wings_ctx_t* hw_ctx);
void update_view_and_issue_hbs(hades_wings_ctx_t* hw_ctx);

// Initialises a Hades context for the local node.
//
//   ctx                   context to fill (must not be NULL)
//   node_id               id of the local node; the only member of the
//                         initial view and membership
//   max_nodes             number of entries of every per-node array
//   max_views_to_poll     capacity of the polling buffer (> 0)
//   send_view_us          stored as send_view_every_us
//   update_local_view_ms  stored as update_local_view_every_ms; twice this
//                         interval must be longer than send_view_us
//
// All arrays are heap allocated and never freed here. Failed allocations and
// invalid arguments are caught through assert (the error handling already
// used by this function).
inline static void
hades_ctx_init(hades_ctx_t* ctx, uint8_t node_id, uint8_t max_nodes,
               uint16_t max_views_to_poll, uint32_t send_view_us,
               uint32_t update_local_view_ms)
{
  assert(ctx != NULL);
  assert(max_views_to_poll > 0);
  // Checked up-front, in 64 bits so that a large interval cannot wrap around
  assert(2000ull * update_local_view_ms > send_view_us);

  // Initial view / membership: epoch 0, containing only the local node
  ctx->intermediate_local_view.epoch_id = 0;
  ctx->intermediate_local_view.node_id = node_id;
  ctx->nodes_in_membership = 1;
  bv_init(&ctx->curr_g_membership);
  bv_bit_set(&ctx->curr_g_membership, node_id);
  bv_init(&ctx->intermediate_local_view.view);
  bv_bit_set(&ctx->intermediate_local_view.view, node_id);
  ctx->last_local_view = ctx->intermediate_local_view;

  // Per-node bookkeeping of the views received from remotes
  ctx->max_num_nodes = max_nodes;
  ctx->recved_views_flag = malloc(sizeof(uint8_t) * max_nodes);
  ctx->remote_recved_views = malloc(sizeof(hades_view_t) * max_nodes);
  assert(ctx->recved_views_flag != NULL);
  assert(ctx->remote_recved_views != NULL);
  for (int i = 0; i < max_nodes; ++i) {
    ctx->recved_views_flag[i] = 0;
    bv_init(&ctx->remote_recved_views[i].view);
  }

  ctx->max_views_to_poll = max_views_to_poll;
  ctx->poll_buff = malloc(sizeof(hades_view_t) * max_views_to_poll);
  assert(ctx->poll_buff != NULL);

  // Setup timers
  init_rdtsc(1, 0);  /// WARNING: this is not thread safe!!
  get_rdtsc_timespec(&ctx->ts_last_view_change);
  ctx->ts_last_send = malloc(sizeof(struct timespec) * max_nodes);
  assert(ctx->ts_last_send != NULL);
  for (int i = 0; i < max_nodes; ++i)
    get_rdtsc_timespec(&ctx->ts_last_send[i]);

  ctx->send_view_every_us = send_view_us;
  ctx->update_local_view_every_ms = update_local_view_ms;

  // Ostracism
  ctx->have_ostracized_for = malloc(sizeof(uint8_t) * max_nodes);
  assert(ctx->have_ostracized_for != NULL);
  for (int i = 0; i < max_nodes; ++i)
    ctx->have_ostracized_for[i] = 0;
}

// Initialises the Hades context of `wctx` (through hades_ctx_init), attaches
// the two given channels and initialises the views channel.
//
// WARNING: hades_wings_ctx_init performs only the first part of the channel
// initialization; wings_setup_channel_qps_and_recvs must be called by the
// application afterwards to finish the initialization of wings.
//
//   send_view_us  must be > 0 (it divides the credit computation)
//   worker_lid    only used to build the printable name of the channel
// The remaining parameters are forwarded to hades_ctx_init.
inline static void
hades_wings_ctx_init(hades_wings_ctx_t* wctx, uint8_t node_id,
                     uint8_t max_nodes, uint16_t max_views_to_poll,
                     uint32_t send_view_us, uint32_t update_local_view_ms,
                     ud_channel_t* hviews_c, ud_channel_t* hviews_crd_c,
                     uint16_t worker_lid)
{
  assert(send_view_us > 0);  // divisor of the credit computation below

  hades_ctx_init(&wctx->ctx, node_id, max_nodes, max_views_to_poll,
                 send_view_us, update_local_view_ms);

  wctx->hviews_c = hviews_c;
  wctx->hviews_crd_c = hviews_crd_c;

  const uint8_t is_bcast = 0;
  const uint8_t stats_on = 1;
  const uint8_t prints_on = 1;
  const uint8_t is_hdr_only = 1;
  const uint8_t expl_crd_ctrl = 1;
  const uint8_t enable_inlining = 1;
  const uint8_t disable_crd_ctrl = 0;

  // Credits: number of send intervals that fit in two view-update intervals.
  // Computed in 64 bits and saturated at UINT8_MAX; the plain cast used
  // before silently wrapped modulo 256 for larger ratios.
  const uint64_t wanted_credits =
      2ull * update_local_view_ms * 1000 / send_view_us;
  const uint8_t credits =
      (uint8_t)(wanted_credits > UINT8_MAX ? UINT8_MAX : wanted_credits);

  // "Hades<worker_lid>", wrapped in ANSI escapes (bold + green)
  char qp_name[200];
  snprintf(qp_name, sizeof(qp_name), "%s%d", "\033[1m\033[32mHades\033[0m",
           worker_lid);

  // NOTE(review): the channel is given the global `machine_id`, not the
  // `node_id` parameter -- confirm that the two are always equal.
  wings_ud_channel_init(
      wctx->hviews_c, qp_name, REQ, 1, sizeof(hades_view_t) - sizeof(uint8_t),
      0, enable_inlining, is_hdr_only, is_bcast, disable_crd_ctrl,
      expl_crd_ctrl, wctx->hviews_crd_c, credits, max_nodes,
      (uint8_t)machine_id, stats_on, prints_on);
}

// How does somebody joins?
// epoch id 0
// must see at least a majority of views with same epoch id > 0
// || majority of views with epoch id 0
#endif  // HADES_H


================================================
FILE: include/hermes/config.h
================================================
//
// Created by akatsarakis on 15/03/18.
//

#ifndef SPACETIME_CONFIG_H
#define SPACETIME_CONFIG_H
#include <assert.h>
#include <stdint.h>
#include "sizes.h"

// MAX_ defines are treated as DEFAULT_ as well (i.e., if not altered by CLI
// args)

/*-------------------------------------------------
------------ SETUP & DEFAULT SETTINGS -------------
--------------------------------------------------*/
#define MAX_MACHINE_NUM 5           // maximum nodes
#define MAX_WORKERS_PER_MACHINE 15  // maximum number of threads per node
#define DEFAULT_WORKERS_PER_MACHINE 2
#define DEFAULT_THREAD_OF_STAT_THREAD \
  (15)  // WARNING make sure this is not co-located with a worker thread

// Number of sockets (numa nodes), cores and h/w threads per core on each node
#define TOTAL_THREADS_PER_CORE 2
#define TOTAL_CORES_PER_SOCKET 10
#define TOTAL_NUMBER_OF_SOCKETS 2

/*-------------------------------------------------
-------------------------------------------------
-------------------------------------------------
-------- No need to change beyond this point ----
-------------------------------------------------
-------------------------------------------------
--------------------------------------------------*/

// Default workload writes / updates accesses (the rest are reads)
#define DEFAULT_UPDATE_RATIO 1000  // is divided by 10 (i.e., 25 --> 2.5 %)
// both writes and RMWs (RMW_RATIO indirectly provides WRITE_RATIO)

#define ENABLE_RMWs \
  0  // if RMWs is not enabled then all UPDATE_RATIO == WRITE_RATIO
#define DEFAULT_RMW_RATIO 0  // is divided by 10 (i.e., 25 --> 2.5 %)
// percentage of UPDATE_RATIO to be RMWs

// Max operations per thread to batch to the KVS (either received packets or
// read/write/RMW requests)
#define MAX_BATCH_KVS_OPS_SIZE 250
static_assert(MAX_WORKERS_PER_MACHINE <= 254, "");
static_assert(MAX_WORKERS_PER_MACHINE <= TOTAL_NUMBER_OF_SOCKETS *
                                             TOTAL_THREADS_PER_CORE *
                                             TOTAL_CORES_PER_SOCKET,
              "");
static_assert(DEFAULT_UPDATE_RATIO <= 1000 && DEFAULT_RMW_RATIO >= 0, "");

/*-------------------------------------------------
----------------- RDMA SETTINGS -------------------
--------------------------------------------------*/
// Request coalescing (max --readily available-- messages to batch in a single
// RDMA packet)
#define MAX_REQ_COALESCE 15

// Flow control
#define MAX_CREDITS_PER_REMOTE_WORKER (MAX_REQ_COALESCE)

// Request inlining
#define DISABLE_INLINING 0

/*-------------------------------------------------
----------------- SECONDARY SETTINGS --------------
--------------------------------------------------*/
// LATENCY
#define DEFAULT_MEASURE_LATENCY 0
#define DEFAULT_WORKER_MEASURING_LATENCY 0
#define MAX_LATENCY 1000  // in us
#define LATENCY_BUCKETS 1000
#define LATENCY_PRECISION \
  (MAX_LATENCY / LATENCY_BUCKETS)  // latency granularity in us

// FAIRNESS
#define ENABLE_VIRTUAL_NODE_IDS 0  // 0
#define VIRTUAL_NODE_IDS_PER_NODE 20

// SKEW
#define ENABLE_COALESCE_OF_HOT_REQS \
  0  // 0 //WARNING!!! this must be disabled for cr
#define COALESCE_N_HOTTEST_KEYS 100
#define ENABLE_READ_COMPLETE_AFTER_VAL_RECV_OF_HOT_REQS 0  // 1
#define ENABLE_WRITE_COALESCE_TO_THE_SAME_KEY_IN_SAME_NODE 0

// DEBUG
#define ENABLE_ASSERTIONS 0
#define DISABLE_VALS_FOR_DEBUGGING 0
#define KEY_NUM 0  // use 0 to disable

// REQUESTS
#define FEED_FROM_TRACE 0
#define ZIPF_EXPONENT_OF_TRACE \
  99  // if FEED_FROM_TRACE == 1 | this is divided by 100 (e.g. use 99 for  a =
      // 0.99)
#define NUM_OF_REP_REQS K_256  // if FEED_FROM_TRACE == 0
#define USE_A_SINGLE_KEY 0     // if FEED_FROM_TRACE == 0
#define ST_KEY_ID_255_OR_HIGHER 255

/*-------------------------------------------------
---------------- Debug and others -----------------
--------------------------------------------------*/
// DBG Prints
/// Warning: some prints assume that there are no faults (they multiply by
/// REMOTE_MACHINES)
// Only workers whose local id is below MAX_THREADS_TO_PRINT emit the request
// prints in refill_ops().
#define MAX_THREADS_TO_PRINT 1
#define ENABLE_REQ_PRINTS 0
#define ENABLE_BATCH_OP_PRINTS 0
#define ENABLE_INV_PRINTS 0
#define ENABLE_ACK_PRINTS 0
#define ENABLE_VAL_PRINTS 0

// Stats prints
#define PRINT_STATS_EVERY_MSECS 4000  // 5000 //10000 //10
#define PRINT_WORKER_STATS 0

// Stats
#define EXIT_ON_STATS_PRINT 1
#define PRINT_NUM_STATS_BEFORE_EXITING 5
#define DUMP_XPUT_STATS_TO_FILE 1

// FAILURE DETECTION (RM)
#define ENABLE_HADES_FAILURE_DETECTION 0
#define WORKER_WITH_FAILURE_DETECTOR 0
// Compile-time guard: the build fails if HADES failure detection is enabled.
static_assert(ENABLE_HADES_FAILURE_DETECTION == 0,
              "WARNING HADES is currently not working");

// FAKE NODE FAILURE
#define FAKE_FAILURE 0
#define NODE_TO_FAIL 2
#define ROUNDS_BEFORE_FAILURE 2

// Rarely (or never) change
#define BASE_SHM_KEY 24
#define WORKER_SL 0  // service level for the workers
#define MAX_REMOTE_MACHINES (MAX_MACHINE_NUM - 1)
// Integer division of x by y, rounded up.
#define HERMES_CEILING(x, y) (((x) + (y)-1) / (y))
// Number of bytes needed to hold one bit per machine.
#define GROUP_MEMBERSHIP_ARRAY_SIZE \
  HERMES_CEILING(MAX_MACHINE_NUM, 8)  // assuming uint8_t
#define TOTAL_HW_CORES \
  (TOTAL_THREADS_PER_CORE * TOTAL_CORES_PER_SOCKET * TOTAL_NUMBER_OF_SOCKETS)
// NOTE(review): the condition leaves at least TWO hardware threads unused
// (workers <= TOTAL_HW_CORES - 2) while the message speaks of one -- confirm
// which of the two is intended.
static_assert(MAX_WORKERS_PER_MACHINE < TOTAL_HW_CORES - 1,
              "Leave at least a hw thread free for OS etc..");

#define KV_SOCKET 0  // socket to allocate KVS (huge-)pages
#define USE_ALL_SOCKETS 1
#define ENABLE_HYPERTHREADING 1
#define SOCKET_TO_START_SPAWNING_THREADS 0

// Debug
// SPACETIME_DEBUG defaults to 0 unless it was already defined, either by
// uncommenting the line below or by a definition that precedes this point.
//#define SPACETIME_DEBUG 2
#ifndef SPACETIME_DEBUG
#define SPACETIME_DEBUG 0
#endif

////////////////////////////////
/// Hermes NOT TUNABLE
////////////////////////////////
/*-------------------------------------------------
----------------- MAX HERMES OPS SIZE -------------
--------------------------------------------------*/
// Upper bound on the ops a worker may receive in one batch: the credits granted
// per remote worker, times the number of remote machines, times the maximum
// number of requests coalesced into a single message.
#define MAX_MSG_RECV_OPS_SIZE \
  (MAX_CREDITS_PER_REMOTE_WORKER * MAX_REMOTE_MACHINES * MAX_REQ_COALESCE)
// Largest batch Hermes handles: the bigger of the receive bound above and the
// local KVS batch size.
#define HERMES_MAX_BATCH_SIZE MAX(MAX_MSG_RECV_OPS_SIZE, MAX_BATCH_KVS_OPS_SIZE)

/*-------------------------------------------------
---------------- QPs Numbers ----------------------
--------------------------------------------------*/
// Index of each UD queue pair a Hermes worker owns, one per message type.
typedef enum {
  INV_UD_QP_ID = 0,     // invalidations
  ACK_UD_QP_ID = 1,     // acknowledgements
  VAL_UD_QP_ID = 2,     // validations
  CRD_UD_QP_ID = 3,     // credits
  END_HERMES_QPS_ENUM   // sentinel: must stay last, equals the number of QPs
} hermes_qps_enum;

// Number of Hermes QPs per worker (the sentinel above).
#define TOTAL_WORKER_UD_QPs END_HERMES_QPS_ENUM
// Same count plus two extra QPs when HADES failure detection is enabled.
#define TOTAL_WORKER_N_FAILURE_DETECTION_UD_QPs \
  (TOTAL_WORKER_UD_QPs + (ENABLE_HADES_FAILURE_DETECTION ? 2 : 0))

/*-------------------------------------------------
----------------- CR CONFIGURATION ----------------
--------------------------------------------------*/
// Settings for the CR (chain replication) mode of the system.
// When non-zero, CR_TOTAL_WORKER_UD_QPs accounts for two additional QPs.
#define CR_ENABLE_REMOTE_READS 0
#define CR_REMOTE_READS_CREDITS 20

// CR counterpart of MAX_CREDITS_PER_REMOTE_WORKER; it sizes
// MAX_MSG_RECV_OPS_SIZE_CR.
#define MAX_CREDITS_PER_REMOTE_WORKER_CR 250  //(MAX_BATCH_KVS_OPS_SIZE) // CR

#define CR_ACK_CREDITS (255)  // //(MAX_MACHINE_NUM * 255)

#define CR_ENABLE_EARLY_INV_CRDS \
  1  // optimization to increase request pipelining

// Index of each UD queue pair used by a CR worker.
//
// The early-INV-credit QP is compiled in only when CR_ENABLE_EARLY_INV_CRDS is
// non-zero. The test must be `#if` (value) rather than `#ifdef` (existence):
// the macro is always defined, so `#ifdef` would keep the enumerator even when
// the feature is set to 0 and the ids would then disagree with
// CR_TOTAL_WORKER_UD_QPs, which tests the macro's value.
typedef enum {
  CR_INV_UD_QP_ID = 0,
#if CR_ENABLE_EARLY_INV_CRDS
  CR_INV_CRD_UD_QP_ID,
#endif
  CR_ACK_UD_QP_ID,
  CR_REMOTE_WRITES_UD_QP_ID,
  CR_REMOTE_WRITE_CRD_UD_QP_ID,
  CR_REMOTE_READS_UD_QP_ID,
  CR_REMOTE_READS_RESP_UD_QP_ID
} cr_qps_enum;

// Number of QPs a CR worker uses: the same base count as Hermes, plus two when
// remote reads are enabled and one when early INV credits are enabled.
#define CR_TOTAL_WORKER_UD_QPs                              \
  (TOTAL_WORKER_UD_QPs + (CR_ENABLE_REMOTE_READS ? 2 : 0) + \
   (CR_ENABLE_EARLY_INV_CRDS ? 1 : 0))

// Max CR batch op size
// Same formula as MAX_MSG_RECV_OPS_SIZE, using the CR credit count.
#define MAX_MSG_RECV_OPS_SIZE_CR \
  (MAX_REQ_COALESCE * MAX_CREDITS_PER_REMOTE_WORKER_CR * MAX_REMOTE_MACHINES)
#define CR_MAX_BATCH_SIZE MAX(MAX_MSG_RECV_OPS_SIZE_CR, MAX_BATCH_KVS_OPS_SIZE)

// CR DEBUG
#define CR_ENABLE_ONLY_HEAD_REQS 0
// When non-zero, refill_ops() issues every request as ST_OP_GET on all machines
// except machine 0.
#define CR_ENABLE_ALL_NODES_GETS_EXCEPT_HEAD 0
#define CR_ENABLE_BLOCKING_INVALID_WRITES_ON_HEAD 0

/*-------------------------------------------------
----------------- Global Vars ---------------------
--------------------------------------------------*/

// Parameters of a single thread.
// NOTE(review): presumably the argument handed to run_worker() -- confirm
// against the thread-spawning code.
struct thread_params {
  int id;  // thread identifier
};

// Latency histograms filled in by bookkeep_latency().
// Each array has LATENCY_BUCKETS regular buckets, each LATENCY_PRECISION us
// wide, plus one final slot (index LATENCY_BUCKETS) for samples above
// MAX_LATENCY.
struct latency_counters {
  uint32_t read_reqs[LATENCY_BUCKETS + 1];   // ST_OP_GET samples
  uint32_t write_reqs[LATENCY_BUCKETS + 1];  // ST_OP_PUT samples
  int max_read_latency;                      // largest GET sample seen (us)
  int max_write_latency;                     // largest PUT sample seen (us)
  long long total_measurements;              // samples recorded, reads + writes
};

extern struct latency_counters latency_count;

// global config (CLI) configurable vars
extern uint8_t is_CR;
extern int update_ratio;
extern int rmw_ratio;
extern int num_workers;
extern int credits_num;
extern int max_coalesce;
extern int max_batch_size;  // for batches to KVS

extern int machine_num;         // must be smaller or equal to MAX_MACHINE_NUM
extern int remote_machine_num;  // must be smaller or equal to MAX_MACHINE_NUM
extern int worker_measuring_latency;

// extern int value_size; // must be smaller or equal to MAX_MACHINE_NUM

#endif  // SPACETIME_CONFIG_H


================================================
FILE: include/hermes/inline-util.h
================================================
//
// Created by akatsarakis on 23/05/18.
//

#ifndef HERMES_INLINE_UTIL_H
#define HERMES_INLINE_UTIL_H

#include <infiniband/verbs.h>
#include "../hades/hades.h"
#include "../utils/concur_ctrl.h"
#include "config.h"
#include "spacetime.h"
#include "util.h"

/* ---------------------------------------------------------------------------
----------------------------------- MEMBERSHIP -------------------------------
---------------------------------------------------------------------------*/

// Returns 1 when the bit of `node_id` is set in the g_membership vector of the
// given membership snapshot, and 0 otherwise.
static inline uint8_t
node_is_in_membership(spacetime_group_membership last_group_membership,
                      int node_id)
{
  if (bv_bit_get(last_group_membership.g_membership, (uint8_t)node_id) == 1)
    return 1;
  return 0;
}

// Publishes the membership held by `hades_ctx` into the global
// `group_membership`, under its seqlock:
//   - g_membership becomes a copy of hades_ctx.curr_g_membership;
//   - w_ack_init is rebuilt from it via bv_reverse() and then gets the bit of
//     the local machine set;
//   - num_of_alive_remotes is set to the number of set bits in g_membership.
// Terminates the process when that count drops below machine_num / 2.
static inline void
group_membership_update(hades_ctx_t hades_ctx)
{
  // The global fields are volatile; the bit-vector helpers take plain pointers.
  bit_vector_t* const membership_bv =
      (bit_vector_t*)&group_membership.g_membership;
  bit_vector_t* const ack_init_bv = (bit_vector_t*)&group_membership.w_ack_init;

  seqlock_lock(&group_membership.lock);

  bv_copy(membership_bv, hades_ctx.curr_g_membership);

  bv_copy(ack_init_bv, group_membership.g_membership);
  bv_reverse(ack_init_bv);
  bv_bit_set(ack_init_bv, (uint8_t)machine_id);

  group_membership.num_of_alive_remotes =
      bv_no_setted_bits(group_membership.g_membership);

  seqlock_unlock(&group_membership.lock);

  if (group_membership.num_of_alive_remotes >= (machine_num / 2)) return;

  colored_printf(RED, "Majority is down!\n");
  exit(-1);
}

// Takes a consistent, lock-free snapshot of the global `group_membership` and
// compares its g_membership vector with the caller's cached copy.
//
//   last_group_membership  in/out: the caller's cached membership; overwritten
//                          with the fresh snapshot when the two differ.
//   worker_lid             local worker id, used only in the debug print.
//
// Returns 1 when the membership changed (and the cache was refreshed), 0
// otherwise.
static inline uint8_t
group_membership_has_changed(spacetime_group_membership* last_group_membership,
                             uint16_t worker_lid)
{
  uint32_t debug_lock_free_membership_read_cntr = 0;
  spacetime_group_membership lock_free_read_group_membership;

  do {  // Lock free read of group membership
    if (ENABLE_ASSERTIONS) {
      debug_lock_free_membership_read_cntr++;
      if (debug_lock_free_membership_read_cntr == M_4) {
        printf("Worker %u stuck on a lock-free read (for group membership)\n",
               worker_lid);
        debug_lock_free_membership_read_cntr = 0;
      }
    }
    lock_free_read_group_membership =
        *((spacetime_group_membership*)&group_membership);
    // Retry until the copy was taken while no writer held the seqlock.
  } while (!(seqlock_version_is_same_and_valid(
      &group_membership.lock, &lock_free_read_group_membership.lock)));

  // A single comparison covers the whole bit vector; the previous per-byte
  // loop never used its index and only repeated this same check.
  if (bv_are_equal(lock_free_read_group_membership.g_membership,
                   last_group_membership->g_membership))
    return 0;

  *last_group_membership = lock_free_read_group_membership;
  return 1;
}

/* ---------------------------------------------------------------------------
----------------------------------- LATENCY -------------------------------
---------------------------------------------------------------------------*/
// Adds one latency sample (in microseconds) to the histogram of its request
// type and updates the matching running maximum.
//
//   useconds  measured latency in us; negative samples are ignored.
//   op        ST_OP_PUT or ST_OP_GET; any other opcode trips the assertion
//             and, when assertions are compiled out, is ignored.
//
// Samples above MAX_LATENCY, and any sample whose bucket index would fall
// past the last regular bucket, are counted in the overflow slot at index
// LATENCY_BUCKETS.
static inline void
bookkeep_latency(int useconds, uint8_t op)
{
  uint32_t* latency_array;
  int* max_latency_ptr;
  switch (op) {
    case ST_OP_PUT:
      latency_array = latency_count.write_reqs;
      max_latency_ptr = &latency_count.max_write_latency;
      break;
    case ST_OP_GET:
      latency_array = latency_count.read_reqs;
      max_latency_ptr = &latency_count.max_read_latency;
      break;
    default:
      assert(0);
      // With NDEBUG the assert vanishes; never use the unset pointers below.
      return;
  }

  // A negative sample would index before the start of the histogram.
  if (useconds < 0) return;

  latency_count.total_measurements++;

  int bucket = LATENCY_BUCKETS;  // overflow slot
  if (useconds <= MAX_LATENCY) {
    bucket = useconds / LATENCY_PRECISION;
    // LATENCY_PRECISION is a truncated quotient, so the index can exceed the
    // array when LATENCY_BUCKETS does not divide MAX_LATENCY evenly.
    if (bucket > LATENCY_BUCKETS) bucket = LATENCY_BUCKETS;
  }
  latency_array[bucket]++;

  if (*max_latency_ptr < useconds) *max_latency_ptr = useconds;
}

// Starts a latency measurement by storing the current CLOCK_MONOTONIC time in
// `*start`. Pass the same timespec to stop_latency_measurment() to record the
// elapsed time.
static inline void
start_latency_measurement(struct timespec* start)
{
  clock_gettime(CLOCK_MONOTONIC, start);
}

// Ends a latency measurement: computes the microseconds elapsed on
// CLOCK_MONOTONIC since `*start` and records them for `req_opcode` through
// bookkeep_latency().
static inline void
stop_latency_measurment(uint8_t req_opcode, struct timespec* start)
{
  struct timespec now;
  clock_gettime(CLOCK_MONOTONIC, &now);

  // Seconds and nanoseconds are converted to microseconds separately.
  time_t secs_as_us = (now.tv_sec - start->tv_sec) * 1000000;
  long nsecs_as_us = (now.tv_nsec - start->tv_nsec) / 1000;
  int useconds = (int)(secs_as_us + nsecs_as_us);

  if (ENABLE_ASSERTIONS) assert(useconds >= 0);
  // Debug aid:
  //   printf("Latency of %s %u us\n", code_to_str(req_opcode), useconds);
  bookkeep_latency(useconds, req_opcode);
}

// Stops the running latency measurement when the first op of the batch is a
// PUT that has finished (state ST_MISS or ST_PUT_COMPLETE). Only the measuring
// worker on machine 0 does anything.
static inline void
stop_latency_of_completed_writes(spacetime_op_t* ops, uint16_t worker_lid,
                                 struct timespec* stopwatch)
{
  if (machine_id != 0 || worker_lid != worker_measuring_latency) return;

  if (ops[0].op_meta.opcode != ST_OP_PUT) return;

  if (ops[0].op_meta.state == ST_MISS ||
      ops[0].op_meta.state == ST_PUT_COMPLETE)
    stop_latency_measurment(ops[0].op_meta.opcode, stopwatch);
}

// Stops the running latency measurement when the first op of the batch is a
// GET that has finished (state ST_MISS or ST_GET_COMPLETE). Only the measuring
// worker on machine 0 does anything.
static inline void
stop_latency_of_completed_reads(spacetime_op_t* ops, uint16_t worker_lid,
                                struct timespec* stopwatch)
{
  if (machine_id != 0 || worker_lid != worker_measuring_latency) return;

  if (ops[0].op_meta.opcode != ST_OP_GET) return;

  if (ops[0].op_meta.state == ST_MISS ||
      ops[0].op_meta.state == ST_GET_COMPLETE)
    stop_latency_measurment(ops[0].op_meta.opcode, stopwatch);
}

/* ---------------------------------------------------------------------------
---------------------------------- Refill Requests ---------------------------
---------------------------------------------------------------------------*/
// Recycles the finished slots of a worker's request buffer and loads the next
// requests of the trace into them.
//
// Every slot in ops[0 .. max_batch_size) is refilled on the first call for a
// given worker; afterwards a slot is refilled only when its state is one of
// ST_MISS, ST_PUT_COMPLETE, ST_RMW_ABORT, ST_RMW_COMPLETE,
// ST_OP_MEMBERSHIP_COMPLETE or ST_GET_COMPLETE. Before a finished slot is
// reused its outcome is added to w_stats[worker_lid].
//
//   trace_iter   in/out cursor into `trace`; wraps to 0 when the entry after
//                the current one has opcode NOP.
//   worker_lid   local worker id; indexes w_stats and the first-call flags.
//   trace        trace commands to issue; a NOP opcode marks the end.
//   ops          request buffer of max_batch_size slots.
//   refilled_per_ops_debug_cnt
//                per-slot counter: reset when a slot is recycled, incremented
//                on each call that leaves the slot untouched.
//   start        stopwatch started when slot 0 is refilled by the measuring
//                worker of machine 0.
//   n_hottest_keys_in_ops_get / n_hottest_keys_in_ops_put
//                per-hot-key pointers to the slot that last received that key;
//                used only when ENABLE_COALESCE_OF_HOT_REQS is set.
//
// Returns `node_suspected`, which this function never changes from -1.
static inline int
refill_ops(uint32_t* trace_iter, uint16_t worker_lid,
           struct spacetime_trace_command* trace, spacetime_op_t* ops,
           uint32_t* refilled_per_ops_debug_cnt, struct timespec* start,
           spacetime_op_t** n_hottest_keys_in_ops_get,
           spacetime_op_t** n_hottest_keys_in_ops_put)
{
  // Per-worker flag telling whether this worker already went through a first
  // call. Being a static local of a `static inline` header function, every
  // translation unit that includes this header gets its own copy.
  static uint8_t first_iter_has_passed[MAX_WORKERS_PER_MACHINE] = {0};

  int refilled_ops = 0, node_suspected = -1;
  for (int i = 0; i < max_batch_size; i++) {
    // Sanity check: after the first call every slot must hold a known opcode
    // and a known state.
    if (ENABLE_ASSERTIONS && first_iter_has_passed[worker_lid] == 1) {
      assert(ops[i].op_meta.opcode == ST_OP_PUT ||
             ops[i].op_meta.opcode == ST_OP_GET ||
             (is_CR == 0 && ops[i].op_meta.opcode == ST_OP_RMW));
      assert(ops[i].op_meta.state == ST_PUT_COMPLETE ||
             ops[i].op_meta.state == ST_GET_COMPLETE ||
             ops[i].op_meta.state == ST_PUT_SUCCESS ||
             ops[i].op_meta.state == ST_REPLAY_SUCCESS ||
             ops[i].op_meta.state == ST_NEW ||
             ops[i].op_meta.state == ST_MISS ||
             ops[i].op_meta.state == ST_PUT_STALL ||
             ops[i].op_meta.state == ST_REPLAY_COMPLETE ||
             ops[i].op_meta.state == ST_IN_PROGRESS_PUT ||
             //<RMW>
             ops[i].op_meta.state == ST_RMW_STALL ||
             ops[i].op_meta.state == ST_RMW_ABORT ||
             ops[i].op_meta.state == ST_RMW_SUCCESS ||
             ops[i].op_meta.state == ST_RMW_COMPLETE ||
             ops[i].op_meta.state == ST_IN_PROGRESS_RMW ||
             //					   ops[i].op_meta.state ==
             // ST_IN_PROGRESS_PUT
             //|| <RMW>
             ops[i].op_meta.state == ST_IN_PROGRESS_GET ||
             ops[i].op_meta.state == ST_IN_PROGRESS_REPLAY ||
             ops[i].op_meta.state ==
                 ST_OP_MEMBERSHIP_CHANGE ||  /// TODO check this
             ops[i].op_meta.state ==
                 ST_OP_MEMBERSHIP_COMPLETE ||  /// TODO check this
             ops[i].op_meta.state == ST_PUT_COMPLETE_SEND_VALS ||
             ops[i].op_meta.state == ST_GET_STALL);
    }

    // Refill on the very first call, or when the slot reached a final state.
    if (first_iter_has_passed[worker_lid] == 0 ||
        ops[i].op_meta.state == ST_MISS ||
        ops[i].op_meta.state == ST_PUT_COMPLETE ||
        ops[i].op_meta.state == ST_RMW_ABORT ||
        ops[i].op_meta.state == ST_RMW_COMPLETE ||
        ops[i].op_meta.state == ST_OP_MEMBERSHIP_COMPLETE ||
        ops[i].op_meta.state == ST_GET_COMPLETE) {
      // The slot held a finished request: account for it, then reset it.
      if (first_iter_has_passed[worker_lid] != 0) {
        if (ENABLE_REQ_PRINTS && worker_lid < MAX_THREADS_TO_PRINT)
          colored_printf(
              GREEN,
              "W%d--> Key Hash:%" PRIu64
              "\n\t\tType: %s, version %d, tie-b: %d, value(len-%d): %c\n",
              worker_lid, ((uint64_t*)&ops[i].op_meta.key)[0],
              code_to_str(ops[i].op_meta.state), ops[i].op_meta.ts.version,
              ops[i].op_meta.ts.tie_breaker_id, ops[i].op_meta.val_len,
              ops[i].value[0]);

        /// Stats
        // Misses and aborted RMWs do not count as completed ops; a coalesced
        // op counts once per request folded into it.
        if (ops[i].op_meta.state != ST_MISS) {
          if (ops[i].op_meta.state != ST_RMW_ABORT)
            w_stats[worker_lid].completed_ops_per_worker +=
                ENABLE_COALESCE_OF_HOT_REQS ? ops[i].no_coales : 1;
        } else
          w_stats[worker_lid].reqs_missed_in_kvs++;

        if (ops[i].op_meta.state == ST_PUT_COMPLETE)
          w_stats[worker_lid].completed_wrs_per_worker++;
        else if (ops[i].op_meta.state == ST_RMW_COMPLETE)
          w_stats[worker_lid].completed_rmws_per_worker++;
        else if (ops[i].op_meta.state == ST_RMW_ABORT)
          w_stats[worker_lid].aborted_rmws_per_worker++;

        // reset op bucket
        ops[i].no_coales = 1;
        ops[i].op_meta.state = ST_EMPTY;
        ops[i].op_meta.opcode = ST_EMPTY;
        refilled_per_ops_debug_cnt[i] = 0;
        refilled_ops++;
      }

      if (ENABLE_ASSERTIONS)
        assert(trace[*trace_iter].opcode == ST_OP_PUT ||
               trace[*trace_iter].opcode == ST_OP_RMW ||
               trace[*trace_iter].opcode == ST_OP_GET);

      // Latency is sampled only on slot 0 of the measuring worker of machine 0.
      if (machine_id == 0 && worker_lid == worker_measuring_latency && i == 0)
        start_latency_measurement(start);

      /// INSERT new req(s) to ops
      uint8_t key_id;
      if (ENABLE_COALESCE_OF_HOT_REQS &&
          trace[*trace_iter].opcode != ST_OP_RMW) {
        // see if you could coalesce any requests
        spacetime_op_t** n_hottest_keys_in_ops;
        // Consume trace entries for as long as each one can be folded into an
        // op that is already queued for the same hot key and opcode.
        do {
          key_id = trace[*trace_iter].key_id;
          n_hottest_keys_in_ops = trace[*trace_iter].opcode == ST_OP_GET
                                      ? n_hottest_keys_in_ops_get
                                      : n_hottest_keys_in_ops_put;
          // if we can coalesce (a hot) req
          if (key_id < COALESCE_N_HOTTEST_KEYS &&  // is a hot key
              n_hottest_keys_in_ops[key_id] !=
                  NULL &&  // exists in the ops array
              n_hottest_keys_in_ops[key_id]->op_meta.opcode ==
                  trace[*trace_iter]
                      .opcode)  // has the same code with the last inserted
          {
            n_hottest_keys_in_ops[key_id]->no_coales++;
            *trace_iter =
                trace[*trace_iter + 1].opcode != NOP ? *trace_iter + 1 : 0;
          } else
            break;
        } while (1);

        // Remember that this slot now carries the hot key.
        if (key_id < COALESCE_N_HOTTEST_KEYS)
          n_hottest_keys_in_ops[key_id] = &ops[i];
      }

      ops[i].op_meta.state = ST_NEW;
      ops[i].op_meta.opcode =
          (uint8_t)(CR_ENABLE_ALL_NODES_GETS_EXCEPT_HEAD && machine_id != 0
                        ? ST_OP_GET
                        : trace[*trace_iter].opcode);
      memcpy(&ops[i].op_meta.key, &trace[*trace_iter].key_hash,
             sizeof(spacetime_key_t));

      // Writes and RMWs carry a payload made of one repeated letter derived
      // from the id of the issuing machine.
      if (ops[i].op_meta.opcode == ST_OP_PUT ||
          ops[i].op_meta.opcode == ST_OP_RMW)
        memset(ops[i].value, ((uint8_t)'a' + machine_id), ST_VALUE_SIZE);

      else if (ENABLE_READ_COMPLETE_AFTER_VAL_RECV_OF_HOT_REQS) {
        // if its a read reset the timestamp
        ops[i].op_meta.ts.version = 0;
        ops[i].op_meta.ts.tie_breaker_id = 0;
      }

      ops[i].RMW_flag = ops[i].op_meta.opcode == ST_OP_RMW ? 1 : 0;

      ops[i].op_meta.val_len = (uint8)(ops[i].op_meta.opcode == ST_OP_GET
                                           ? 0
                                           : ST_VALUE_SIZE >> SHIFT_BITS);

      // instead of MOD add
      // (advance the cursor; wrap to the start when the next entry is NOP)
      *trace_iter = trace[*trace_iter + 1].opcode != NOP ? *trace_iter + 1 : 0;

      if (ENABLE_REQ_PRINTS && worker_lid < MAX_THREADS_TO_PRINT)
        colored_printf(RED, "W%d--> Op: %s, hash(1st 8B):%" PRIu64 "\n",
                       worker_lid, code_to_str(ops[i].op_meta.opcode),
                       ((uint64_t*)&ops[i].op_meta.key)[0]);

    } else
      refilled_per_ops_debug_cnt[i]++;
  }

  // refilled_ops counts recycled slots only, so the first call for a worker
  // is also recorded as a wasted loop.
  if (refilled_ops == 0) w_stats[worker_lid].wasted_loops++;

  if (first_iter_has_passed[worker_lid] == 0)
    first_iter_has_passed[worker_lid] = 1;

  if (ENABLE_ASSERTIONS)
    for (int i = 0; i < max_batch_size; i++)
      assert(ops[i].op_meta.opcode == ST_OP_PUT ||
             ops[i].op_meta.opcode == ST_OP_GET ||
             (ops[i].op_meta.opcode == ST_OP_RMW && is_CR == 0));

  return node_suspected;
}
#endif  // HERMES_INLINE_UTIL_H


================================================
FILE: include/hermes/spacetime.h
================================================
//
// Created by akatsarakis on 04/05/18.
//

#ifndef HERMES_SPACETIME_H
#define HERMES_SPACETIME_H

// Optik Options
#ifndef CORE_NUM
#define DEFAULT
#define CORE_NUM 8
#endif

#include "../utils/bit_vector.h"
#include "../utils/concur_ctrl.h"
#include "config.h"
#include "hrd.h"
#include "mica.h"

// Sizing of the key-value store: number of keys, hash buckets and log capacity.
#define SPACETIME_NUM_KEYS (1000 * 1000)
#define SPACETIME_NUM_BKTS (2 * 1024 * 1024)
#define SPACETIME_LOG_CAP (1024 * 1024 * 1024)

// Alternative (larger) sizing, kept for reference:
//#define SPACETIME_NUM_KEYS (60 * 1000 * 1000)
//#define SPACETIME_NUM_BKTS (64 * 1024 * 1024)
//#define SPACETIME_LOG_CAP  (4 * ((unsigned long long) M_1024)) //(1024 * 1024
//* 1024)

// Bytes left for the user value once the per-object metadata is taken out of a
// KVS value. The macro is expanded where it is used, so spacetime_object_meta
// (declared further down) only has to be complete at that point.
#define ST_VALUE_SIZE (KVS_VALUE_SIZE - sizeof(spacetime_object_meta))

// Special EMPTY opcodes
// NOP terminates a trace: refill_ops() wraps its cursor when it meets it.
#define NOP 150                   // trace
// 127 is the largest value of the 7-bit last_writer_id field.
#define LAST_WRITER_ID_EMPTY 127  // 255
// 255 is the largest value of the uint8_t op_buffer_index field.
#define ST_OP_BUFFER_INDEX_EMPTY 255

/////////////////////////////////////////////
//// ENUMS
/////////////////////////////////////////////
/// WARNING the monotonically increasing assigned numbers to States are used for
/// comparisons (do not reorder / change numbers)
// Per-object protocol states (values are compared numerically, keep the order)
typedef enum {
  VALID_STATE = 1,
  INVALID_STATE = 2,
  INVALID_WRITE_STATE = 3,
  WRITE_STATE = 4,
  REPLAY_STATE = 5
} __attribute__((packed)) hermes_states_t;

// Opcodes of incoming operations (local requests and protocol messages)
typedef enum {
  ST_OP_GET = 111,
  ST_OP_PUT = 112,
  ST_OP_RMW = 113,
  ST_OP_INV = 114,
  ST_OP_ACK = 115,
  ST_OP_VAL = 116,
  ST_OP_CRD = 117,
  ST_OP_MEMBERSHIP_CHANGE = 118,
  ST_OP_MEMBERSHIP_COMPLETE = 119
} __attribute__((packed)) input_opcodes_t;

// Opcodes describing the outcome of an operation
typedef enum {
  ST_GET_COMPLETE = 121,
  ST_PUT_SUCCESS = 122,                // broadcast invs
  ST_REPLAY_SUCCESS = 123,             // broadcast invs
  ST_INV_SUCCESS = 124,                // send ack
  ST_ACK_SUCCESS = 125,
  ST_LAST_ACK_SUCCESS = 126,           // complete local write
  ST_LAST_ACK_NO_BCAST_SUCCESS = 127,  // complete local write
  ST_PUT_COMPLETE = 128,               // broadcast invs
  ST_VAL_SUCCESS = 129,

  ST_MISS = 130,
  ST_GET_STALL = 131,
  ST_PUT_STALL = 132,
  ST_PUT_COMPLETE_SEND_VALS = 133,
  ST_SEND_CRD = 134,

  // RMW opcodes
  ST_RMW_SUCCESS = 135,
  ST_RMW_STALL = 136,
  ST_RMW_COMPLETE = 137,
  ST_RMW_ABORT = 138,
  ST_OP_INV_ABORT = 139  // send inv instead of ACK
} __attribute__((packed)) response_opcodes_t;

// States of a slot (bucket) in a worker's ops buffer
typedef enum {
  ST_EMPTY = 140,
  ST_NEW = 141,
  ST_COMPLETE = 142,
  ST_IN_PROGRESS_PUT = 143,
  ST_IN_PROGRESS_REPLAY = 144,
  ST_REPLAY_COMPLETE = 145,
  ST_IN_PROGRESS_GET = 146,  // Used only in Chain Replication
  ST_REPLAY_COMPLETE_SEND_VALS = 147,
  ST_IN_PROGRESS_RMW = 148,
  ST_RMW_COMPLETE_SEND_VALS = 149
} __attribute__((packed)) op_bucket_states_t;

// Failure-detection opcodes (deprecated).
// WARNING: the numbering starts at 151 because 150 is taken by NOP.
typedef enum {
  ST_OP_HEARTBEAT = 151,
  ST_OP_SUSPICION = 152,
  ST_INV_OUT_OF_GROUP = 153
} __attribute__((packed)) fs_ops_t;

// Kinds of receive buffers, one per message type
typedef enum {
  ST_INV_BUFF = 161,
  ST_ACK_BUFF = 162,
  ST_VAL_BUFF = 163,
  ST_CRD_BUFF = 164
} __attribute__((packed)) rcv_buff_types_t;

/////////////////////////////////////////////
//// Hermes(msg and KV -- spacetime) structs
/////////////////////////////////////////////

// Fixed-size 8 (or 16) byte keys
// The key is a hash split into two bit-fields totalling 64 bits. refill_ops()
// fills it by copying sizeof(spacetime_key_t) bytes from a trace command.
typedef struct {
  //    uint64 __unused; // This should be 8B ////// Uncomment this for
  //    fixed-size 16 byte keys instead
  uint64_t bkt : 48;      // 48-bit part; presumably selects the hash bucket
  unsigned int tag : 16;  // 16-bit part; presumably the in-bucket tag
} spacetime_key_t;

// Per-object metadata. Its size is subtracted from KVS_VALUE_SIZE to obtain
// ST_VALUE_SIZE, so growing this struct shrinks the space left for the value.
// The whole type is volatile-qualified.
typedef volatile struct {
  hermes_states_t state;       // protocol state of the object
  bit_vector_t ack_bv;         // bit vector; presumably the acks gathered so far
  uint8_t RMW_flag : 1;        // shares one byte with last_writer_id
  uint8_t last_writer_id : 7;  // 7 bits: 127 (LAST_WRITER_ID_EMPTY) is the max
  uint8_t op_buffer_index;  // TODO change to uint16_t for a buffer >= 256
  conc_ctrl_t cctrl;        // concurrency-control word (see concur_ctrl.h)
  timestamp_t last_local_write_ts;
} spacetime_object_meta;

// Header shared by every operation and message. The same layout is exposed
// under three names: spacetime_op_meta_t (embedded in ops and invs),
// spacetime_ack_t and spacetime_val_t. The two anonymous unions give a single
// byte a different meaning depending on the user of the struct.
typedef struct {
  spacetime_key_t key; /* This must be the 1st field and 8B or 16B aligned */
  uint8_t opcode;      // both recv / resp //TODO create a union
  union {
    uint8_t state;      // HERMES:  used by spacetime_op_t
    uint8_t sender;     // HERMES:  used by spacetime_inv/ack/val_t
    uint8_t initiator;  // CR:  used by spacetime_inv/ack
  };
  union {
    uint8_t val_len;   // HERMES: unused for spacetime_ack_t and spacetime_val_t
                       // (align for using a single memcpy)
    uint8_t buff_idx;  //    CR: used   for spacetime_ack_t buffer index of
                       //    write initiated this req
  };
  timestamp_t ts;  // timestamp; refill_ops() uses its version and
                   // tie_breaker_id members
} spacetime_op_meta_t, spacetime_ack_t, spacetime_val_t;

// A full operation: header, two bytes of protocol-specific bookkeeping and the
// value payload. The same layout serves as a local request (spacetime_op_t)
// and as an invalidation message (spacetime_inv_t).
typedef struct {
  spacetime_op_meta_t op_meta;  // op_t uses the `state` member of the op_meta
                                // union, inv_t uses `sender` instead
  union {
    struct {                    // Hermes struct
      uint8_t RMW_flag : 1;     // 1 indicates RMWs while 0 normal writes
      uint16_t no_coales : 15;  // used only for skew optimizations: number of
                                // trace requests folded into this op
    };
    struct {              // CR struct
      uint8_t buff_idx;   //    for spacetime_inv_t buffer index of write
                          //    initiated this req
      uint8_t initiator;  // NOTE(review): the original comment here repeated
                          // the buff_idx text; presumably this is the node
                          // that initiated the request -- confirm
    };
  };
  uint8_t value[ST_VALUE_SIZE];
} spacetime_op_t, spacetime_inv_t;

// Group-membership view. The global instance is written by
// group_membership_update() under `lock` and copied without locking by
// group_membership_has_changed(), which retries until the copy is consistent.
typedef struct {
  volatile uint8_t num_of_alive_remotes;  // set to the number of set bits in
                                          // g_membership
  volatile bit_vector_t g_membership;     // one bit per node in the group
  volatile bit_vector_t w_ack_init;  // derived from g_membership by
                                     // bv_reverse(), plus the local node's bit
  seqlock_t lock;                    // guards the three fields above
} spacetime_group_membership;

// The key-value store: currently just a wrapper around a MICA hash table.
struct spacetime_kv {
  // TODO may add kvs stats
  struct mica_kv hash_table;
};

// One request of a trace. refill_ops() walks an array of these and treats an
// entry whose opcode is NOP as the end of the trace.
struct spacetime_trace_command {
  spacetime_key_t key_hash;  // copied verbatim into the key of the issued op
  uint8_t opcode;            // ST_OP_GET / ST_OP_PUT / ST_OP_RMW, or NOP
  uint8_t key_id;  // stores key ids 0-254 otherwise it is set to 255 to
                   // indicate other key ids
};

void spacetime_init(int spacetime_id);
void spacetime_populate_fixed_len(struct spacetime_kv* kv, int n, int val_len);

///////////////////////////////////////
//////////////////// Hermes
///////////////////////////////////////

// Kind of batch handed to hermes_batch_ops_to_KVS()
enum hermes_batch_type_t {
  local_ops = 0,
  local_ops_after_membership_change = 1,
  invs = 2,
  acks = 3,
  vals = 4
};

void hermes_batch_ops_to_KVS(enum hermes_batch_type_t type, uint8_t* op_array,
                             int op_num, uint16_t sizeof_op_elem,
                             spacetime_group_membership curr_membership,
                             int* node_suspected,
                             spacetime_op_t* read_write_ops, uint8_t thread_id);

///////////////////////////////////////
//////////////////// CR(AQ)
///////////////////////////////////////
// Kind of batch handed to cr_batch_ops_to_KVS(), with the nodes that use it
enum cr_type_t {
  Local_ops = 0,      // All nodes
  Remote_writes = 1,  // Head
  Remote_reads = 2,   // Tail
  Invs = 3,           // All except Head
  Acks = 4            // All except Tail
};

void cr_batch_ops_to_KVS(enum cr_type_t cr_type, uint8_t* op_array, int op_num,
                         uint16_t sizeof_op_elem,
                         spacetime_op_t* read_write_op);

///////////////////////////////////////
//////////////////// Helpers
///////////////////////////////////////
// Masks the gathered acks with the current membership (bv_and) and reports,
// via bv_are_equal, whether the result matches that membership.
// `gathered_acks` is received by value, so the caller's vector is untouched.
static inline uint8_t
is_last_ack(bit_vector_t gathered_acks,
            spacetime_group_membership curr_g_membership)
{
  uint8_t matches_membership;

  bv_and(&gathered_acks, curr_g_membership.g_membership);
  matches_membership =
      bv_are_equal(gathered_acks, curr_g_membership.g_membership);

  return matches_membership;
}

// TODO: adapt and use the following functions to re-enable variable length
// object support
//
// Returns the val_len of a MICA op, shifted right by SHIFT_BITS, minus the
// size of the spacetime op header.
// NOTE(review): the result is truncated to uint8_t and wraps around when the
// shifted length is smaller than sizeof(spacetime_op_meta_t).
static inline uint8_t
get_val_len(struct mica_op* op_t)
{
  return (op_t->val_len >> SHIFT_BITS) - sizeof(spacetime_op_meta_t);
}

// Returns the op's val_len, shifted right by SHIFT_BITS, plus the size of the
// spacetime op header. Despite its name it does not modify `op_t`.
// NOTE(review): this shifts in the same direction as get_val_len(); if it is
// meant to be its inverse, a left shift may have been intended -- confirm
// before relying on it. The result is truncated to uint8_t.
static inline uint8_t
set_val_len(spacetime_op_meta_t* op_t)
{
  return (op_t->val_len >> SHIFT_BITS) + sizeof(spacetime_op_meta_t);
}

extern struct spacetime_kv kv;
extern spacetime_group_membership group_membership;

#endif  // HERMES_SPACETIME_H


================================================
FILE: include/hermes/util.h
================================================
//
// Created by akatsarakis on 15/03/18.
//

#ifndef HERMES_UTIL_H
#define HERMES_UTIL_H

#include <stdint.h>
#include <stdio.h>
#include <time.h>
#include "config.h"
#include "hrd.h"
#include "spacetime.h"

// Raw counters kept by each worker thread; the global w_stats array holds one
// instance per worker. Most counters come in groups of four, one per message
// type: invalidations (invs), acknowledgements (acks), validations (vals) and
// credits (crds).
struct worker_stats {
  // Outcome of local requests, updated by refill_ops() when a slot is recycled.
  long long completed_ops_per_worker;   // excludes misses and aborted RMWs
  long long completed_wrs_per_worker;   // slots that ended in ST_PUT_COMPLETE
  long long completed_rmws_per_worker;  // slots that ended in ST_RMW_COMPLETE
  long long aborted_rmws_per_worker;    // slots that ended in ST_RMW_ABORT
  long long reqs_missed_in_kvs;         // slots that ended in ST_MISS

  // Issued messages, per type.
  long long issued_invs_per_worker;
  long long issued_acks_per_worker;
  long long issued_vals_per_worker;
  long long issued_crds_per_worker;

  // Issued packets, per type.
  long long issued_packet_invs_per_worker;
  long long issued_packet_acks_per_worker;
  long long issued_packet_vals_per_worker;
  long long issued_packet_crds_per_worker;

  // NOTE(review): "ss" presumably stands for selective signaling -- confirm.
  long long inv_ss_completions_per_worker;
  long long ack_ss_completions_per_worker;
  long long val_ss_completions_per_worker;
  long long crd_ss_completions_per_worker;

  // Received messages, per type.
  long long received_invs_per_worker;
  long long received_acks_per_worker;
  long long received_vals_per_worker;
  long long received_crds_per_worker;

  // Received packets, per type.
  long long received_packet_invs_per_worker;
  long long received_packet_acks_per_worker;
  long long received_packet_vals_per_worker;
  long long received_packet_crds_per_worker;

  long long received_acks_stalled;  // for faking tail-latency

  long long stalled_time_per_worker;

  long long wasted_loops;  // refill_ops() calls that recycled no slot
  long long total_loops;
  double empty_reqs_per_trace;
  long long cold_keys_per_trace;
  double tot_empty_reqs_per_trace;
};

// Per-worker figures stored as doubles; every array has one entry per worker
// (MAX_WORKERS_PER_MACHINE entries).
// NOTE(review): presumably derived from struct worker_stats by the stats
// thread -- confirm against print_stats_thread().
struct stats {
  double xput_per_worker[MAX_WORKERS_PER_MACHINE];
  double rmw_xput_per_worker[MAX_WORKERS_PER_MACHINE];
  double rmw_abort_rate_per_worker[MAX_WORKERS_PER_MACHINE];

  // Coalescing figures of issued traffic, per message type.
  double issued_invs_avg_coalesing[MAX_WORKERS_PER_MACHINE];
  double issued_acks_avg_coalesing[MAX_WORKERS_PER_MACHINE];
  double issued_vals_avg_coalesing[MAX_WORKERS_PER_MACHINE];
  double issued_crds_avg_coalesing[MAX_WORKERS_PER_MACHINE];

  // Coalescing figures of received traffic, per message type.
  double received_invs_avg_coalesing[MAX_WORKERS_PER_MACHINE];
  double received_acks_avg_coalesing[MAX_WORKERS_PER_MACHINE];
  double received_vals_avg_coalesing[MAX_WORKERS_PER_MACHINE];
  double received_crds_avg_coalesing[MAX_WORKERS_PER_MACHINE];

  double percentage_of_wasted_loops[MAX_WORKERS_PER_MACHINE];
  double completed_reqs_per_loop[MAX_WORKERS_PER_MACHINE];

  //	long long issued_packet_acks_per_worker;
  double batch_size_per_worker[MAX_WORKERS_PER_MACHINE];
  double empty_reqs_per_worker[MAX_WORKERS_PER_MACHINE];
  double stalled_time_per_worker[MAX_WORKERS_PER_MACHINE];
  double average_coalescing_per_worker[MAX_WORKERS_PER_MACHINE];

  double acks_per_worker[MAX_WORKERS_PER_MACHINE];
  double invs_per_worker[MAX_WORKERS_PER_MACHINE];
  double updates_per_worker[MAX_WORKERS_PER_MACHINE];

  double write_ratio_per_worker[MAX_WORKERS_PER_MACHINE];
};

// Zeroes the statistics of all workers. `w_stats` must point to an array of
// MAX_WORKERS_PER_MACHINE entries.
static inline void
init_stats(struct worker_stats* w_stats)
{
  const size_t bytes_for_all_workers =
      sizeof(struct worker_stats) * MAX_WORKERS_PER_MACHINE;

  memset(w_stats, 0, bytes_for_all_workers);
}

void trace_init(struct spacetime_trace_command** trace, uint16_t worker_lid);
void* run_worker(void* arg);
void* print_stats_thread(void* no_arg);
void dump_latency_stats(void);

// Maybe inline these
uint8_t is_state_code(uint8_t code);
uint8_t is_input_code(uint8_t code);
uint8_t is_response_code(uint8_t code);
uint8_t is_bucket_state_code(uint8_t code);

int spawn_stats_thread(void);
char* code_to_str(uint8_t code);

void setup_kvs_buffs(spacetime_op_t** ops, spacetime_inv_t** inv_recv_ops,
                     spacetime_ack_t** ack_recv_ops,
                     spacetime_val_t** val_recv_ops);

extern dbit_vector_t* g_share_qs_barrier;
extern volatile struct worker_stats w_stats[MAX_WORKERS_PER_MACHINE];
#endif  // HERMES_UTIL_H


================================================
FILE: include/mica-herd/city.h
================================================
// city.h - cityhash-c
// CityHash on C
// Copyright (c) 2011-2012, Alexander Nusov
//
// - original copyright notice -
// Copyright (c) 2011 Google, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
//
// CityHash, by Geoff Pike and Jyrki Alakuijala
//
// This file provides a few functions for hashing strings. On x86-64
// hardware in 2011, CityHash64() is faster than other high-quality
// hash functions, such as Murmur.  This is largely due to higher
// instruction-level parallelism.  CityHash64() and CityHash128() also perform
// well on hash-quality tests.
//
// CityHash128() is optimized for relatively long strings and returns
// a 128-bit hash.  For strings more than about 2000 bytes it can be
// faster than CityHash64().
//
// Functions in the CityHash family are not suitable for cryptography.
//
// WARNING: This code has not been tested on big-endian platforms!
// It is known to work well on little-endian platforms that have a small penalty
// for unaligned reads, such as current Intel and AMD moderate-to-high-end CPUs.
//
// By the way, for some hash functions, given strings a and b, the hash
// of a+b is easily derived from the hashes of a and b.  This property
// doesn't hold for any hash functions in this file.

#ifndef CITY_HASH_H_
#define CITY_HASH_H_

#include <stdint.h>
#include <stdlib.h>

typedef uint8_t uint8;
typedef uint32_t uint32;
typedef uint64_t uint64;

// 128-bit value stored as two 64-bit halves; it is both the result type of
// CityHash128() and the seed type of CityHash128WithSeed().
typedef struct _uint128 uint128;
struct _uint128 {
  uint64 first;   // low 64 bits (see Uint128Low64)
  uint64 second;  // high 64 bits (see Uint128High64)
};

// Accessors for the two halves of a uint128.
#define Uint128Low64(x) (x).first
#define Uint128High64(x) (x).second

// Hash function for a byte array.
uint64 CityHash64(const char* buf, size_t len);

// Hash function for a byte array.  For convenience, a 64-bit seed is also
// hashed into the result.
uint64 CityHash64WithSeed(const char* buf, size_t len, uint64 seed);

// Hash function for a byte array.  For convenience, two seeds are also
// hashed into the result.
uint64 CityHash64WithSeeds(const char* buf, size_t len, uint64 seed0,
                           uint64 seed1);

// Hash function for a byte array.
uint128 CityHash128(const char* s, size_t len);

// Hash function for a byte array.  For convenience, a 128-bit seed is also
// hashed into the result.
uint128 CityHash128WithSeed(const char* s, size_t len, uint128 seed);

#endif  // CITY_HASH_H_


================================================
FILE: include/mica-herd/hrd.h
================================================
#ifndef HRD_H
#define HRD_H

#include <assert.h>
#include <errno.h>
#include <numaif.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ipc.h>
#include <sys/shm.h>
#include <sys/time.h>
#include <sys/types.h>
#include <unistd.h>

#include <infiniband/verbs.h>
#include <libmemcached/memcached.h>
#include <malloc.h>
#include <time.h>
#include "sizes.h"

//<vasilis> Multicast
// TODO we do not use hw multicast because it helps only on master-based
// patterns
//#include <rdma/rdma_cma.h>
#include <arpa/inet.h>
#include <byteswap.h>
#include <netdb.h>
#include <netinet/in.h>
#include <sys/socket.h>
// <vasilis>

#define USE_BIG_OBJECTS 0
#define EXTRA_CACHE_LINES 0
#define BASE_VALUE_SIZE 46  // max is --> 46
#define SHIFT_BITS \
  (USE_BIG_OBJECTS == 1 ? 3 : 0)  // number of bits to shift left or right to
                                  // calculate the value length
#define HRD_DEFAULT_PSN \
  3185 /* PSN for all queues */  // starting Packet Sequence Number
#define HRD_DEFAULT_QKEY 0x11111111

#define HRD_QP_NAME_SIZE 200 /* Size (in bytes) of a queue pair name */
#define HRD_RESERVED_NAME_PREFIX "__HRD_RESERVED_NAME_PREFIX"

#define KVS_VALUE_SIZE                                \
  (USE_BIG_OBJECTS == 1                               \
       ? ((EXTRA_CACHE_LINES * 64) + BASE_VALUE_SIZE) \
       : BASE_VALUE_SIZE)  //(169 + 64)// 46 + 64 + 64//32 //(46 + 64)

#define HUGE_PAGE_SIZE 2097152
#define LEVERAGE_TLB_COALESCING 1

/*
 * Small max_inline_data reduces the QP's max WQE size, which reduces the
 * DMA size in doorbell method of WQE fetch.
 */
#define HRD_MAX_INLINE \
  188  //(USE_BIG_OBJECTS == 1 ? ((EXTRA_CACHE_LINES * 64) + 60) : 60) //60 is
       // what kalia had here//

// This is required for ROCE not sure yet why
// <vasilis>
#define IB_PHYS_PORT 1
// </vasilis>
// <akatsarakis>
#define USE_HUGE_PAGES 1
// </akatsarakis>

/* Branch-prediction hints; only defined here if nobody defined them first. */
#ifndef likely
#define likely(x) __builtin_expect(!!(x), 1)
#endif

#ifndef unlikely
#define unlikely(x) __builtin_expect(!!(x), 0)
#endif

/*
 * Compare, print, and exit: when @val is non-zero, write @msg followed by
 * " Error <err_code>" to stderr and terminate the process with @err_code.
 *
 * @msg is printed through "%s" so that a '%' inside the message is never
 * interpreted as a conversion specification.
 *
 * NOTE(review): this expands to a bare `if { ... }`, so it must not be used as
 * the unbraced body of an `if` that has an `else` branch.
 */
#define CPE(val, msg, err_code)                \
  if (unlikely(val)) {                         \
    fprintf(stderr, "%s", msg);                \
    fprintf(stderr, " Error %d \n", err_code); \
    exit(err_code);                            \
  }

/* vasilis added a ceiling and a MAX */
/* Integer division of x by y, rounded up. */
#define CEILING(x, y) (((x) + (y)-1) / (y))
/*
 * Larger of x and y. Both arguments are fully parenthesized so that operators
 * with lower precedence than `>` (e.g. `&`, `|`, `?:`) inside an argument do
 * not regroup. Each argument may still be evaluated twice.
 */
#define MAX(x, y) ((x) > (y) ? (x) : (y))

/*
 * Process-wide settings shared by the translation units that include hrd.h.
 * NOTE(review): these are declared without `extern` (unlike dev_name at the
 * end of this header), so every including .c file emits a tentative
 * definition; this presumably relies on the toolchain merging common symbols
 * (-fcommon) -- confirm before building with -fno-common.
 */
int is_roce;
int machine_id;
char *remote_IP, *local_IP;

/*
 * Registry info about a QP: the record type returned by
 * hrd_get_published_qp().
 */
struct hrd_qp_attr {
  char name[HRD_QP_NAME_SIZE]; /* Name the QP is published under */

  /*
   * RoCE only: the two GID fields are stored separately (rather than as a
   * union) by the original author's preference.
   */
  uint64_t
      gid_global_interface_id;
  uint64_t gid_global_subnet_prefix;

  /* Info about the RDMA buffer associated with this QP */
  uintptr_t buf_addr;
  uint32_t buf_size;
  uint32_t rkey;

  /* presumably the port LID, QP number and service level -- TODO confirm */
  int lid;
  int qpn;
  uint8_t sl;
};

/*
 * Control block bundling the verbs objects (device context, protection
 * domain, datagram QPs, completion queues and the receive buffer) used by one
 * local endpoint. Created by hrd_ud_ctrl_blk_init() and released by
 * hrd_ud_ctrl_blk_destroy().
 */
struct hrd_ud_ctrl_blk {
  int local_hid; /* Local ID on the machine this process runs on */

  /* Info about the device/port to use for this control block */
  struct ibv_context* ctx;
  int device_id;    /* Resolved by libhrd from @port_index */
  int dev_port_id;  /* 1-based within dev @device_id. Resolved by libhrd */
  int numa_node_id; /* NUMA node id */

  struct ibv_pd* pd; /* A protection domain for this control block */

  /* Datagram QPs */
  int num_dgram_qps;
  struct ibv_qp** dgram_qp;
  struct ibv_cq **dgram_send_cq, **dgram_recv_cq;
  volatile uint8_t* dgram_buf; /* A buffer for RECVs on dgram QPs */
  int* recv_q_depth; /* presumably one depth per datagram QP -- confirm */
  int* send_q_depth; /* presumably one depth per datagram QP -- confirm */
  int dgram_buf_shm_key;
  struct ibv_mr* dgram_buf_mr; /* Memory region registered for @dgram_buf */
};

/* Major initialization functions */

struct hrd_ud_ctrl_blk* hrd_ud_ctrl_blk_init(
    int local_hid, int port_index,
    int numa_node_id, /* -1 means don't use hugepages */
    int num_dgram_qps, int dgram_buf_size, int dgram_buf_shm_key,
    int* recv_q_depth, int* send_q_depth);

int hrd_ud_ctrl_blk_destroy(struct hrd_ud_ctrl_blk* cb);

/* RDMA resolution functions */
struct ibv_device* hrd_resolve_port_index(struct hrd_ud_ctrl_blk* cb,
                                          int port_index);

uint16_t hrd_get_local_lid(struct ibv_context* ctx, int port_id);

void hrd_create_dgram_qps(struct hrd_ud_ctrl_blk* cb);

/*
 * Fill @wc with @num_comps comps from this @cq. Exit on error.
 *
 * Busy-polls @cq until @num_comps work completions have been collected.
 * Every completion returned by a poll is checked (not only the first one of
 * the batch), and a negative return from ibv_poll_cq() is treated as a fatal
 * error instead of being added to the completion count.
 *
 * A "stuck" diagnostic is printed each time the poll counter exceeds M_256,
 * after which the counter restarts from zero.
 *
 * Returns the number of poll iterations counted since the last such reset.
 *
 * NOTE(review): the error paths terminate with exit status 0, as in the
 * original code -- confirm whether a non-zero status is wanted.
 */
static inline uint32_t
hrd_poll_cq(struct ibv_cq* cq, int num_comps, struct ibv_wc* wc)
{
  int comps = 0;
  uint32_t debug_cnt = 0;
  while (comps < num_comps) {
    if (debug_cnt > M_256) {
      printf("Someone is stuck waiting for a completion %d / %d  \n", comps,
             num_comps);
      debug_cnt = 0;
    }
    int new_comps = ibv_poll_cq(cq, num_comps - comps, &wc[comps]);
    if (new_comps < 0) {
      /* The poll itself failed; wc[] holds nothing valid for this call */
      fprintf(stderr, "ibv_poll_cq failed with %d\n", new_comps);
      exit(0);
    }
    /* Validate each completion delivered by this poll */
    for (int i = 0; i < new_comps; ++i) {
      if (wc[comps + i].status != 0) {
        fprintf(stderr, "Bad wc status %d\n", wc[comps + i].status);
        exit(0);
      }
    }
    comps += new_comps;
    debug_cnt++;
  }
  return debug_cnt;
}

/*
 * Register @size bytes starting at @buf as a memory region in protection
 * domain @pd, with local-write plus remote read/write/atomic access.
 * The result is only checked through assert().
 */
static inline struct ibv_mr*
register_buffer(struct ibv_pd* pd, void* buf, uint32_t size)
{
  struct ibv_mr* mr;
  int ib_flags = IBV_ACCESS_LOCAL_WRITE;

  ib_flags |= IBV_ACCESS_REMOTE_READ;
  ib_flags |= IBV_ACCESS_REMOTE_WRITE;
  ib_flags |= IBV_ACCESS_REMOTE_ATOMIC;

  mr = ibv_reg_mr(pd, (char*)buf, size, ib_flags);
  assert(mr != NULL);
  return mr;
}

/* Registry functions */
void hrd_publish(const char* key, void* value, int len);
int hrd_get_published(const char* key, void** value);

///* Publish the nth connected queue pair from this cb with this name */
// void hrd_publish_conn_qp(struct hrd_ud_ctrl_blk *cb, int n, const char
// *qp_name);

/* Publish the nth datagram queue pair from this cb with this name */
void hrd_publish_dgram_qp(struct hrd_ud_ctrl_blk* cb, int n,
                          const char* qp_name, uint8_t sl);

struct hrd_qp_attr* hrd_get_published_qp(const char* qp_name);

/* Utility functions */
/*
 * Linear congruential generator step: advances the 64-bit state in @seed
 * (multiplier 1103515245, increment 12345) and returns the upper 32 bits of
 * the new state.
 */
static inline uint32_t
hrd_fastrand(uint64_t* seed)
{
  const uint64_t next_state = (*seed) * 1103515245 + 12345;

  *seed = next_state;
  return (uint32_t)(next_state >> 32);
}

void* hrd_malloc_socket(int shm_key, uint64_t size, int socket_id);
int hrd_free(int shm_key, void* shm_buf);
char* hrd_getenv(const char* name);

// Like printf, but colorful. Limited to 1000 characters.
typedef enum { YELLOW = 0, RED, GREEN, CYAN } color_print_t;
void colored_printf(color_print_t color, const char* format, ...);

extern char dev_name[50];
#endif /* HRD_H */


================================================
FILE: include/mica-herd/mica.h
================================================
#ifndef MICA_H
#define MICA_H

#include <stdint.h>
#include "city.h"
#include "hrd.h"

/*
 * The polling logic in HERD requires the following:
 * 1. 0 < MICA_OP_GET < MICA_OP_PUT < HERD_OP_GET < HERD_OP_PUT
 * 2. HERD_OP_GET = MICA_OP_GET + HERD_MICA_OFFSET
 * 3. HERD_OP_PUT = MICA_OP_PUT + HERD_MICA_OFFSET
 *
 * This allows us to detect HERD requests by checking if the request region
 * opcode is more than MICA_OP_PUT. And then we can convert a HERD opcode to
 * a MICA opcode by subtracting HERD_MICA_OFFSET from it.
 */
#define MICA_OP_PUT 112

/* Ensure that a mica_op is cacheline aligned */
#define MICA_OP_METADATA \
  (sizeof(struct mica_key) + sizeof(uint8_t) + sizeof(uint8_t))
#define MICA_MIN_VALUE (64 - MICA_OP_METADATA)
#define MICA_MAX_VALUE                                                \
  (USE_BIG_OBJECTS == 1 ? (MICA_MIN_VALUE + (EXTRA_CACHE_LINES * 64)) \
                        : MICA_MIN_VALUE)

#define MICA_LOG_BITS 40

#define MICA_INDEX_SHM_KEY 3185
#define MICA_LOG_SHM_KEY 4185

/*
 * Debug values:
 * 0: No safety checks on fast path
 * 1: Sanity checks for arguments
 * 2: Pretty print GET/PUT operations
 */

#define MICA_DEBUG 0

/* Per-operation response record; mica_insert_one() takes one as output. */
struct mica_resp {
  uint8_t type;
  uint8_t val_len;
  uint16_t unused[3]; /* Make val_ptr 8-byte aligned */
  uint8_t* val_ptr;
};

/*
 * Fixed-size 16 byte keys, declared as bit-fields totalling 128 bits:
 * 64 unused bits, a 32-bit bucket field, and 16-bit server and tag fields.
 */
struct mica_key {
  unsigned long long __unused : 64;
  unsigned int bkt : 32;
  unsigned int server : 16;
  unsigned int tag : 16;
};

/*
 * One key-value operation: key, opcode, value length and an inline value of
 * up to MICA_MAX_VALUE bytes. MICA_OP_METADATA is the size of the three
 * fields preceding @value.
 */
struct mica_op {
  struct mica_key key; /* This must be the 1st field and 16B aligned */
  uint8_t opcode;
  uint8_t val_len;
  uint8_t value[MICA_MAX_VALUE];
};

/*
 * One index slot. The declared bit-field widths add up to 64 bits:
 * 1 (in_use) + (64 - MICA_LOG_BITS - 1) (tag) + MICA_LOG_BITS (offset).
 * NOTE(review): the fields mix uint32_t and uint64_t base types, so whether
 * the compiler packs them into exactly 8 bytes is implementation-defined --
 * confirm with a sizeof check.
 */
struct mica_slot {
  uint32_t in_use : 1;
  uint32_t tag : (64 - MICA_LOG_BITS - 1);
  uint64_t offset : MICA_LOG_BITS;
};

/* A hash-index bucket: a fixed group of 8 slots. */
struct mica_bkt {
  struct mica_slot slots[8];
};

/*
 * State of one MICA instance: the bucket index, the byte log, their sizing
 * masks, the current log head and two statistics counters. Initialized by
 * mica_init().
 */
struct mica_kv {
  struct mica_bkt* ht_index; /* Array of buckets (the index) */
  uint8_t* ht_log;           /* Backing storage of the log */

  /* Metadata */
  int instance_id; /* ID of this MICA instance. Used for shm keys */

  uint64_t num_bkts; /* Number of buckets requested by user */
  uint64_t bkt_mask; /* Mask down from a mica_key's @bkt to a bucket */

  uint64_t log_cap;  /* Capacity of circular log in bytes */
  uint64_t log_mask; /* Mask down from a slot's @offset to a log offset */

  /* State */
  uint64_t log_head;

  /* Stats */
  long long num_insert_op;       /* Number of PUT requests executed */
  long long num_index_evictions; /* Number of entries evicted from index */
};

void mica_init(struct mica_kv* kv, int instance_id, int node_id, int num_bkts,
               uint64_t log_cap);

/* Single-key INSERT */
void mica_insert_one(struct mica_kv* kv, struct mica_op* op,
                     struct mica_resp* res);

/* Helpers */
uint128* mica_gen_keys(int n);

///* Debug functions */
void mica_print_op(struct mica_op* op);

#endif


================================================
FILE: include/mica-herd/sizes.h
================================================
/*
 * Power-of-two size constants. K_<n> is n * 1024 and M_<n> is n * 1024 * 1024;
 * the variants with a trailing underscore are the same value minus one.
 * Wrapped in an include guard so the header is safe to pull in repeatedly.
 */
#ifndef HERMES_SIZES_H
#define HERMES_SIZES_H

#define K_32 32768

#define K_64 65536

#define K_128 131072
#define K_128_ 131071

#define K_256 262144
#define K_256_ 262143

#define K_512 524288
#define K_512_ 524287

#define M_1 1048576
#define M_1_ 1048575

#define M_2 2097152
#define M_2_ 2097151

#define M_4 4194304
#define M_4_ 4194303

#define M_8 8388608
#define M_8_ 8388607

#define M_16 16777216
#define M_16_ 16777215

#define M_32 33554432
#define M_32_ 33554431

#define M_128 134217728
#define M_128_ 134217727

#define M_256 268435456
#define M_256_ 268435455

#define M_512 536870912
#define M_512_ 536870911

#define M_1024 1073741824
#define M_1024_ 1073741823

#define MILLION 1000000

#endif /* HERMES_SIZES_H */


================================================
FILE: include/utils/bit_vector.h
================================================
//
// Created by akatsarakis on 11/12/18.
//

#ifndef HERMES_BIT_VECTOR_H
#define HERMES_BIT_VECTOR_H

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// Change accordingly
#define BV_BIT_VECTOR_SIZE \
  8  // Set if you use statical bit vector (bit_vector_t)
#define BV_ENABLE_BIT_VECTOR_ASSERTS 1

// Do not change the following defines
// Integer division of x by y, rounded up.
#define BV_CEILING(x, y) (((x) + (y)-1) / (y))
#define BV_BITS_IN_A_BYTE 8

// Bytes needed to hold the statically sized bit vector
#define BV_BIT_VECTOR_SIZE_IN_BYTES \
  BV_CEILING(BV_BIT_VECTOR_SIZE, BV_BITS_IN_A_BYTE)

// Index of the byte that holds `bit`. The argument is parenthesized so that
// expressions such as `a + b` are divided as a whole.
#define BV_BIT_SLOT(bit) ((bit) / BV_BITS_IN_A_BYTE)
// Mask selecting `bit` within its byte (bit 0 is the least significant bit).
// The argument is parenthesized so `a + b` is reduced modulo 8 as a whole.
#define BV_BIT_MOD(bit) ((uint8_t)1 << ((bit) % BV_BITS_IN_A_BYTE))

// print binary numbers
// BYTE_TO_BINARY_PATTERN is a printf format consuming eight characters;
// BYTE_TO_BINARY(byte) expands to those eight '0'/'1' arguments, most
// significant bit first. `byte` is parenthesized so that an expression
// argument (e.g. `a | b`) is masked as a whole; it is evaluated eight times.
#define BYTE_TO_BINARY_PATTERN "%c%c%c%c%c%c%c%c"
#define BYTE_TO_BINARY(byte)                                    \
  ((byte)&0x80 ? '1' : '0'), ((byte)&0x40 ? '1' : '0'),         \
      ((byte)&0x20 ? '1' : '0'), ((byte)&0x10 ? '1' : '0'),     \
      ((byte)&0x08 ? '1' : '0'), ((byte)&0x04 ? '1' : '0'),     \
      ((byte)&0x02 ? '1' : '0'), ((byte)&0x01 ? '1' : '0')

// Statically sized bit vector: BV_BIT_VECTOR_SIZE bits stored inline in
// BV_BIT_VECTOR_SIZE_IN_BYTES bytes. Used by the bv_* API.
typedef struct {
  uint8_t bit_array[BV_BIT_VECTOR_SIZE_IN_BYTES];
} bit_vector_t;

// Dynamically sized bit vector used by the dbv_* API. The size is held in a
// uint8_t, so a vector can describe at most 255 bits.
typedef struct {
  uint8_t bv_size;     // in bits
  uint8_t* bit_array;  // bit_array len == ceil(bv_size / 8)
} dbit_vector_t;

// Smallest number of whole bytes able to hold `bits` bits
// (i.e. bits / 8 rounded up).
static inline uint16_t
bv_bits_to_bytes(uint16_t bits)
{
  const uint16_t byte_cnt = (uint16_t)BV_CEILING(bits, BV_BITS_IN_A_BYTE);

  return byte_cnt;
}

/////////////////////////////////////////
/// Internal Bitvector API functions (should not be called directly)
/////////////////////////////////////////

// Clears every byte that backs a vector of `size_in_bits` bits.
static inline void
bv_init_internal(uint8_t* bit_array, uint16_t size_in_bits)
{
  const uint16_t n_bytes = bv_bits_to_bytes(size_in_bits);
  uint16_t idx = 0;

  while (idx < n_bytes) {
    bit_array[idx] = 0;
    ++idx;
  }
}

// Returns 1 when bit number `bit` is set and 0 otherwise. With
// BV_ENABLE_BIT_VECTOR_ASSERTS on, asserts that `bit` is inside the vector.
static inline uint8_t
bv_bit_get_internal(const uint8_t* bit_array, uint16_t size_in_bits,
                    uint8_t bit)
{
  if (BV_ENABLE_BIT_VECTOR_ASSERTS) assert(bit < size_in_bits);

  const uint8_t holder = bit_array[BV_BIT_SLOT(bit)];

  if ((holder & BV_BIT_MOD(bit)) == 0) return 0;
  return 1;
}

// Turns bit number `bit` on. With BV_ENABLE_BIT_VECTOR_ASSERTS on, asserts
// that `bit` is inside the vector.
static inline void
bv_bit_set_internal(uint8_t* bit_array, uint16_t size_in_bits, uint8_t bit)
{
  if (BV_ENABLE_BIT_VECTOR_ASSERTS) assert(bit < size_in_bits);

  uint8_t* const slot = &bit_array[BV_BIT_SLOT(bit)];

  *slot = (uint8_t)(*slot | BV_BIT_MOD(bit));
}

// Turns bit number `bit` off. With BV_ENABLE_BIT_VECTOR_ASSERTS on, asserts
// that `bit` is inside the vector.
static inline void
bv_bit_reset_internal(uint8_t* bit_array, uint16_t size_in_bits, uint8_t bit)
{
  if (BV_ENABLE_BIT_VECTOR_ASSERTS) assert(bit < size_in_bits);

  uint8_t* const slot = &bit_array[BV_BIT_SLOT(bit)];

  *slot = (uint8_t)(*slot & ~(BV_BIT_MOD(bit)));
}

static inline void
bv_set_all_internal(uint8_t* bit_array, uint16_t size_in_bits)
{
  uint8_t bytes = (uint8_t)bv_bits_to_bytes(size_in_bits);
  uint8_t unused = (uint8_t)(bytes * 8 - size_in_bits);
  uint8_t last_byte = (uint8_t)(255 >> unused);

  for (int i = 0; i < bytes - 1; ++i)
    bit_array[i] = 255;

  bit_array[bytes - 1] = last_byte;
}

// Clears all bits by zeroing each byte that backs the vector.
static inline void
bv_reset_all_internal(uint8_t* bit_array, uint16_t size_in_bits)
{
  const uint16_t n_bytes = bv_bits_to_bytes(size_in_bits);
  uint8_t* cursor = bit_array;
  uint8_t* const end = bit_array + n_bytes;

  for (; cursor != end; ++cursor)
    *cursor = 0;
}

// Returns 1 when both vectors have the same size and the same value for
// every bit inside that size, 0 otherwise. Padding bits in the last byte are
// ignored.
//
// Two zero-sized vectors compare equal; the early return avoids reading
// ba[-1] and calling memcmp() with a wrapped-around length.
static inline uint8_t
bv_are_equal_internal(uint8_t* ba1, uint16_t size_in_bits1, uint8_t* ba2,
                      uint16_t size_in_bits2)
{
  if (size_in_bits1 != size_in_bits2) return 0;

  uint16_t size_in_bytes = bv_bits_to_bytes(size_in_bits1);
  if (size_in_bytes == 0) return 1;

  // shift the unused bits to avoid failing due to them
  // (difference only in the unused bits)
  uint8_t unused_ms_bits =
      (uint8_t)(BV_BITS_IN_A_BYTE * size_in_bytes - size_in_bits1);
  uint8_t last_byte1 = (uint8_t)(ba1[size_in_bytes - 1] << unused_ms_bits);
  uint8_t last_byte2 = (uint8_t)(ba2[size_in_bytes - 1] << unused_ms_bits);

  // all full bytes must match exactly, the last one only in its used bits
  return (uint8_t)(memcmp(ba1, ba2, (size_t)(size_in_bytes - 1)) == 0 &&
                           last_byte1 == last_byte2
                       ? 1
                       : 0);
}

// Copies the backing bytes of the source vector over the destination.
// Copying between vectors of different sizes trips assert(0).
static inline void
bv_copy_internal(uint8_t* ba_dst, uint16_t size_in_bits_dst, uint8_t* ba_src,
                 uint16_t size_in_bits_src)
{
  const size_t n_bytes = bv_bits_to_bytes(size_in_bits_src);

  // sizes have to agree before anything is written
  if (size_in_bits_dst != size_in_bits_src) assert(0);

  memcpy(ba_dst, ba_src, n_bytes);
}

// Counts the set bits among the first `size_in_bits` bits.
//
// The loop index is as wide as `size_in_bits`; a uint8_t index would wrap
// at 256 and never terminate for larger vectors.
// NOTE: the result is returned as uint8_t, so a count above 255 wraps.
static inline uint8_t
bv_no_setted_bits_internal(uint8_t* bit_array, uint16_t size_in_bits)
{
  uint8_t cnt = 0;
  for (uint16_t bit = 0; bit < size_in_bits; ++bit)
    if ((bit_array[BV_BIT_SLOT(bit)] & BV_BIT_MOD(bit)) != 0) ++cnt;
  return cnt;
}

/// Bitvector Bitwise ops internal

// Flips every bit of every backing byte (padding bits in the last byte are
// flipped as well).
static inline void
bv_reverse_internal(uint8_t* bit_array, uint16_t size_in_bits)
{
  const uint16_t n_bytes = bv_bits_to_bytes(size_in_bits);
  uint16_t idx = 0;

  while (idx < n_bytes) {
    bit_array[idx] = (uint8_t)~bit_array[idx];
    ++idx;
  }
}

// dst &= src, byte by byte. Vectors of different sizes trip assert(0).
static inline void
bv_and_internal(uint8_t* ba_dst, uint16_t size_in_bits_dst,
                const uint8_t* ba_src, uint16_t size_in_bits_src)
{
  // sizes have to agree before anything is combined
  if (size_in_bits_dst != size_in_bits_src) assert(0);

  const uint16_t n_bytes = bv_bits_to_bytes(size_in_bits_dst);
  uint16_t idx = 0;

  while (idx < n_bytes) {
    ba_dst[idx] = (uint8_t)(ba_dst[idx] & ba_src[idx]);
    ++idx;
  }
}

// dst |= src, byte by byte. Vectors of different sizes trip assert(0).
static inline void
bv_or_internal(uint8_t* ba_dst, uint16_t size_in_bits_dst,
               const uint8_t* ba_src, uint16_t size_in_bits_src)
{
  // sizes have to agree before anything is combined
  if (size_in_bits_dst != size_in_bits_src) assert(0);

  const uint16_t n_bytes = bv_bits_to_bytes(size_in_bits_dst);
  uint16_t idx = 0;

  while (idx < n_bytes) {
    ba_dst[idx] = (uint8_t)(ba_dst[idx] | ba_src[idx]);
    ++idx;
  }
}

/// Bitvector Print functions

// Prints the vector as '0'/'1' characters, highest byte first and, within a
// byte, most significant bit first. No newline is written.
static inline void
bv_print_internal(const uint8_t* bit_array, uint16_t size_in_bits)
{
  int remaining = bv_bits_to_bytes(size_in_bits);

  while (remaining-- > 0)
    printf(BYTE_TO_BINARY_PATTERN, BYTE_TO_BINARY(bit_array[remaining]));
}

// Prints the vector on its own line, prefixed with a "Bit vector: " label.
static inline void
bv_print_enhanced_internal(const uint8_t* bit_array, uint16_t size_in_bits)
{
  fputs("Bit vector: ", stdout);
  bv_print_internal(bit_array, size_in_bits);
  putchar('\n');
}

/////////////////////////////////////////
/// Dynamic Bitvector API functions
/////////////////////////////////////////
// Allocates a dynamic bit vector of `size` bits with all bits cleared and
// stores it in *bv. Release it with dbv_destroy().
//
// If either allocation fails, nothing is leaked and *bv is set to NULL
// (previously the NULL result of malloc() was dereferenced).
static inline void
dbv_init(dbit_vector_t** bv, uint8_t size)
{
  uint16_t bv_size_in_bytes = bv_bits_to_bytes(size);

  dbit_vector_t* vec = malloc(sizeof(dbit_vector_t));
  if (vec == NULL) {
    *bv = NULL;
    return;
  }

  vec->bit_array = malloc(bv_size_in_bytes * sizeof(uint8_t));
  // malloc(0) may legitimately return NULL, so only a non-empty request
  // counts as a failure
  if (vec->bit_array == NULL && bv_size_in_bytes > 0) {
    free(vec);
    *bv = NULL;
    return;
  }

  vec->bv_size = size;
  bv_init_internal(vec->bit_array, size);
  *bv = vec;
}

// Frees a vector created by dbv_init(). Passing NULL is a no-op, mirroring
// free(NULL).
static inline void
dbv_destroy(dbit_vector_t* bv)
{
  if (bv == NULL) return;

  free(bv->bit_array);
  free(bv);
}

// Reads bit `bit` of `bv` (1 if set, 0 if clear). The index is narrowed to
// uint8_t before the lookup.
static inline uint8_t
dbv_bit_get(dbit_vector_t bv, int bit)
{
  const uint8_t bit_idx = (uint8_t)bit;
  const uint16_t n_bits = bv.bv_size;

  return bv_bit_get_internal(bv.bit_array, n_bits, bit_idx);
}

// Sets bit `bit` of the dynamic vector `bv`.
static inline void
dbv_bit_set(dbit_vector_t* bv, uint8_t bit)
{
  uint8_t* const bits = bv->bit_array;
  const uint16_t n_bits = bv->bv_size;

  bv_bit_set_internal(bits, n_bits, bit);
}

// Clears bit `bit` of the dynamic vector `bv`.
static inline void
dbv_bit_reset(dbit_vector_t* bv, uint8_t bit)
{
  uint8_t* const bits = bv->bit_array;
  const uint16_t n_bits = bv->bv_size;

  bv_bit_reset_internal(bits, n_bits, bit);
}

// Sets every bit of the dynamic vector `bv`.
static inline void
dbv_set_all(dbit_vector_t* bv)
{
  uint8_t* const bits = bv->bit_array;
  const uint16_t n_bits = bv->bv_size;

  bv_set_all_internal(bits, n_bits);
}

// Clears every bit of the dynamic vector `bv`.
static inline void
dbv_reset_all(dbit_vector_t* bv)
{
  uint8_t* const bits = bv->bit_array;
  const uint16_t n_bits = bv->bv_size;

  bv_reset_all_internal(bits, n_bits);
}

// Number of bits currently set in `bv`.
static inline uint8_t
dbv_no_setted_bits(dbit_vector_t bv)
{
  const uint16_t n_bits = bv.bv_size;
  const uint8_t set_cnt = bv_no_setted_bits_internal(bv.bit_array, n_bits);

  return set_cnt;
}

// Returns 1 when `bv1` and `bv2` have the same size and contents, else 0.
static inline uint8_t
dbv_are_equal(dbit_vector_t bv1, dbit_vector_t bv2)
{
  const uint16_t n_bits1 = bv1.bv_size;
  const uint16_t n_bits2 = bv2.bv_size;

  return bv_are_equal_internal(bv1.bit_array, n_bits1, bv2.bit_array, n_bits2);
}

// Copies the bits of `bv_src` into `bv_dst`; both must have the same size.
static inline void
dbv_copy(dbit_vector_t* bv_dst, dbit_vector_t bv_src)
{
  const uint16_t dst_bits = bv_dst->bv_size;
  const uint16_t src_bits = bv_src.bv_size;

  bv_copy_internal(bv_dst->bit_array, dst_bits, bv_src.bit_array, src_bits);
}

// Returns 1 when every bit of `bv` is set, 0 otherwise.
//
// The check counts the set bits instead of building a temporary all-ones
// vector: the previous implementation allocated that temporary with
// dbv_init() on every call and never freed it. bv_size is a uint8_t, so the
// 8-bit count returned by dbv_no_setted_bits() cannot wrap here.
static inline uint8_t
dbv_is_all_set(dbit_vector_t bv)
{
  return (uint8_t)(dbv_no_setted_bits(bv) == bv.bv_size ? 1 : 0);
}

/// Bitvector bitwise ops
// Inverts every bit of the dynamic vector `bv`.
static inline void
dbv_reverse(dbit_vector_t* bv)
{
  uint8_t* const bits = bv->bit_array;
  const uint16_t n_bits = bv->bv_size;

  bv_reverse_internal(bits, n_bits);
}

// bv_dst &= bv_src; both vectors must have the same size.
static inline void
dbv_and(dbit_vector_t* bv_dst, dbit_vector_t bv_src)
{
  const uint16_t dst_bits = bv_dst->bv_size;
  const uint16_t src_bits = bv_src.bv_size;

  bv_and_internal(bv_dst->bit_array, dst_bits, bv_src.bit_array, src_bits);
}

// bv_dst |= bv_src; both vectors must have the same size.
static inline void
dbv_or(dbit_vector_t* bv_dst, dbit_vector_t bv_src)
{
  const uint16_t dst_bits = bv_dst->bv_size;
  const uint16_t src_bits = bv_src.bv_size;

  bv_or_internal(bv_dst->bit_array, dst_bits, bv_src.bit_array, src_bits);
}

/// Bitvector Print functions

// Prints the bits of `bv` as '0'/'1' characters (no label, no newline).
static inline void
dbv_print(dbit_vector_t bv)
{
  const uint8_t* const bits = bv.bit_array;
  const uint16_t n_bits = bv.bv_size;

  bv_print_internal(bits, n_bits);
}

// Prints `bv` on its own labelled line.
static inline void
dbv_print_enhanced(dbit_vector_t bv)
{
  const uint8_t* const bits = bv.bit_array;
  const uint16_t n_bits = bv.bv_size;

  bv_print_enhanced_internal(bits, n_bits);
}

// Self-test of the dynamic bit vector API on two 22-bit vectors: one scratch
// vector and one all-ones reference. Exercises set/reset per bit, reverse,
// get, reset_all, set_all, and, copy and or, checking each step with assert()
// (so the checks vanish when NDEBUG is defined).
// Both vectors are released before returning; they used to be leaked.
static inline void
dbv_unit_test(void)
{
  dbit_vector_t* bv;
  dbit_vector_t* bv_set__all;
  dbv_init(&bv, 22);
  dbv_init(&bv_set__all, 22);
  dbv_set_all(bv_set__all);

  // setting each bit individually must match set_all
  for (uint8_t i = 0; i < bv->bv_size; ++i)
    dbv_bit_set(bv, i);
  assert(dbv_are_equal(*bv, *bv_set__all) == 1);

  // clearing each bit and inverting must match set_all as well
  for (uint8_t i = 0; i < bv->bv_size; ++i)
    dbv_bit_reset(bv, i);
  dbv_reverse(bv);
  assert(dbv_are_equal(*bv, *bv_set__all) == 1);

  // alternate reset/set and read each bit back
  for (uint8_t i = 0; i < bv->bv_size; ++i)
    if (i % 2 == 0) {
      dbv_bit_reset(bv, i);
      assert(dbv_bit_get(*bv, i) == 0);
    } else {
      dbv_bit_set(bv, i);
      assert(dbv_bit_get(*bv, i) == 1);
    }

  dbv_reset_all(bv);
  assert(dbv_are_equal(*bv, *bv_set__all) == 0);

  dbv_set_all(bv);
  dbv_and(bv, *bv_set__all);
  assert(dbv_are_equal(*bv, *bv_set__all) == 1);

  dbv_copy(bv, *bv_set__all);
  assert(dbv_are_equal(*bv, *bv_set__all) == 1);

  dbv_reset_all(bv);
  dbv_or(bv, *bv_set__all);
  assert(dbv_are_equal(*bv, *bv_set__all) == 1);

  dbv_destroy(bv);
  dbv_destroy(bv_set__all);
  printf("Dynamic Bit Vector Unit Test was Successful!\n");
}

/////////////////////////////////////////
/// Static Bitvector API functions
/////////////////////////////////////////

// Clears all BV_BIT_VECTOR_SIZE bits of the static vector `bv`.
static inline void
bv_init(bit_vector_t* bv)
{
  const uint16_t n_bits = BV_BIT_VECTOR_SIZE;

  bv_init_internal(bv->bit_array, n_bits);
}

// Reads bit `bit` of the static vector (1 if set, 0 if clear). The index is
// narrowed to uint8_t before the lookup.
static inline uint8_t
bv_bit_get(bit_vector_t bv, int bit)
{
  const uint8_t bit_idx = (uint8_t)bit;
  const uint16_t n_bits = BV_BIT_VECTOR_SIZE;

  return bv_bit_get_internal(bv.bit_array, n_bits, bit_idx);
}

// Sets bit `bit` of the static vector `bv`.
static inline void
bv_bit_set(bit_vector_t* bv, uint8_t bit)
{
  const uint16_t n_bits = BV_BIT_VECTOR_SIZE;

  bv_bit_set_internal(bv->bit_array, n_bits, bit);
}

// Clears bit `bit` of the static vector `bv`.
static inline void
bv_bit_reset(bit_vector_t* bv, uint8_t bit)
{
  const uint16_t n_bits = BV_BIT_VECTOR_SIZE;

  bv_bit_reset_internal(bv->bit_array, n_bits, bit);
}

// Sets every bit of the static vector `bv`.
static inline void
bv_set_all(bit_vector_t* bv)
{
  const uint16_t n_bits = BV_BIT_VECTOR_SIZE;

  bv_set_all_internal(bv->bit_array, n_bits);
}

// Clears every bit of the static vector `bv`.
static inline void
bv_reset_all(bit_vector_t* bv)
{
  const uint16_t n_bits = BV_BIT_VECTOR_SIZE;

  bv_reset_all_internal(bv->bit_array, n_bits);
}

// Number of bits currently set in the static vector `bv`.
static inline uint8_t
bv_no_setted_bits(bit_vector_t bv)
{
  const uint16_t n_bits = BV_BIT_VECTOR_SIZE;
  const uint8_t set_cnt = bv_no_setted_bits_internal(bv.bit_array, n_bits);

  return set_cnt;
}

// Returns 1 when the two static vectors hold identical bits, else 0.
static inline uint8_t
bv_are_equal(bit_vector_t bv1, bit_vector_t bv2)
{
  const uint16_t n_bits = BV_BIT_VECTOR_SIZE;

  return bv_are_equal_internal(bv1.bit_array, n_bits, bv2.bit_array, n_bits);
}

// Copies the bits of `bv_src` into `bv_dst`.
static inline void
bv_copy(bit_vector_t* bv_dst, bit_vector_t bv_src)
{
  const uint16_t n_bits = BV_BIT_VECTOR_SIZE;

  bv_copy_internal(bv_dst->bit_array, n_bits, bv_src.bit_array, n_bits);
}

/// Bitvector bitwise ops
// Inverts every bit of the static vector `bv`.
static inline void
bv_reverse(bit_vector_t* bv)
{
  const uint16_t n_bits = BV_BIT_VECTOR_SIZE;

  bv_reverse_internal(bv->bit_array, n_bits);
}

// bv_dst &= bv_src for static vectors.
static inline void
bv_and(bit_vector_t* bv_dst, bit_vector_t bv_src)
{
  const uint16_t n_bits = BV_BIT_VECTOR_SIZE;

  bv_and_internal(bv_dst->bit_array, n_bits, bv_src.bit_array, n_bits);
}

// bv_dst |= bv_src for static vectors.
static inline void
bv_or(bit_vector_t* bv_dst, bit_vector_t bv_src)
{
  const uint16_t n_bits = BV_BIT_VECTOR_SIZE;

  bv_or_internal(bv_dst->bit_array, n_bits, bv_src.bit_array, n_bits);
}

/// Bitvector Print functions

// Prints the bits of the static vector as '0'/'1' characters (no label, no
// newline).
static inline void
bv_print(bit_vector_t bv)
{
  const uint16_t n_bits = BV_BIT_VECTOR_SIZE;

  bv_print_internal(bv.bit_array, n_bits);
}

// Prints the static vector on its own labelled line.
static inline void
bv_print_enhanced(bit_vector_t bv)
{
  const uint16_t n_bits = BV_BIT_VECTOR_SIZE;

  bv_print_enhanced_internal(bv.bit_array, n_bits);
}

/////////////////////////////////////////
/// Bitvector unit test functions
/////////////////////////////////////////
// Self-test of the static bit vector API; also runs dbv_unit_test() first.
// Every check is an assert(), so nothing is verified when NDEBUG is defined.
static inline void
bv_unit_test(void)
{
  bit_vector_t bv;
  bit_vector_t bv_set__all;
  bv_init(&bv);
  // reference vector with all bits on (bv_set_all writes every backing byte)
  bv_set_all(&bv_set__all);

  dbv_unit_test();

  // setting each bit individually must match set_all
  for (uint8_t i = 0; i < BV_BIT_VECTOR_SIZE; ++i)
    bv_bit_set(&bv, i);
  assert(bv_are_equal(bv, bv_set__all) == 1);

  // clearing each bit and inverting must match set_all as well
  for (uint8_t i = 0; i < BV_BIT_VECTOR_SIZE; ++i)
    bv_bit_reset(&bv, i);
  bv_reverse(&bv);
  assert(bv_are_equal(bv, bv_set__all) == 1);

  // alternate reset/set and read each bit back
  for (uint8_t i = 0; i < BV_BIT_VECTOR_SIZE; ++i)
    if (i % 2 == 0) {
      bv_bit_reset(&bv, i);
      assert(bv_bit_get(bv, i) == 0);
    } else {
      bv_bit_set(&bv, i);
      assert(bv_bit_get(bv, i) == 1);
    }

  bv_reset_all(&bv);
  assert(bv_are_equal(bv, bv_set__all) == 0);

  // AND with all-ones keeps an all-ones vector unchanged
  bv_set_all(&bv);
  bv_and(&bv, bv_set__all);
  assert(bv_are_equal(bv, bv_set__all) == 1);

  bv_copy(&bv, bv_set__all);
  assert(bv_are_equal(bv, bv_set__all) == 1);

  // OR of an empty vector with all-ones yields all-ones
  bv_reset_all(&bv);
  bv_or(&bv, bv_set__all);
  assert(bv_are_equal(bv, bv_set__all) == 1);
  printf("Static  Bit Vector Unit Test was Successful!\n");
}

#endif  // HERMES_BIT_VECTOR_H


================================================
FILE: include/utils/concur_ctrl.h
================================================
//
// Created by akatsarakis on 11/12/18.
//

#ifndef HERMES_SEQLOCK_H
#define HERMES_SEQLOCK_H

#include <assert.h>
#include <stdint.h>

#define ENABLE_LOCK_ASSERTS 1

#define TIE_BREAKER_ID_EMPTY 255
#define SEQLOCK_LOCKED 0x1
#define SEQLOCK_FREE 0x0

#define LOCK_PAUSE() asm volatile("mfence");

#define COMPILER_BARRIER() asm volatile("" ::: "memory")

#if !defined(COMPILER_NO_REORDER)
#define COMPILER_NO_REORDER(exec) \
  COMPILER_BARRIER();             \
  exec;                           \
  COMPILER_BARRIER()
#endif

// Logical timestamp: a 32-bit version plus an 8-bit tie breaker, which the
// comparison helpers below use when two versions are equal. The struct is
// packed (5 bytes) and volatile-qualified.
typedef volatile struct {
  uint8_t tie_breaker_id;
  uint32_t version;
} __attribute__((packed)) timestamp_t;

// Sequence lock: a one-byte lock flag (SEQLOCK_FREE / SEQLOCK_LOCKED) plus a
// version counter that seqlock_lock() and seqlock_unlock() each increment, so
// the version is odd exactly while the lock is held. Packed (5 bytes); note
// that, unlike conc_ctrl_t, this type is not volatile-qualified.
typedef struct {
  uint8_t lock;
  uint32_t version;  /// for lock-free reads
} __attribute__((packed)) seqlock_t;

// Concurrency-control word: a one-byte lock flag combined with a timestamp.
// The timestamp's version doubles as the seqlock version (odd while locked).
// Packed (6 bytes) and volatile-qualified.
typedef volatile struct {
  uint8_t lock;
  timestamp_t
      ts;  /// ts.version used for both lock-free reads & as part of timestamp
} __attribute__((packed)) conc_ctrl_t;

/////////////////////////////////////////
/// Timestamp  comparison  functions
/////////////////////////////////////////
// Resets `ts` to its initial state: version 0 and the "empty" tie breaker
// (TIE_BREAKER_ID_EMPTY).
static inline void
timestamp_init(timestamp_t* ts)
{
  ts->version = 0;
  ts->tie_breaker_id = TIE_BREAKER_ID_EMPTY;
}

// Returns 1 when both the versions and the tie breakers match, 0 otherwise.
static inline int
timestamp_is_equal(uint32_t v1, uint8_t tie_breaker1, uint32_t v2,
                   uint8_t tie_breaker2)
{
  if (v1 != v2) return 0;

  return tie_breaker1 == tie_breaker2;
}

// Returns 1 when timestamp 1 orders strictly before timestamp 2: versions are
// compared first and the tie breaker decides between equal versions.
static inline int
timestamp_is_smaller(uint32_t v1, uint8_t tie_breaker1, uint32_t v2,
                     uint8_t tie_breaker2)
{
  if (v1 != v2) return v1 < v2;

  return tie_breaker1 < tie_breaker2;
}

/////////////////////////////////////////
/// seqlock locking / unlocking functions
/////////////////////////////////////////

// Puts `seqlock` in its initial state: unlocked, version 0.
static inline void
seqlock_init(seqlock_t* seqlock)
{
  const seqlock_t fresh = {.lock = SEQLOCK_FREE, .version = 0};

  seqlock->version = fresh.version;
  seqlock->lock = fresh.lock;
}

// Acquires `seqlock`, spinning until it succeeds, then bumps the version
// (making it odd while the lock is held). Always returns 1.
//
// The lock byte is polled through a volatile pointer: seqlock_t is not
// volatile-qualified and LOCK_PAUSE() carries no "memory" clobber, so a plain
// read in the wait loop could legally be hoisted out of it by the compiler.
static inline int
seqlock_lock(seqlock_t* seqlock)
{
  volatile uint8_t* const lock_byte = &seqlock->lock;

  do {
    // Spin until the seqlock is unlocked
    while (*lock_byte == SEQLOCK_LOCKED) {
      LOCK_PAUSE();
    }

    // try to atomically get the lock via a CAS
    if (__sync_val_compare_and_swap(&seqlock->lock, 0, 1) == 0) {
      seqlock->version++;
      break;
    }

  } while (1);  // retry if CAS failed

  return 1;
}

// Releases `seqlock`: bumps the version a second time (back to even) and only
// then clears the lock flag. Each store is fenced with compiler barriers so
// the compiler keeps them in this order.
// With ENABLE_LOCK_ASSERTS on, asserts that the lock is held and the version
// is odd on entry.
static inline void
seqlock_unlock(seqlock_t* seqlock)
{
  if (ENABLE_LOCK_ASSERTS) {
    assert(seqlock->lock == SEQLOCK_LOCKED);
    assert(seqlock->version % 2 == 1);
  }

  COMPILER_NO_REORDER(seqlock->version++);
  COMPILER_NO_REORDER(seqlock->lock = SEQLOCK_FREE);
}

// Validation step of a lock-free read, used as
//   do { <lock-free read> } while (!seqlock_version_is_same_and_valid(...));
// Returns 1 only when both snapshots carry the same version and that version
// is even (i.e. no writer held the lock); otherwise returns 0.
static inline int
seqlock_version_is_same_and_valid(seqlock_t* seqlock1, seqlock_t* seqlock2)
{
  if (seqlock1->version != seqlock2->version) return 0;

  return seqlock1->version % 2 == 0;
}

/////////////////////////////////////////
/// cctrl locking / unlocking functions
/////////////////////////////////////////

// Puts `cctrl` in its initial state: timestamp reset via timestamp_init()
// and the lock flag cleared.
static inline void
cctrl_init(conc_ctrl_t* cctrl)
{
  timestamp_init(&cctrl->ts);
  cctrl->lock = SEQLOCK_FREE;
}

// Acquires the lock of `cctrl`, spinning until it succeeds, then increments
// ts.version (making it odd while the lock is held). Always returns 1.
static inline int
cctrl_lock(conc_ctrl_t* cctrl)
{
  for (;;) {
    // Wait, without attempting the atomic, while the lock is taken
    while (cctrl->lock == SEQLOCK_LOCKED) {
      LOCK_PAUSE();
    }

    // Only the winner of the CAS proceeds; everyone else goes back to waiting
    if (__sync_val_compare_and_swap(&cctrl->lock, 0, 1) != 0) continue;

    cctrl->ts.version++;
    return 1;
  }
}

// Releases `cctrl`, overwriting the timestamp with (`version`, `cid`).
// The tie breaker is written first, then the version, then the lock flag;
// the last two stores are fenced with compiler barriers.
// With ENABLE_LOCK_ASSERTS on, asserts that the lock is held and the version
// is odd on entry.
// NOTE(review): nothing here checks that `version` is even, which the
// lock-free read validation relies on -- presumably guaranteed by callers.
static inline void
cctrl_unlock_custom_version(conc_ctrl_t* cctrl, uint8_t cid, uint32_t version)
{
  if (ENABLE_LOCK_ASSERTS) {
    assert(cctrl->lock == SEQLOCK_LOCKED);
    assert(cctrl->ts.version % 2 == 1);
  }

  cctrl->ts.tie_breaker_id = cid;
  COMPILER_NO_REORDER(cctrl->ts.version = version);
  COMPILER_NO_REORDER(cctrl->lock = SEQLOCK_FREE);
}

// Releases `cctrl` after advancing the (odd, locked) version by three, which
// makes it even again, and stamps `cid` as the tie breaker. The resulting
// version is reported through `resp_version` before the lock flag is cleared.
// With ENABLE_LOCK_ASSERTS on, asserts that the lock is held and the version
// is odd on entry.
static inline void
cctrl_unlock_inc_version_by_three(conc_ctrl_t* cctrl, uint8_t cid,
                                  uint32_t* resp_version)
{
  if (ENABLE_LOCK_ASSERTS) {
    assert(cctrl->lock == SEQLOCK_LOCKED);
    assert(cctrl->ts.version % 2 == 1);
  }

  cctrl->ts.tie_breaker_id = cid;
  COMPILER_NO_REORDER(cctrl->ts.version += 3);
  COMPILER_NO_REORDER(*resp_version = cctrl->ts.version);
  COMPILER_NO_REORDER(cctrl->lock = SEQLOCK_FREE);
}

// Releases `cctrl` after incrementing the (odd, locked) version once more,
// which makes it even again, and stamps `cid` as the tie breaker. The
// resulting version is reported through `resp_version` before the lock flag
// is cleared.
// With ENABLE_LOCK_ASSERTS on, asserts that the lock is held and the version
// is odd on entry.
static inline void
cctrl_unlock_inc_version(conc_ctrl_t* cctrl, uint8_t cid,
                         uint32_t* resp_version)
{
  if (ENABLE_LOCK_ASSERTS) {
    assert(cctrl->lock == SEQLOCK_LOCKED);
    assert(cctrl->ts.version % 2 == 1);
  }

  cctrl->ts.tie_breaker_id = cid;
  COMPILER_NO_REORDER(*resp_version = ++cctrl->ts.version);
  COMPILER_NO_REORDER(cctrl->lock = SEQLOCK_FREE);
}

// Releases `cctrl` while undoing the increment done by cctrl_lock(): the
// version is decremented back to its (even) pre-lock value and the tie
// breaker is left untouched.
// With ENABLE_LOCK_ASSERTS on, asserts that the lock is held and the version
// is odd on entry.
static inline void
cctrl_unlock_dec_version(conc_ctrl_t* cctrl)
{
  if (ENABLE_LOCK_ASSERTS) {
    assert(cctrl->lock == SEQLOCK_LOCKED);
    assert(cctrl->ts.version % 2 == 1);
  }

  // keep same ts.tie_breaker_id
  COMPILER_NO_REORDER(cctrl->ts.version--);
  COMPILER_NO_REORDER(cctrl->lock = SEQLOCK_FREE);
}

// This is used to validate a lock-free read
// i.e. --> do { <Lock free read>  } while
// (!(cctrl_timestamp_is_same_and_valid(...));
//
// Returns non-zero only when the version of `cctrl1` is even (no writer held
// the lock) and both the version and the tie breaker of `cctrl1` equal those
// of `cctrl2`.
static inline int
cctrl_timestamp_is_same_and_valid(volatile conc_ctrl_t* cctrl1,
                                  volatile conc_ctrl_t* cctrl2)
{
  return cctrl1->ts.version % 2 == 0 &&
         timestamp_is_equal(cctrl1->ts.version, cctrl1->ts.tie_breaker_id,
                            cctrl2->ts.version, cctrl2->ts.tie_breaker_id);
}

#endif  // HERMES_SEQLOCK_H


================================================
FILE: include/utils/time_rdtsc.h
================================================

#ifndef HERMES_TIME_H
#define HERMES_TIME_H
#include <assert.h>
#include <stdint.h> /* for uint64_t */
#include <stdio.h>
#include <time.h> /* for struct timespec */

#define ENABLE_STATIC_TICKS_PER_NS 1
#define RDTSC_TYPICAL_TICKS_PER_NS 2.2

// TSC ticks per nanosecond; written by init_rdtsc() / calibrate_ticks() and
// read by get_rdtsc_timespec().
// NOTE(review): this is a definition inside a header (no `extern`), so every
// including .c file emits it -- presumably relies on common-symbol merging;
// confirm before building with -fno-common.
double g_ticks_per_ns;

// Reads the CPU time-stamp counter with the `rdtsc` instruction and returns
// it as one 64-bit value (EDX supplies the upper half, EAX the lower half).
static inline uint64_t
RDTSC()
{
  unsigned int tsc_low, tsc_high;

  __asm__ volatile("rdtsc" : "=a"(tsc_low), "=d"(tsc_high));

  const uint64_t upper = (uint64_t)tsc_high << 32;
  return upper | tsc_low;
}

static const int NANO_SECONDS_IN_SEC = 1000000000;
// Computes ts1 - ts2 (ts1 is assumed to be the later time) and returns a
// pointer to a function-local static struct timespec holding the result, so
// each call overwrites the previous result. A negative nanosecond difference
// is normalized by borrowing one second.
static struct timespec*
timespec_diff(struct timespec* ts1, struct timespec* ts2)
{
  static struct timespec delta;

  delta.tv_sec = ts1->tv_sec - ts2->tv_sec;
  delta.tv_nsec = ts1->tv_nsec - ts2->tv_nsec;

  if (delta.tv_nsec >= 0) return &delta;

  // borrow one second to bring the nanoseconds back into range
  delta.tv_nsec += NANO_SECONDS_IN_SEC;
  delta.tv_sec -= 1;
  return &delta;
}

// Measures how many TSC ticks elapse per nanosecond and stores the ratio in
// g_ticks_per_ns: samples CLOCK_MONOTONIC and the TSC before and after a
// busy loop of one billion iterations and divides the tick delta by the
// elapsed nanoseconds. Prints a message before and after the measurement.
static void
calibrate_ticks()
{
  struct timespec begin_ts, end_ts;
  printf("Start RDTSC calibration: patience is a virtue\n");
  clock_gettime(CLOCK_MONOTONIC, &begin_ts);
  uint64_t begin = RDTSC();
  // do something CPU intensive
  // (the counter is volatile so the empty loop is not optimized away)
  for (volatile unsigned long long i = 0; i < 1000000000ULL; ++i)
    ;
  uint64_t end = RDTSC();
  clock_gettime(CLOCK_MONOTONIC, &end_ts);
  struct timespec* tmp_ts = timespec_diff(&end_ts, &begin_ts);
  uint64_t ns_elapsed =
      (uint64_t)(tmp_ts->tv_sec * 1000000000LL + tmp_ts->tv_nsec);
  g_ticks_per_ns = (double)(end - begin) / (double)ns_elapsed;
  printf("RDTSC calibration is done (ticks_per_ns: %.2f)\n", g_ticks_per_ns);
}

// Call once (it is not thread safe) before using the RDTSC-based helpers.
// With a non-zero `auto_calibration` the tick rate is measured through
// calibrate_ticks(); otherwise `ticks_per_ns` (asserted to be positive) is
// stored in g_ticks_per_ns as-is.
static inline void
init_rdtsc(uint8_t auto_calibration, double ticks_per_ns)
{
  if (auto_calibration == 0) {
    assert(ticks_per_ns > 0);
    g_ticks_per_ns = ticks_per_ns;
    return;
  }

  calibrate_ticks();
}

// Splits a nanosecond count into whole seconds and leftover nanoseconds and
// stores both in `ts`.
static inline void
get_timespec(struct timespec* ts, uint64_t nsecs)
{
  const uint64_t whole_secs = nsecs / NANO_SECONDS_IN_SEC;
  const uint64_t leftover_ns = nsecs % NANO_SECONDS_IN_SEC;

  ts->tv_sec = whole_secs;
  ts->tv_nsec = leftover_ns;
}

// Fills `ts` with the current TSC reading converted to time: the tick count
// is divided by g_ticks_per_ns to obtain nanoseconds.
static inline void
get_rdtsc_timespec(struct timespec* ts)
{
  const uint64_t now_ns = (uint64_t)(RDTSC() / g_ticks_per_ns);

  get_timespec(ts, now_ns);
}

// Microseconds elapsed between `start` (a value obtained from
// get_rdtsc_timespec()) and now.
// The conversion is done in floating point so the sub-microsecond part is
// kept; the previous integer division truncated it before the result was
// converted to double.
static inline double
time_elapsed_in_us(struct timespec start)
{
  struct timespec now, *diff;
  get_rdtsc_timespec(&now);
  diff = timespec_diff(&now, &start);
  return (double)diff->tv_sec * 1e6 + (double)diff->tv_nsec / 1e3;
}

// Milliseconds elapsed between `start` (a value obtained from
// get_rdtsc_timespec()) and now.
// The conversion is done in floating point so the sub-millisecond part is
// kept; the previous integer division truncated it before the result was
// converted to double.
static inline double
time_elapsed_in_ms(struct timespec start)
{
  struct timespec now, *diff;
  get_rdtsc_timespec(&now);
  diff = timespec_diff(&now, &start);
  return (double)diff->tv_sec * 1e3 + (double)diff->tv_nsec / 1e6;
}

// Seconds elapsed between `start` (a value obtained from
// get_rdtsc_timespec()) and now, including the fractional part.
// The nanoseconds used to be divided by NANO_SECONDS_IN_SEC with integer
// arithmetic, which always produced 0 and dropped the sub-second part.
static inline double
time_elapsed_in_sec(struct timespec start)
{
  struct timespec now, *diff;
  get_rdtsc_timespec(&now);
  diff = timespec_diff(&now, &start);
  return (double)diff->tv_sec +
         (double)diff->tv_nsec / (double)NANO_SECONDS_IN_SEC;
}

#endif  // HERMES_TIME_H


================================================
FILE: include/wings/wings.h
================================================
//
// Created by akatsarakis on 06/02/19.
//

#ifndef WINGS_INTERNAL_INLINES_H
#define WINGS_INTERNAL_INLINES_H

#include "wings_api.h"
/// WARNING!!
/// 	Functions starting with underscore (i.e. "_wings_*")
/// 	are internal and should not be called directly

void wings_reconfigure_wrs_ah(ud_channel_t* ud_c, uint8_t endpoint_id);

/* --------------------------------------------------------------------------
--------------------------------- Helper Functions --------------------------
---------------------------------------------------------------------------*/
// Asserts that `var` is a boolean flag, i.e. holds exactly 0 or 1.
// The check is a plain assert(), so it disappears when NDEBUG is defined.
static inline void
_wings_assert_binary(uint8_t var)
{
  assert(var == 0 || var == 1);
}

// Largest receive-side packet for this channel: the receive packet struct
// plus room for max_coalescing messages of max_msg_size bytes each.
// Only meaningful for channels that carry payload (not CRD / header-only).
static inline uint16_t
_wings_ud_recv_max_pkt_size(ud_channel_t* ud_c)
{
  if (WINGS_ENABLE_ASSERTIONS) {
    assert(ud_c->type != CRD);
    assert(ud_c->is_header_only == 0);
  }
  // TODO add assertion that this must be smaller than max_MTU
  assert(ud_c->max_msg_size > 0);
  assert(ud_c->max_coalescing > 0);

  return sizeof(wings_ud_recv_pkt_t) +
         ud_c->max_coalescing * ud_c->max_msg_size;
}

// Largest send-side packet for this channel: the send packet struct plus
// room for max_coalescing messages of max_msg_size bytes each.
// Only meaningful for channels that carry payload (not CRD / header-only).
static inline uint16_t
_wings_ud_send_max_pkt_size(ud_channel_t* ud_c)
{
  if (WINGS_ENABLE_ASSERTIONS) {
    assert(ud_c->type != CRD);
    assert(ud_c->is_header_only == 0);
  }
  // TODO add assertion that this must be smaller than max_MTU
  assert(ud_c->max_msg_size > 0);
  assert(ud_c->max_coalescing > 0);

  return sizeof(wings_ud_send_pkt_t) +
         ud_c->max_coalescing * ud_c->max_msg_size;
}

// Sanity-checks the configuration of a channel: boolean flags hold 0/1,
// sizes are positive, a full send packet fits below MAX_MTU_SIZE, at least
// one queue has a non-zero depth and, unless credit control is disabled, a
// credit-providing channel is attached.
static inline void
_wings_assertions(ud_channel_t* ud_channel)
{
  // Flags that must be strictly 0 or 1
  _wings_assert_binary(ud_channel->expl_crd_ctrl);
  _wings_assert_binary(ud_channel->is_bcast_channel);
  _wings_assert_binary(ud_channel->is_inlining_enabled);

  // Sizing constraints
  assert(ud_channel->num_channels > 1);
  assert(ud_channel->max_msg_size > 0);
  assert(ud_channel->max_coalescing > 0);
  assert(_wings_ud_send_max_pkt_size(ud_channel) < MAX_MTU_SIZE);
  assert(ud_channel->send_q_depth > 0 || ud_channel->recv_q_depth > 0);

  // Flow control needs a channel that hands out credits
  if (!ud_channel->disable_crd_ctrl)
    assert(ud_channel->channel_providing_crds != NULL);
}

// Returns the address of the n-th message slot inside a send packet.
// Slots are laid out with a stride of small_msg_size (not max_msg_size).
static inline uint8_t*
_wings_get_n_msg_ptr_from_send_pkt(ud_channel_t* ud_c, wings_ud_send_pkt_t* pkt,
                                   uint8_t n)
{
  if (WINGS_ENABLE_ASSERTIONS) {
    assert(ud_c->type != CRD);
    assert(ud_c->is_header_only == 0);
  }
  // n may point at most to the slot right after the last used one
  assert(ud_c->max_coalescing > n);
  assert(pkt->req_num >= n);

  return pkt->reqs + (n * ud_c->small_msg_size);
}

// Returns the address of the n-th message slot inside a received packet by
// delegating to the send-packet variant on the embedded packet.
static inline uint8_t*
_wings_get_n_msg_ptr_from_recv_pkt(ud_channel_t* ud_c,
                                   wings_ud_recv_pkt_t* recv_pkt, uint8_t n)
{
  if (WINGS_ENABLE_ASSERTIONS) {
    assert(ud_c->type != CRD);
    assert(ud_c->is_header_only == 0);
  }

  wings_ud_send_pkt_t* inner_pkt = &recv_pkt->pkt;
  return _wings_get_n_msg_ptr_from_send_pkt(ud_c, inner_pkt, n);
}

// Returns the n-th packet of the send buffer; packets are placed back to
// back, each occupying _wings_ud_send_max_pkt_size() bytes.
static inline wings_ud_send_pkt_t*
_wings_get_nth_pkt_ptr_from_send_buff(ud_channel_t* ud_c, uint16_t n)
{
  if (WINGS_ENABLE_ASSERTIONS) {
    assert(ud_c->type != CRD);
    assert(ud_c->is_header_only == 0);
  }

  uint8_t* buff_bytes = (uint8_t*)ud_c->send_pkt_buff;
  return (wings_ud_send_pkt_t*)(buff_bytes +
                                n * _wings_ud_send_max_pkt_size(ud_c));
}

// Returns the n-th packet of the receive buffer; the byte offset is n times
// _wings_ud_recv_max_pkt_size().
static inline wings_ud_recv_pkt_t*
_wings_get_nth_pkt_ptr_from_recv_buff(ud_channel_t* ud_c, uint16_t n)
{
  if (WINGS_ENABLE_ASSERTIONS) {
    assert(ud_c->type != CRD);
    assert(ud_c->is_header_only == 0);
  }

  const int byte_offset = n * _wings_ud_recv_max_pkt_size(ud_c);
  return (wings_ud_recv_pkt_t*)&ud_c->recv_pkt_buff[byte_offset];
}

// Returns the send-buffer packet that send_push_ptr currently designates.
static inline wings_ud_send_pkt_t*
_wings_curr_send_pkt_ptr(ud_channel_t* ud_c)
{
  if (WINGS_ENABLE_ASSERTIONS) {
    assert(ud_c->type != CRD);
    assert(ud_c->is_header_only == 0);
  }

  const uint16_t push_idx = (uint16_t)ud_c->send_push_ptr;
  return _wings_get_nth_pkt_ptr_from_send_buff(ud_c, push_idx);
}

// Advances send_push_ptr to the next packet of the send buffer (via
// WINGS_MOD_ADD with send_pkt_buff_len) and clears that packet's req_num so
// no data from an earlier unicast / bcast is left behind.
// Header-only channels have no send packets, so this is a no-op for them.
static inline void
_wings_inc_send_push_ptr(ud_channel_t* ud_c)
{
  if (ud_c->is_header_only) return;

  if (WINGS_ENABLE_ASSERTIONS) {
    assert(ud_c->type != CRD);
    assert(ud_c->is_header_only == 0);
  }

  // Broadcast and unicast channels currently advance in the same way.
  // TODO for bcast channels change this to deal with failures, i.e. advance
  //      by an amount that depends on the number of alive remotes, e.g.:
  //      WINGS_MOD_ADD(*inv_push_ptr, INV_SEND_OPS_SIZE / MAX_REMOTE_MACHINES *
  //                               last_g_membership.num_of_alive_remotes);
  WINGS_MOD_ADD(ud_c->send_push_ptr, ud_c->send_pkt_buff_len);

  // Reset data left from previous unicasts / bcasts
  wings_ud_send_pkt_t* fresh_pkt = _wings_curr_send_pkt_ptr(ud_c);
  fresh_pkt->req_num = 0;
}

// Advances recv_push_ptr by one slot through WINGS_MOD_ADD, bounded by
// recv_q_depth.
static inline void
_wings_inc_recv_push_ptr(ud_channel_t* ud_c)
{
  if (WINGS_ENABLE_ASSERTIONS) {
    assert(ud_c->type != CRD);
    assert(ud_c->is_header_only == 0);
  }

  WINGS_MOD_ADD(ud_c->recv_push_ptr, ud_c->recv_q_depth);
}

// Advances recv_pull_ptr by one slot through WINGS_MOD_ADD, bounded by
// recv_pkt_buff_len.
static inline void
_wings_inc_recv_pull_ptr(ud_channel_t* ud_c)
{
  if (WINGS_ENABLE_ASSERTIONS) {
    assert(ud_c->type != CRD);
    assert(ud_c->is_header_only == 0);
  }

  WINGS_MOD_ADD(ud_c->recv_pull_ptr, ud_c->recv_pkt_buff_len);
}

/* ---------------------------------------------------------------------------
----------------------------------- RECVs ------------------------------------
---------------------------------------------------------------------------*/
// Posts num_recvs receive work requests for a header-only or credit (CRD)
// channel. The first num_recvs entries of recv_wr are chained together, the
// last one terminating the list, and handed to the QP in one call.
static inline void
_wings_post_hdr_only_recvs(ud_channel_t* ud_c, uint16_t num_recvs)
{
  if (WINGS_ENABLE_ASSERTIONS)
    assert(ud_c->is_header_only || ud_c->type == CRD);

  // Link wr[k] -> wr[k + 1]; the final wr ends the chain
  uint16_t k = 0;
  while (k < num_recvs) {
    struct ibv_recv_wr* following = NULL;
    if (k != num_recvs - 1) following = &ud_c->recv_wr[k + 1];
    ud_c->recv_wr[k].next = following;
    ++k;
  }

  struct ibv_recv_wr* bad_recv_wr;
  int ret = ibv_post_recv(ud_c->qp, ud_c->recv_wr, &bad_recv_wr);
  CPE(ret, "ibv_post_recv error: posting recvs for credits", ret);
}

// Re-posts num_of_receives receive buffers of a payload-carrying channel.
//
// For each receive, the buffer slot designated by recv_push_ptr is zeroed,
// installed as the scatter/gather address of recv_wr[i], and recv_push_ptr
// is advanced. The first num_of_receives work requests are then posted as a
// single chain: the chain is temporarily cut after the last used WR and the
// link is restored afterwards.
//
// Only the batched path (WINGS_ENABLE_BATCH_POST_RECVS_TO_NIC) is
// implemented; the non-batched path asserts.
static inline void
_wings_post_recvs(ud_channel_t* ud_c, uint16_t num_of_receives)
{
  if (WINGS_ENABLE_ASSERTIONS)
    assert(ud_c->type != CRD && ud_c->is_header_only == 0);

  if (WINGS_ENABLE_ASSERTIONS) assert(num_of_receives <= ud_c->max_recv_wrs);

  // Nothing to post; this also keeps recv_wr[num_of_receives - 1] below
  // from indexing before the start of the array.
  if (num_of_receives == 0) return;

  void* next_buff_addr;

  int req_size = _wings_ud_recv_max_pkt_size(ud_c);
  for (int i = 0; i < num_of_receives; ++i) {
    // Byte-wise pointer arithmetic (arithmetic on void* is non-standard)
    next_buff_addr = (void*)((uint8_t*)ud_c->recv_pkt_buff +
                             (ud_c->recv_push_ptr * req_size));
    // TODO optimize by reseting only the req_num of wings_recv_pkt
    memset(next_buff_addr, 0,
           (size_t)req_size);  // reset the buffer before posting the receive

    if (WINGS_ENABLE_BATCH_POST_RECVS_TO_NIC)
      ud_c->recv_wr[i].sg_list->addr = (uintptr_t)next_buff_addr;
    else
      assert(0);
    //			hrd_post_dgram_recv(ud_c->qp, next_buff_addr, req_size,
    // cb->dgram_buf_mr->lkey);

    _wings_inc_recv_push_ptr(ud_c);
  }

  if (WINGS_ENABLE_BATCH_POST_RECVS_TO_NIC) {
    // Cut the chain after the last WR of this batch
    ud_c->recv_wr[num_of_receives - 1].next = NULL;
    if (WINGS_ENABLE_ASSERTIONS) {
      for (int i = 0; i < num_of_receives; i++) {
        assert(ud_c->recv_wr[i].num_sge == 1);
        assert(ud_c->recv_wr[i].sg_list->length == req_size);
        // TODO add
        //				assert(ud_c->recv_wr[i].sg_list->lkey ==
        // cb->dgram_buf_mr->lkey);
        assert(i == num_of_receives - 1 ||
               ud_c->recv_wr[i].next == &ud_c->recv_wr[i + 1]);
      }
      assert(ud_c->recv_wr[num_of_receives - 1].next == NULL);
    }

    struct ibv_recv_wr* bad_recv_wr;
    int ret = ibv_post_recv(ud_c->qp, ud_c->recv_wr, &bad_recv_wr);
    CPE(ret, "ibv_post_recv error: while posting recvs", ret);

    // Restore the link of the last posted WR. When the whole array was used
    // (num_of_receives == max_recv_wrs) there is no following WR, so the
    // link stays NULL instead of pointing one past the last WR.
    // NOTE(review): assumes recv_wr holds max_recv_wrs entries -- confirm.
    ud_c->recv_wr[num_of_receives - 1].next =
        (num_of_receives == ud_c->max_recv_wrs)
            ? NULL
            : &ud_c->recv_wr[num_of_receives];
  }
}

// Polls the receive CQ of a credit (CRD) channel and, for every credit
// packet found, adds the credits carried in its immediate data to the
// sender's entry in channel_providing_crds->credits_per_channels. The
// consumed receives are then re-posted.
//
// A CQ polling error or a bad completion status terminates the process with
// a failure exit status. Only the status of the last polled completion is
// inspected.
static inline void
_wings_poll_crds_and_post_recvs(ud_channel_t* ud_c)
{
  if (WINGS_ENABLE_ASSERTIONS) assert(ud_c->type == CRD);

  int crd_pkts_found =
      ibv_poll_cq(ud_c->recv_cq, ud_c->max_recv_wrs, ud_c->recv_wc);

  if (crd_pkts_found > 0) {
    if (unlikely(ud_c->recv_wc[crd_pkts_found - 1].status != 0)) {
      fprintf(stderr,
              "Bad wc status when polling for credits to send a broadcast %d\n",
              ud_c->recv_wc[crd_pkts_found - 1].status);
      exit(EXIT_FAILURE);  // an error must not report success to the caller
    }

    if (ud_c->enable_stats) ud_c->stats.recv_total_pkts += crd_pkts_found;

    if (WINGS_ENABLE_RECV_PRINTS && ud_c->enable_prints)
      colored_printf(GREEN, "^^^ Polled reqs: %s  %d, (total: %d)!\n",
                     ud_c->qp_name, crd_pkts_found,
                     ud_c->stats.recv_total_pkts);

    for (int i = 0; i < crd_pkts_found; i++) {
      // The credit message travels in the immediate data of the completion
      wings_crd_t* crd_ptr = (wings_crd_t*)&ud_c->recv_wc[i].imm_data;

      if (ud_c->enable_stats) ud_c->stats.recv_total_msgs += crd_ptr->crd_num;
      ud_c->channel_providing_crds->credits_per_channels[crd_ptr->sender_id] +=
          crd_ptr->crd_num;

      // Credits may never exceed the per-channel maximum
      if (WINGS_ENABLE_ASSERTIONS)
        assert(ud_c->channel_providing_crds->num_crds_per_channel >=
               ud_c->channel_providing_crds
                   ->credits_per_channels[crd_ptr->sender_id]);

      if (WINGS_ENABLE_CREDIT_PRINTS && ud_c->enable_prints)
        printf(
            "$$$ Credits: %s \033[1m\033[32mincremented\033[0m to %d (for "
            "endpoint %d)\n",
            ud_c->channel_providing_crds->qp_name,
            ud_c->channel_providing_crds
                ->credits_per_channels[crd_ptr->sender_id],
            crd_ptr->sender_id);
    }

    if (WINGS_ENABLE_POST_RECV_PRINTS && ud_c->enable_prints)
      colored_printf(YELLOW, "vvv Post Receives: %s %d\n", ud_c->qp_name,
                     crd_pkts_found);

    // Refill one receive per consumed credit packet
    _wings_post_hdr_only_recvs(ud_c, (uint16_t)crd_pkts_found);

  } else if (unlikely(crd_pkts_found < 0)) {
    fprintf(stderr, "ERROR In the credit CQ\n");
    exit(EXIT_FAILURE);
  }
}

// Appends one message (max_msg_size bytes read from msg_ptr) to the tail of
// the channel's overflow buffer and bumps num_overflow_msgs.
static inline void
_wings_enque_to_overflown_msgs(ud_channel_t* ud_c, uint8_t* msg_ptr)
{
  if (WINGS_ENABLE_ASSERTIONS) {
    assert(ud_c->is_header_only == 0);
    assert(ud_c->enable_overflow_msgs);
    assert(ud_c->num_overflow_msgs < ud_c->max_coalescing);
  }

  // First free slot sits right after the messages already queued
  uint8_t* free_slot =
      ud_c->overflow_msg_buff + ud_c->num_overflow_msgs * ud_c->max_msg_size;
  memcpy(free_slot, msg_ptr, ud_c->max_msg_size);

  ++ud_c->num_overflow_msgs;
}

// Moves up to max_msgs_to_poll messages from the head of the overflow buffer
// into recv_ops and returns how many were moved. Messages that stay behind
// are shifted to the front so the buffer remains a FIFO.
static inline uint16_t
_wings_deque_from_overflown_msgs(ud_channel_t* ud_c, uint16_t max_msgs_to_poll,
                                 uint8_t* recv_ops)
{
  if (WINGS_ENABLE_ASSERTIONS) assert(ud_c->is_header_only == 0);

  // Take whichever is smaller: what is queued or what the caller accepts
  uint8_t msgs_to_copy;
  if (ud_c->num_overflow_msgs <= max_msgs_to_poll)
    msgs_to_copy = (uint8_t)ud_c->num_overflow_msgs;
  else
    msgs_to_copy = (uint8_t)max_msgs_to_poll;

  if (ud_c->num_overflow_msgs <= 0) return msgs_to_copy;

  ud_c->num_overflow_msgs -= msgs_to_copy;

  // Copy msgs from overflow_buff to recv_ops
  memcpy(recv_ops, ud_c->overflow_msg_buff, msgs_to_copy * ud_c->max_msg_size);

  if (msgs_to_copy == max_msgs_to_poll) {
    // Move rest of overflown msgs to the top of the (FIFO) buffer
    int slot = 0;
    while (slot < ud_c->num_overflow_msgs) {
      uint8_t* from =
          &ud_c->overflow_msg_buff[ud_c->max_msg_size * (slot + msgs_to_copy)];
      uint8_t* to = &ud_c->overflow_msg_buff[ud_c->max_msg_size * slot];
      memcpy(to, from, ud_c->max_msg_size);
      ++slot;
    }
  }

  return msgs_to_copy;
}

// Polls a (non-CRD) channel for incoming messages and copies up to
// max_msgs_to_poll of them into recv_ops, max_msg_size bytes apart.
//
// Steps:
//  1. If overflow messages are enabled, first drain messages left over from
//     a previous call.
//  2. Poll the receive CQ for packets.
//  3. Header-only channels: each packet is one message taken from the
//     completion's immediate data. Otherwise each packet of the receive
//     buffer is unpacked message by message; messages that do not fit in the
//     caller's budget are parked in the overflow buffer.
//  4. For every message, a credit is given back to its sender on
//     channel_providing_crds (unless credit control is disabled).
//  5. The consumed receives are re-posted.
//
// Returns the number of messages placed in recv_ops (never more than
// max_msgs_to_poll). A CQ polling error terminates the process.
static inline uint16_t
wings_poll_buff_and_post_recvs(ud_channel_t* ud_c, uint16_t max_msgs_to_poll,
                               uint8_t* recv_ops)
{
  if (WINGS_ENABLE_ASSERTIONS) assert(ud_c->type != CRD);

  int index = 0;
  uint8_t sender = 0;
  uint16_t msgs_polled = 0;
  uint8_t *next_packet_reqs, *recv_op_ptr, *next_req, *next_packet_req_num_ptr;

  uint16_t dequed_msgs = 0;
  uint16_t remaining_msgs_to_poll = max_msgs_to_poll;

  if (max_msgs_to_poll < 1) return 0;

  if (ud_c->enable_overflow_msgs) {
    dequed_msgs =
        _wings_deque_from_overflown_msgs(ud_c, max_msgs_to_poll, recv_ops);

    // The backlog alone filled the caller's budget
    if (max_msgs_to_poll == dequed_msgs) return max_msgs_to_poll;

    recv_ops = &recv_ops[dequed_msgs * ud_c->max_msg_size];
    remaining_msgs_to_poll -= dequed_msgs;
  }

  // With overflow enabled one extra packet may be polled, since its surplus
  // messages can be parked in the overflow buffer
  uint16_t max_pkts_to_poll =
      (uint16_t)((remaining_msgs_to_poll / ud_c->max_coalescing) +
                 (ud_c->enable_overflow_msgs ? 1 : 0));

  // poll completion q
  int poll_ret = ibv_poll_cq(ud_c->recv_cq, max_pkts_to_poll, ud_c->recv_wc);
  if (unlikely(poll_ret < 0)) {
    // A negative return signals a polling failure; casting it to an
    // unsigned packet count would make the loop below walk garbage.
    fprintf(stderr, "ERROR while polling the recv CQ of %s\n", ud_c->qp_name);
    exit(EXIT_FAILURE);
  }
  uint16_t pkts_polled = (uint16_t)poll_ret;

  for (int i = 0; i < pkts_polled; ++i) {
    if (ud_c->is_header_only) {
      // The whole message lives in the immediate data of the completion
      recv_op_ptr = &recv_ops[i * ud_c->max_msg_size];
      memcpy(recv_op_ptr, &ud_c->recv_wc[i].imm_data, ud_c->max_msg_size);

      msgs_polled++;

      sender = ((wings_hdr_only_t*)&ud_c->recv_wc[i].imm_data)->sender_id;
      if (!ud_c->disable_crd_ctrl)
        ud_c->channel_providing_crds
            ->credits_per_channels[sender]++;  // increment packet credits

    } else {
      uint16_t max_req_size = _wings_ud_recv_max_pkt_size(ud_c);
      index = (ud_c->recv_pull_ptr + 1) % ud_c->recv_q_depth;
      wings_ud_recv_pkt_t* next_packet =
          (wings_ud_recv_pkt_t*)&ud_c->recv_pkt_buff[index * max_req_size];

      sender = next_packet->pkt.sender_id;
      next_packet_reqs = next_packet->pkt.reqs;
      next_packet_req_num_ptr = &next_packet->pkt.req_num;

      if (WINGS_ENABLE_ASSERTIONS)
        assert(next_packet->pkt.req_num > 0 &&
               next_packet->pkt.req_num <= ud_c->max_coalescing);

      // TODO add membership and functionality
      //        if(node_is_in_membership(last_group_membership, sender))

      // Stride between the messages of this packet
      uint16_t msg_size = next_packet->pkt.only_small_msgs == 1
                              ? ud_c->small_msg_size
                              : ud_c->max_msg_size;
      for (int j = 0; j < next_packet->pkt.req_num; ++j) {
        next_req = &next_packet_reqs[j * msg_size];

        if (msgs_polled >= remaining_msgs_to_poll)
          // Caller's budget exhausted --> keep the message for the next call
          _wings_enque_to_overflown_msgs(ud_c, next_req);
        else {
          recv_op_ptr = &recv_ops[msgs_polled * ud_c->max_msg_size];
          memcpy(recv_op_ptr, next_req, msg_size);
        }

        msgs_polled++;
        if (!ud_c->disable_crd_ctrl)
          ud_c->channel_providing_crds
              ->credits_per_channels[sender]++;  // increment packet credits
      }

      *next_packet_req_num_ptr =
          0;  // TODO can be removed since we already reset on posting receives
      _wings_inc_recv_pull_ptr(ud_c);
    }

    if (WINGS_ENABLE_ASSERTIONS)
      if (!ud_c->disable_crd_ctrl)
        assert(ud_c->channel_providing_crds->credits_per_channels[sender] <=
               ud_c->channel_providing_crds->num_crds_per_channel);
  }

  if (pkts_polled > 0) {
    // Refill recvs
    if (ud_c->is_header_only)
      _wings_post_hdr_only_recvs(ud_c, pkts_polled);
    else
      _wings_post_recvs(ud_c, pkts_polled);

    if (WINGS_ENABLE_STAT_COUNTING) {
      ud_c->stats.recv_total_msgs += msgs_polled;
      ud_c->stats.recv_total_pkts += pkts_polled;
    }

    if (WINGS_ENABLE_RECV_PRINTS && ud_c->enable_prints)
      colored_printf(
          GREEN,
          "^^^ Polled msgs: %d packets %s %d, (total pkts: %d, msgs %d)!\n",
          pkts_polled, ud_c->qp_name, msgs_polled, ud_c->stats.recv_total_pkts,
          ud_c->stats.recv_total_msgs);
    if (WINGS_ENABLE_CREDIT_PRINTS && ud_c->enable_prints &&
        !ud_c->disable_crd_ctrl)
      printf(
          "$$$ Credits: %s \033[1m\033[32mincremented\033[0m to %d (for "
          "machine %d)\n",
          ud_c->channel_providing_crds->qp_name,
          ud_c->channel_providing_crds->credits_per_channels[sender], sender);
    if (WINGS_ENABLE_POST_RECV_PRINTS && ud_c->enable_prints)
      colored_printf(YELLOW, "vvv Post Receives: %s %d\n", ud_c->qp_name,
                     pkts_polled);

    if (WINGS_ENABLE_ASSERTIONS)
      assert(ud_c->max_coalescing != 1 || pkts_polled == msgs_polled);
  }

  // Messages beyond the budget went to the overflow buffer, so the caller
  // is never told about more than max_msgs_to_poll
  return msgs_polled + dequed_msgs >= max_msgs_to_poll
             ? max_msgs_to_poll
             : msgs_polled + dequed_msgs;
}

/* ---------------------------------------------------------------------------
----------------------------------- CREDITS ----------------------------------
---------------------------------------------------------------------------*/
// Returns 1 when the bit of node_id is set in the membership bit vector,
// 0 otherwise.
static inline uint8_t
_wings_node_is_in_membership(uint8_t node_id, bit_vector_t membership)
{
  if (WINGS_ENABLE_ASSERTIONS) assert(node_id < 8);

  if (bv_bit_get(membership, node_id) == 1) return 1;
  return 0;
}

// For all the CREDIT functions --> if it is a bcast channel, endpoint_id is
// ignored.
//
// Checks, without polling for new credits, whether a message can be sent:
//  - credit control disabled: always 1
//  - unicast channel: 1 if the destination endpoint has a credit left
//  - bcast channel: 1 only if every remote channel has a credit left;
//    when membership is non-NULL, channels outside of it are not considered.
static inline uint8_t
_wings_has_sufficient_crds_no_polling_membership(ud_channel_t* ud_c,
                                                 uint8_t endpoint_id,
                                                 bit_vector_t* membership)
{
  if (ud_c->disable_crd_ctrl) return 1;

  if (!ud_c->is_bcast_channel)
    return (uint8_t)(ud_c->credits_per_channels[endpoint_id] > 0);

  const uint8_t filter_by_membership = (membership != NULL) ? 1 : 0;

  for (int ch = 0; ch < ud_c->num_channels; ++ch) {
    if (ch == ud_c->channel_id) continue;  // never send to ourselves

    // skip machine if not in membership
    if (filter_by_membership == 1 &&
        !_wings_node_is_in_membership(ch, *membership))
      continue;

    if (ud_c->credits_per_channels[ch] <= 0) return 0;
  }

  return 1;
}

// For all the CREDIT functions --> if it is a bcast channel, endpoint_id is
// ignored.
//
// Credit check without polling and without any membership filtering.
static inline uint8_t
_wings_has_sufficient_crds_no_polling(ud_channel_t* ud_c, uint8_t endpoint_id)
{
  bit_vector_t* const no_membership = NULL;
  return _wings_has_sufficient_crds_no_polling_membership(ud_c, endpoint_id,
                                                          no_membership);
}

// Credit check that may poll: if the local counters show no credits and the
// channel uses explicit credit control, the credit channel is polled once
// and the counters are checked again. Returns 1 if sending is allowed.
static inline uint8_t
_wings_has_sufficient_crds_membership(ud_channel_t* ud_c, uint8_t endpoint_id,
                                      bit_vector_t* membership)
{
  uint8_t enough = _wings_has_sufficient_crds_no_polling_membership(
      ud_c, endpoint_id, membership);

  if (!enough && ud_c->expl_crd_ctrl) {
    // Credits may have arrived since the last poll
    _wings_poll_crds_and_post_recvs(ud_c->channel_providing_crds);
    enough = _wings_has_sufficient_crds_no_polling_membership(
        ud_c, endpoint_id, membership);
  }

  return enough ? 1 : 0;
}

// Credit check that may poll the credit channel once, with no membership
// filtering (equivalent to the membership variant called with NULL).
static inline uint8_t
_wings_has_sufficient_crds(ud_channel_t* ud_c, uint8_t endpoint_id)
{
  return _wings_has_sufficient_crds_membership(ud_c, endpoint_id, NULL);
}

// Consumes one credit for a send. On a unicast channel the credit of
// endpoint_id is decremented; on a bcast channel one credit is taken from
// every remote channel (restricted to the membership when it is non-NULL).
// Does nothing when credit control is disabled.
static inline void
_wings_dec_crds_membership(ud_channel_t* ud_c, uint8_t endpoint_id,
                           bit_vector_t* membership)
{
  if (ud_c->disable_crd_ctrl) return;

  if (WINGS_ENABLE_ASSERTIONS)
    assert(_wings_has_sufficient_crds_no_polling_membership(ud_c, endpoint_id,
                                                            membership));

  const uint8_t filter_by_membership = (membership != NULL) ? 1 : 0;

  if (ud_c->is_bcast_channel) {
    for (int ch = 0; ch < ud_c->num_channels; ++ch) {
      if (ch == ud_c->channel_id) continue;
      // skip machine if not in membership
      if (filter_by_membership == 1 &&
          !_wings_node_is_in_membership(ch, *membership))
        continue;
      ud_c->credits_per_channels[ch]--;
    }
  } else {
    ud_c->credits_per_channels[endpoint_id]--;
  }

  if (!(WINGS_ENABLE_CREDIT_PRINTS && ud_c->enable_prints)) return;

  // For a bcast the value shown is the one of the first remote channel
  uint8_t shown_endpoint = endpoint_id;
  if (ud_c->is_bcast_channel)
    shown_endpoint = (uint8_t)(ud_c->channel_id == 0 ? 1 : 0);

  printf("$$$ Credits: %s \033[31mdecremented\033[0m to %d", ud_c->qp_name,
         ud_c->credits_per_channels[shown_endpoint]);

  if (ud_c->is_bcast_channel)
    printf(" (all endpoints)\n");
  else
    printf(" (for endpoint %d)\n", shown_endpoint);
}

// Consumes one credit for a send, with no membership filtering.
static inline void
_wings_dec_crds(ud_channel_t* ud_c, uint8_t endpoint_id)
{
  bit_vector_t* const no_membership = NULL;
  _wings_dec_crds_membership(ud_c, endpoint_id, no_membership);
}

// Restores the credits of endpoint_id to num_crds_per_channel of the
// channel that provides credits to ud_c.
static inline void
wings_reset_credits(ud_channel_t* ud_c, uint8_t endpoint_id)
{
  const uint16_t full_crds =
      (uint16_t)ud_c->channel_providing_crds->num_crds_per_channel;
  ud_c->credits_per_channels[endpoint_id] = full_crds;
}

/* ---------------------------------------------------------------------------
----------------------------------- SENDs ------------------------------------
---------------------------------------------------------------------------*/
// Prepares the send work request number crd_pkts_to_send of a credit (CRD)
// channel: addresses it to dst_qp_id, stores crd_to_send in the immediate
// data and links it behind the previous work request of the batch.
// Every ss_granularity packets the work request is signaled, after the
// completion of the previously signaled one has been polled.
static inline void
_wings_forge_crd_wr(ud_channel_t* ud_c, uint16_t dst_qp_id,
                    uint16_t crd_pkts_to_send, uint16_t crd_to_send)
{
  if (WINGS_ENABLE_ASSERTIONS) assert(ud_c->type == CRD);

  struct ibv_send_wr* wr = &ud_c->send_wr[crd_pkts_to_send];

  wr->send_flags = IBV_SEND_INLINE;
  wr->wr.ud.ah = ud_c->remote_qps[dst_qp_id].ah;
  wr->wr.ud.remote_qpn = ud_c->remote_qps[dst_qp_id].qpn;

  // The credit count travels in the immediate data
  wings_crd_t* crd_msg = (wings_crd_t*)&wr->imm_data;
  crd_msg->crd_num = crd_to_send;

  if (ud_c->enable_stats) ud_c->stats.send_total_msgs += crd_to_send;

  // Chain behind the previous work request of this batch
  if (crd_pkts_to_send > 0) ud_c->send_wr[crd_pkts_to_send - 1].next = wr;

  // Selective Signaling --> Do a Signaled Send every ss_granularity pkts
  const int is_signaled_slot =
      (ud_c->total_pkts_send % ud_c->ss_granularity == 0);
  if (is_signaled_slot) {
    // if not the first SS --> poll the previous SS completion
    if (ud_c->total_pkts_send > 0) {
      struct ibv_wc signal_send_wc;
      hrd_poll_cq(ud_c->send_cq, 1, &signal_send_wc);

      if (ud_c->enable_stats) ud_c->stats.ss_completions++;

      if (WINGS_ENABLE_SS_PRINTS && ud_c->enable_prints)
        colored_printf(RED, "^^^ Polled SS completion: %s %d (total %d)\n",
                       ud_c->qp_name, 1, ud_c->stats.ss_completions);
    }

    wr->send_flags |= IBV_SEND_SIGNALED;
    if (WINGS_ENABLE_SS_PRINTS && ud_c->enable_prints)
      colored_printf(RED, "vvv Send SS: %s\n", ud_c->qp_name);
  }

  ud_c->total_pkts_send++;
}

// Adds one message to the packet currently being built and, when that
// message is the first of its packet, prepares the matching send work
// request.
//
//  ud_c                 channel to send on
//  dst_qp_id            destination endpoint (ignored on a bcast channel)
//  req_to_copy          input element handed to copy_and_modify_elem
//  pkts_in_batch        number of packets already completed in this batch;
//                       used as index of the current packet's SGL / WR
//  msgs_in_batch        incremented by one on return
//  copy_and_modify_elem callback that writes the message into the packet
//  is_small_msg         1 --> slot stride is small_msg_size,
//                       otherwise max_msg_size
static inline void
_wings_forge_wr(ud_channel_t* ud_c, uint8_t dst_qp_id, uint8_t* req_to_copy,
                uint16_t pkts_in_batch, uint16_t* msgs_in_batch,
                copy_and_modify_input_elem_t copy_and_modify_elem,
                uint8_t is_small_msg)
// dst_qp_id is ignored if its a bcast channel
{
  struct ibv_wc signal_send_wc;

  // Header-only channels always carry exactly one message per packet
  uint8_t curr_req_num = 1;
  uint8_t* next_req_ptr;

  if (ud_c->is_header_only)
    // The message is written directly into the WR's immediate data
    next_req_ptr = ((wings_hdr_only_t*)&ud_c->send_wr[pkts_in_batch].imm_data)
                       ->inlined_payload;
  else {
    // Append to the current packet of the send buffer
    wings_ud_send_pkt_t* curr_pkt_ptr = _wings_curr_send_pkt_ptr(ud_c);
    next_req_ptr = _wings_get_n_msg_ptr_from_send_pkt(ud_c, curr_pkt_ptr,
                                                      curr_pkt_ptr->req_num);
    curr_req_num = ++curr_pkt_ptr->req_num;
    curr_pkt_ptr->sender_id = ud_c->channel_id;
    uint16_t msg_size =
        is_small_msg == 1 ? ud_c->small_msg_size : ud_c->max_msg_size;
    // Bytes to transmit: packet header plus the messages coalesced so far
    ud_c->send_sgl[pkts_in_batch].length =
        sizeof(wings_ud_send_pkt_t) +
        //                                               ud_c->max_msg_size *
        //                                               curr_pkt_ptr->req_num;
        msg_size * curr_pkt_ptr->req_num;
    if (WINGS_ENABLE_ASSERTIONS)
      assert(is_small_msg == 1 ||
             curr_req_num == 1);  // we only do coalescing for small msgs

    // First message of the packet --> point the SGL at the packet
    if (curr_req_num == 1) {
      ud_c->send_sgl[pkts_in_batch].addr = (uint64_t)curr_pkt_ptr;
#if WINGS_ENABLE_TWO_MSG_SIZES == 1
      curr_pkt_ptr->only_small_msgs = is_small_msg == 1 ? 1 : 0;
#endif
    }
  }

  //<Copy & modify elem!> --> callback func that copies and manipulated data
  // from req_to_copy buff
  copy_and_modify_elem(next_req_ptr, req_to_copy);

  if (WINGS_ENABLE_ASSERTIONS) {
    assert(dst_qp_id != machine_id || ud_c->is_bcast_channel);
    assert(curr_req_num <= ud_c->max_coalescing);
  }

  if (ud_c->enable_stats) ud_c->stats.send_total_msgs++;

  // The work request is set up only once per packet (on its first message)
  if (curr_req_num == 1) {
    if (!ud_c->is_bcast_channel) {  // set the dst qp
      ud_c->send_wr[pkts_in_batch].wr.ud.ah = ud_c->remote_qps[dst_qp_id].ah;
      ud_c->send_wr[pkts_in_batch].wr.ud.remote_qpn =
          ud_c->remote_qps[dst_qp_id].qpn;
    }

    // A bcast packet occupies (num_channels - 1) consecutive WRs, so the
    // first WR of packet k sits at k * (num_channels - 1)
    uint16_t wr_idx =
        (uint16_t)(pkts_in_batch *
                   (ud_c->is_bcast_channel ? ud_c->num_channels - 1 : 1));
    ud_c->send_wr[wr_idx].send_flags =
        ud_c->is_inlining_enabled ? IBV_SEND_INLINE : 0;

    if (wr_idx > 0)  // set previous send_wr to point to curr
      ud_c->send_wr[wr_idx - 1].next = &ud_c->send_wr[wr_idx];

    // Selective Signaling --> Do a Signaled Send every ss_granularity pkts
    if (ud_c->total_pkts_send % ud_c->ss_granularity == 0) {
      // if not the first SS --> poll the previous SS completion
      if (ud_c->total_pkts_send > 0) {
        hrd_poll_cq(ud_c->send_cq, 1, &signal_send_wc);

        if (ud_c->enable_stats) ud_c->stats.ss_completions++;

        if (WINGS_ENABLE_SS_PRINTS && ud_c->enable_prints)
          colored_printf(RED, "^^^ Polled SS completion: %s %d (total %d)\n",
                         ud_c->qp_name, 1, ud_c->stats.ss_completions);
      }

      ud_c->send_wr[wr_idx].send_flags |= IBV_SEND_SIGNALED;
      if (WINGS_ENABLE_SS_PRINTS && ud_c->enable_prints)
        colored_printf(RED, "vvv Send SS: %s\n", ud_c->qp_name);
    }
    ud_c->total_pkts_send++;
  }

  (*msgs_in_batch)++;
}

// Hands a prepared batch of send work requests to the NIC.
//
//  pkts_in_batch  number of packets in the batch; callers in this header
//                 only invoke this with a value > 0 (with 0, the index
//                 wr_idx - 1 below would fall before the WR array)
//  msgs_in_batch  number of messages in the batch (used by the assertions
//                 and prints only)
//
// The WR chain is terminated after the last WR of the batch and posted with
// a single ibv_post_send call. With assertions enabled, the layout of every
// WR in the chain is verified first.
static inline void
_wings_batch_pkts_2_NIC(ud_channel_t* ud_c, uint16_t pkts_in_batch,
                        uint16_t msgs_in_batch)
{
  int ret;
  struct ibv_send_wr* bad_send_wr;

  if (ud_c->enable_stats) ud_c->stats.send_total_pkts += pkts_in_batch;

  // On a bcast channel each packet uses one WR per remote channel
  uint16_t remote_channels = (uint16_t)(ud_c->num_channels - 1);
  uint16_t wr_idx = (uint16_t)(pkts_in_batch *
                               (ud_c->is_bcast_channel ? remote_channels : 1));
  // Terminate the chain after the last WR of this batch
  ud_c->send_wr[wr_idx - 1].next = NULL;

  if (WINGS_ENABLE_ASSERTIONS) {
    assert(pkts_in_batch <= ud_c->max_send_wrs);
    assert(pkts_in_batch <= ud_c->send_pkt_buff_len);
    // Without coalescing every packet carries exactly one message
    assert(ud_c->type == CRD || ud_c->max_coalescing > 1 ||
           msgs_in_batch == pkts_in_batch);
    assert(ud_c->type == CRD || ud_c->max_coalescing > 1 ||
           ud_c->stats.send_total_msgs == ud_c->stats.send_total_pkts);

    assert(ud_c->send_wr[wr_idx - 1].next == NULL);
    for (int i = 0; i < wr_idx; ++i) {
      // All WRs of one bcast packet share the same SGL entry
      uint16_t sgl_idx =
          (uint16_t)(i / (ud_c->is_bcast_channel ? remote_channels : 1));

      if (ud_c->type != CRD && !ud_c->is_header_only) {
        // Payload-carrying channel: one SGE pointing at a non-empty packet
        assert(ud_c->send_wr[i].num_sge == 1);
        assert(ud_c->send_wr[i].opcode == IBV_WR_SEND);
        assert(ud_c->send_wr[i].sg_list == &ud_c->send_sgl[sgl_idx]);

        wings_ud_send_pkt_t* curr_send_pkt =
            (wings_ud_send_pkt_t*)ud_c->send_sgl[sgl_idx].addr;
        assert(curr_send_pkt->req_num > 0);
      } else {
        // CRD / header-only channel: no SGE, data lives in the immediate
        assert(ud_c->send_wr[i].num_sge == 0);
        assert(ud_c->send_wr[i].sg_list->length == 0);
        assert(ud_c->send_wr[i].opcode == IBV_WR_SEND_WITH_IMM);
        if (ud_c->type == CRD) {
          assert(((wings_crd_t*)&(ud_c->send_wr[i].imm_data))->crd_num > 0);
          assert(((wings_crd_t*)&(ud_c->send_wr[i].imm_data))->sender_id ==
                 ud_c->channel_id);
        } else
          assert(((wings_hdr_only_t*)&(ud_c->send_wr[i].imm_data))->sender_id ==
                 ud_c->channel_id);
      }

      assert(ud_c->send_wr[i].wr.ud.remote_qkey == HRD_DEFAULT_QKEY);
      // Every WR but the last must link to its successor
      assert(i == wr_idx - 1 || ud_c->send_wr[i].next == &ud_c->send_wr[i + 1]);
      // With inlining enabled only INLINE (optionally SIGNALED) is expected
      assert(!ud_c->is_inlining_enabled ||
             ud_c->send_wr[i].send_flags == IBV_SEND_INLINE ||
             ud_c->send_wr[i].send_flags ==
                 (IBV_SEND_INLINE | IBV_SEND_SIGNALED));
    }
  }

  if (WINGS_ENABLE_SEND_PRINTS &&
      ud_c->enable_prints)  // TODO make this work w/ bcasts
    colored_printf(CYAN,
                   ">>> Send: %d packets %s %d (Total packets: %d, msgs: %d)\n",
                   pkts_in_batch, ud_c->qp_name, msgs_in_batch,
                   ud_c->stats.send_total_pkts, ud_c->stats.send_total_msgs);

  ret = ibv_post_send(ud_c->qp, ud_c->send_wr, &bad_send_wr);
  CPE(ret, "ibv_post_send error while sending msgs to the NIC", ret);
}

// Closes the current packet: counts it in the batch, flushes the batch to
// the NIC (resetting both counters) once the maximum batch size is reached,
// and finally moves the send push pointer to the next packet.
static inline void
_wings_check_if_batch_n_inc_pkt_ptr(ud_channel_t* ud_c,
                                    uint16_t* pkts_in_batch_ptr,
                                    uint16_t* msgs_in_batch_ptr)
{
  const uint16_t pkts_now = ++(*pkts_in_batch_ptr);
  const uint16_t msgs_now = *msgs_in_batch_ptr;

  // Bcast channels are capped by the PCIe bcast batch, others by the WRs
  uint16_t flush_threshold;
  if (ud_c->is_bcast_channel)
    flush_threshold = ud_c->max_pcie_bcast_batch;
  else
    flush_threshold = ud_c->max_send_wrs;

  if (pkts_now == flush_threshold) {
    _wings_batch_pkts_2_NIC(ud_c, pkts_now, msgs_now);
    *pkts_in_batch_ptr = 0;
    *msgs_in_batch_ptr = 0;
  }

  _wings_inc_send_push_ptr(ud_c);  // go to the next pkt
}

// Packs a sender id (< 128) and the message type into one byte: small
// messages keep the plain id, all others get 128 added to it.
static inline uint8_t
wings_set_sender_id_n_msg_type(uint8_t sender_id, uint8_t is_small_msg)
{
  if (WINGS_ENABLE_ASSERTIONS) {
    assert(sender_id < 128);
    assert(is_small_msg == 0 || is_small_msg == 1);
  }

  if (is_small_msg == 0) return sender_id + 128;
  return sender_id;
}

// Unpacks a byte produced by wings_set_sender_id_n_msg_type.
//
//  skip_or_sender_id  packed value: ids >= 128 denote a non-small message
//  is_small_msg       out: set to 1 for a small message, 0 otherwise
//
// Returns the plain sender id (the packed value minus 128 for non-small
// messages). No range assertion is needed: every uint8_t value is a valid
// packed value (the former "< 258" check could never fail).
static inline uint8_t
_wings_get_sender_id_n_msg_type(uint8_t skip_or_sender_id,
                                uint8_t* is_small_msg)
{
  *is_small_msg = (skip_or_sender_id >= 128) ? 0 : 1;
  return (skip_or_sender_id >= 128) ? skip_or_sender_id - 128
                                    : skip_or_sender_id;
}

// Walks an array of input elements and sends a message for every element
// that requests one, coalescing small messages into packets and batching
// packets towards the NIC.
//
//  ud_c                     channel to send on
//  membership               optional (may be NULL) membership used for the
//                           credit checks of bcast channels
//  input_array_of_elems     array of input_array_len elements, each
//                           size_of_input_elems bytes long
//  input_array_rolling_idx  optional (may be NULL) start offset into the
//                           array; updated to the element at which sending
//                           stopped for lack of credits
//  skip_or_get_sender_id_func_ptr
//                           returns < 0 to skip an element, otherwise the
//                           packed destination id / message type (see
//                           wings_set_sender_id_n_msg_type)
//  modify_elem_after_send   invoked on each element after its message has
//                           been forged
//  copy_and_modify_elem     writes the message of an element into the packet
//
// Returns 1 if the loop stopped early because credits ran out, else 0.
static inline uint8_t
wings_issue_pkts(ud_channel_t* ud_c, bit_vector_t* membership,
                 uint8_t* input_array_of_elems, uint16_t input_array_len,
                 uint16_t size_of_input_elems,
                 uint16_t* input_array_rolling_idx,
                 skip_input_elem_or_get_dst_id_t skip_or_get_sender_id_func_ptr,
                 modify_input_elem_after_send_t modify_elem_after_send,
                 copy_and_modify_input_elem_t copy_and_modify_elem)
{
  uint8_t curr_msg_dst;
  uint8_t is_small_msg = 0;
  uint8_t last_msg_dst = 255;
  uint8_t has_outstanding_msgs = 0;
  uint16_t msgs_in_batch = 0, pkts_in_batch = 0, idx = 0;

  // The current packet must be empty when we start
  if (WINGS_ENABLE_ASSERTIONS)
    assert(ud_c->is_header_only ||
           _wings_curr_send_pkt_ptr(ud_c)->req_num == 0);

  for (int i = 0; i < input_array_len; i++) {
    idx = (uint16_t)(input_array_rolling_idx == NULL
                         ? i
                         : (i + *input_array_rolling_idx) % input_array_len);

    // Skip or Respond (copy and send ?)
    uint8_t* curr_elem = &input_array_of_elems[idx * size_of_input_elems];
    int skip_or_sender_id = skip_or_get_sender_id_func_ptr(curr_elem);
    if (skip_or_sender_id < 0) continue;

    // The value is narrowed to uint8_t right below, so it must fit in a
    // byte (a bound above 256 would let 256 / 257 silently wrap to 0 / 1)
    if (WINGS_ENABLE_ASSERTIONS) assert(skip_or_sender_id < 256);

    curr_msg_dst =
        _wings_get_sender_id_n_msg_type(skip_or_sender_id, &is_small_msg);
    if (ud_c->is_header_only) is_small_msg = 1;

    // Break if we do not have sufficient credits
    if (!_wings_has_sufficient_crds_membership(ud_c, curr_msg_dst,
                                               membership)) {
      has_outstanding_msgs = 1;
      if (ud_c->enable_stats) ud_c->stats.no_stalls_due_to_credits++;

      if (input_array_rolling_idx != NULL) *input_array_rolling_idx = idx;
      break;  // we need to break for broadcast (lets assume it is ok to break
              // for unicasts as well since it may only harm perf)
    }

    _wings_dec_crds_membership(ud_c, curr_msg_dst, membership);

    if ((!ud_c->is_bcast_channel && !ud_c->is_header_only) ||
        is_small_msg == 0) {
      // Close the partially filled packet when the next message cannot be
      // coalesced into it: it is not a small message, or it goes to a
      // different endpoint than the previous one
      if (_wings_curr_send_pkt_ptr(ud_c)->req_num > 0 &&
          (is_small_msg == 0 || curr_msg_dst != last_msg_dst))
        _wings_check_if_batch_n_inc_pkt_ptr(ud_c, &pkts_in_batch,
                                            &msgs_in_batch);
    }

    last_msg_dst = curr_msg_dst;

    // Create the messages
    _wings_forge_wr(ud_c, curr_msg_dst, curr_elem, pkts_in_batch,
                    &msgs_in_batch, copy_and_modify_elem, is_small_msg);

    modify_elem_after_send(curr_elem);  // E.g. Change the state of the element
                                        // which triggered a send

    // Check if we should send a batch since we might have reached the max batch
    // size
    if (is_small_msg == 0 || ud_c->is_header_only ||
        _wings_curr_send_pkt_ptr(ud_c)->req_num == ud_c->max_coalescing) {
      _wings_check_if_batch_n_inc_pkt_ptr(ud_c, &pkts_in_batch, &msgs_in_batch);
    }
  }

  // Even if the last pkt is not full do the appropriate actions and incl to NIC
  // batch
  wings_ud_send_pkt_t* curr_pkt_ptr = NULL;

  if (!ud_c->is_header_only && is_small_msg == 1) {
    curr_pkt_ptr = _wings_curr_send_pkt_ptr(ud_c);
    if (curr_pkt_ptr->req_num > 0 &&
        curr_pkt_ptr->req_num < ud_c->max_coalescing)
      pkts_in_batch++;
  }

  // Force a batch to send the last set of requests (even < max batch size)
  if (pkts_in_batch > 0)
    _wings_batch_pkts_2_NIC(ud_c, pkts_in_batch, msgs_in_batch);

  if (!ud_c->is_header_only && is_small_msg == 1)
    // Move to next packet and reset data left from previous bcasts/unicasts
    if (curr_pkt_ptr->req_num > 0 &&
        curr_pkt_ptr->req_num < ud_c->max_coalescing)
      _wings_inc_send_push_ptr(ud_c);

  return has_outstanding_msgs;
}

/// Issues explicit credit messages over a CRD channel.
/// Every element of input_array_of_elems (input_array_len entries, each
/// size_of_input_elems bytes) is passed to skip_or_get_sender_id_func_ptr; a
/// negative result skips the element, otherwise the result is the endpoint
/// that gets one more credit and modify_elem_after_send is applied to the
/// element. Finally one credit packet is forged per endpoint with a non-zero
/// tally (the local channel_id excluded) and the packets are handed to the NIC
/// in batches of at most max_send_wrs.
static inline void
wings_issue_credits(
    ud_channel_t* ud_c, bit_vector_t* membership, uint8_t* input_array_of_elems,
    uint16_t input_array_len, uint16_t size_of_input_elems,
    skip_input_elem_or_get_dst_id_t skip_or_get_sender_id_func_ptr,
    modify_input_elem_after_send_t modify_elem_after_send)
{
  if (WINGS_ENABLE_ASSERTIONS) assert(ud_c->type == CRD);

  // Step 1: start from an empty per-endpoint credit tally
  for (int ch = 0; ch < ud_c->num_channels; ++ch)
    ud_c->no_crds_to_send_per_endpoint[ch] = 0;

  // Step 2: count the credits owed to each endpoint
  for (int elem_idx = 0; elem_idx < input_array_len; ++elem_idx) {
    uint8_t* elem = &input_array_of_elems[elem_idx * size_of_input_elems];
    int skip_or_sender_id = skip_or_get_sender_id_func_ptr(elem);
    if (WINGS_ENABLE_ASSERTIONS) assert(skip_or_sender_id < 255);

    if (skip_or_sender_id >= 0) {
      uint8_t dst = (uint8_t)skip_or_sender_id;

      // Credits for CRD messages are expected to always be available
      if (!_wings_has_sufficient_crds_membership(ud_c, dst, membership))
        assert(0);
      if (ud_c->no_crds_to_send_per_endpoint[dst] == 0 &&
          ud_c->credits_per_channels[dst] == 0)
        assert(0);

      _wings_dec_crds_membership(ud_c, dst, membership);

      ud_c->no_crds_to_send_per_endpoint[dst]++;

      // E.g. change the state of the element which triggered the credit
      modify_elem_after_send(elem);
    }
  }

  // Step 3: forge one credit packet per endpoint and flush in batches
  uint16_t pkts_pending = 0;
  uint16_t crds_pending = 0;
  for (uint16_t ep = 0; ep < ud_c->num_channels; ++ep) {
    if (ep != ud_c->channel_id &&
        ud_c->no_crds_to_send_per_endpoint[ep] > 0) {
      _wings_forge_crd_wr(ud_c, ep, pkts_pending,
                          ud_c->no_crds_to_send_per_endpoint[ep]);
      pkts_pending++;
      crds_pending += ud_c->no_crds_to_send_per_endpoint[ep];

      // Flush as soon as the batch holds max_send_wrs packets
      if (pkts_pending == ud_c->max_send_wrs) {
        _wings_batch_pkts_2_NIC(ud_c, pkts_pending, crds_pending);
        pkts_pending = 0;
        crds_pending = 0;
      }
    }
  }

  // Flush whatever is left in a partially filled batch
  if (pkts_pending > 0)
    _wings_batch_pkts_2_NIC(ud_c, pkts_pending, crds_pending);
}

#endif  // WINGS_INTERNAL_INLINES_H


================================================
FILE: include/wings/wings_api.h
================================================
//
// Created by akatsarakis on 06/02/19.
//

#ifndef WINGS_API_H
#define WINGS_API_H
#include "../utils/bit_vector.h"
#include "hrd.h"

/// WARNING!!
/// Functions that are accessible but not declared below (i.e. they exist only
/// in wings.h and not in wings_api.h) and whose names start with an underscore
/// (i.e. "_wings_*") are internal and should not be called directly by the
/// application.

// Compile-time configuration of the wings library (1 = enabled, 0 = disabled)
#define WINGS_ENABLE_ASSERTIONS 0
// NOTE(review): presumably the largest payload (bytes) that may be inlined
// in a send -- confirm against the NIC limits
#define WINGS_MAX_SUPPORTED_INLINING 187
#define WINGS_ENABLE_BATCH_POST_RECVS_TO_NIC 1

#define WINGS_ENABLE_STAT_COUNTING 1

#define WINGS_MIN_PCIE_BCAST_BATCH 1
// Minimum of x and y. Arguments are parenthesized so that operator precedence
// inside an argument expression cannot change the result. Note that each
// argument may still be evaluated twice, so avoid side effects in them.
#define WINGS_MIN(x, y) ((x) < (y) ? (x) : (y))

// Debug prints: every specific print toggle is additionally gated by the
// global WINGS_ENABLE_PRINTS switch
#define WINGS_ENABLE_PRINTS 0
#define WINGS_ENABLE_SS_PRINTS (1 && WINGS_ENABLE_PRINTS)
#define WINGS_ENABLE_SEND_PRINTS (1 && WINGS_ENABLE_PRINTS)
#define WINGS_ENABLE_RECV_PRINTS (1 && WINGS_ENABLE_PRINTS)
#define WINGS_ENABLE_CREDIT_PRINTS (1 && WINGS_ENABLE_PRINTS)
#define WINGS_ENABLE_POST_RECV_PRINTS (1 && WINGS_ENABLE_PRINTS)

#define WINGS_IS_ROCE 0
#define MAX_MTU_SIZE 4096

/* Useful when `x = (x + 1) % N` is done in a loop: increments x and wraps it
   back to 0 once it reaches N. x must be a modifiable lvalue and is evaluated
   more than once. Both arguments are parenthesized so that an expression
   argument (e.g. a conditional expression for N) keeps its meaning. */
#define WINGS_MOD_ADD(x, N)  \
  do {                       \
    (x) = (x) + 1;           \
    if ((x) == (N)) (x) = 0; \
  } while (0)

/* Addressing info of a QP (the channel stores an array of these in
   remote_qps). The ah pointer and qpn are accessed together in the critical
   path, so they are kept next to each other in the same cache line */
typedef struct {
  struct ibv_ah* ah;  // verbs address handle
  uint32_t qpn;       // NOTE(review): presumably the QP number -- confirm
  // no padding needed- false sharing is not an issue, only fragmentation
} qp_info_t;

// Packet header followed by the requests carried by the packet
typedef struct {
  uint8_t only_small_msgs : 1;  // 1-bit flag; NOTE(review): presumably set
                                // when the pkt holds only small msgs -- confirm
  uint8_t sender_id : 7;  // 7 bits: support for up to 128 unique senders per
                          // instance (e.g. thread)
  uint8_t req_num;        // <= max_coalescing of a channel
  uint8_t reqs[];         // flexible array member; NOTE(review): presumably
                          // req_num * req_size bytes -- confirm
} wings_pkt_t, wings_ud_send_pkt_t;

// Received packets: a GRH followed by the wings packet itself.
// The struct is packed, so no padding is inserted between the two members.
typedef struct {
  struct ibv_grh grh;
  wings_pkt_t pkt;
} __attribute__((packed))
wings_ud_recv_pkt_t;  // received rdma ud pkts come with a grh padding

// Explicit credit message (packed: 1-byte sender id + 2-byte credit count)
typedef struct {
  uint8_t sender_id;  // support for up to 256 unique senders per instance (e.g.
                      // thread)
  uint16_t crd_num;   // number of credits
} __attribute__((packed)) wings_crd_t;  // always sent as inlined_payload

// Header-only message: a sender id plus 3 bytes usable by the application
typedef struct {
  uint8_t sender_id;  // support for up to 256 unique senders per instance (e.g.
                      // thread)
  uint8_t inlined_payload[3];  // available space to be used by the application
} __attribute__((packed)) wings_hdr_only_t;  // always sent as inlined_payload

// The header-only message must occupy exactly 4 bytes
static_assert(sizeof(wings_hdr_only_t) == 4 * sizeof(uint8_t), "");

// Per-channel counters (embedded in ud_channel_t as `stats`)
typedef struct {
  // Send side
  uint64_t send_total_msgs;
  uint64_t send_total_pkts;
  uint64_t send_total_pcie_batches;

  // NOTE(review): presumably the number of selective-signaling completions
  // -- confirm
  uint64_t ss_completions;
  // Receive side
  uint64_t recv_total_msgs;
  uint64_t recv_total_pkts;

  uint64_t
      no_stalls_due_to_credits;  // number of stalls due to not enough credits
} ud_channel_stats_t;

// Kind of a ud channel; CRD channels are the ones used by wings_issue_credits
// (NOTE(review): REQ / RESP presumably stand for request / response -- confirm)
enum channel_type { REQ, RESP, CRD };

// State of a single ud channel: configuration, credits, send / recv buffers,
// verbs resources, remote QP info and statistics
typedef struct _ud_channel_t {
  struct ibv_qp* qp;

  // Configuration
  enum channel_type type;
  uint8_t max_coalescing;
  uint8_t expl_crd_ctrl;
  uint8_t disable_crd_ctrl;
  uint8_t is_header_only;
  uint8_t is_bcast_channel;
  uint8_t is_inlining_enabled;
  struct _ud_channel_t* channel_providing_crds;

  char* qp_name;
  uint16_t qp_id;  // id of qp in cb
  uint16_t max_msg_size;
  uint16_t small_msg_size;

  uint8_t channel_id;     // id of the curr channel (e.g. local node id)
  uint16_t num_channels;  // e.g. remote nodes + local node
  uint16_t num_crds_per_channel;
  uint16_t* credits_per_channels;  // array of size num_channels denoting
                                   // available space on remote sides
  /// Credits refer to msgs irrespective of whether they are coalesced or not
  /// --> a remote buffer must be able to handle
  /// max_number_of_msgs * max_coalescing

  volatile uint8_t* recv_pkt_buff;  /// Intermediate buff where reqs are copied
                                    /// when pkts are received
  wings_ud_send_pkt_t* send_pkt_buff;  /// Intermediate buff where reqs are
                                       /// copied when pkts are sent

  uint16_t send_pkt_buff_len;
  uint16_t recv_pkt_buff_len;

  uint16_t max_send_wrs;
  uint16_t max_recv_wrs;

  uint16_t send_q_depth;
  uint16_t recv_q_depth;

  uint16_t ss_granularity;  // selective signaling granularity
  uint16_t max_pcie_bcast_batch;

  uint64_t total_pkts_send;  // used for selective signaling

  // Positions in the send / recv packet buffers
  int send_push_ptr;
  int recv_push_ptr;
  int recv_pull_ptr;

  struct ibv_send_wr* send_wr;
  struct ibv_recv_wr* recv_wr;  // Used only to batch post recvs to the NIC

  struct ibv_sge* send_sgl;
  struct ibv_sge* recv_sgl;  // Used only to batch post recvs to the NIC

  struct ibv_cq* send_cq;
  struct ibv_cq* recv_cq;
  struct ibv_wc* recv_wc;  // (size of max_recv_wrs) Used on polling recv req cq
                           // (only for immediates)

  /// Send wcs are omitted since they are only used for selective signaling
  /// (within send function calls)

  struct ibv_mr* send_mem_region;  // NULL if inlining is enabled

  struct ibv_pd* pd;  // A protection domain for this ud channel

  // Remote QPs
  qp_info_t* remote_qps;

  // Used only for type == CRD: credits to send to each endpoint
  uint16_t* no_crds_to_send_per_endpoint;

  // Stats
  ud_channel_stats_t stats;

  uint8_t enable_overflow_msgs;
  uint8_t num_overflow_msgs;   // msgs in overflow_msg_buff always <=
                               // max_coalescing - 1
  uint8_t* overflow_msg_buff;  // used to keep messages when a polled pkt does
                               // not fit in the recv array

  // Toggles
  uint8_t enable_stats;
  uint8_t enable_prints;
} ud_channel_t;

// Callback types supplied by the application when issuing pkts / credits

// Invoked on an input element after it has triggered a send
typedef void (*modify_input_elem_after_send_t)(uint8_t*);
// Invoked on every input element to decide whether it triggers a send
typedef int (*skip_input_elem_or_get_dst_id_t)(
    uint8_t*);  // Should return -1 to skip otherwise returns the sender id
// Fills the outgoing message (msg_to_send) from the element that triggered it
typedef void (*copy_and_modify_input_elem_t)(uint8_t* msg_to_send,
                                             uint8_t* triggering_req);

/// No-op modify_input_elem_after_send_t callback: leaves the element untouched
static inline void
wings_NOP_modify_elem_after_send(uint8_t* req)
{
  (void)req;  // intentionally unused
}

/// Init and Util functions

// Prints an overview of the given channel
void wings_print_ud_c_overview(ud_channel_t* ud_c);

void wings_ud_channel_destroy(
    ud_channel_t* ud_c);  // This must be used to destroy all ud_c (both CRD and
                          // typical ud_c)

// This is used to init only non-CRD channels (CRDs are initialized internally)
void wings_ud_channel_init(ud_channel_t* ud_c, char* qp_name,
                           enum channel_type type, uint8_t max_coalescing,
                           uint16_t max_req_size, uint16_t small_req_size,
                           uint8_t enable_inlining, uint8_t is_header_only,
                           uint8_t is_bcast,
                           // Credits
                           uint8_t disable_crd_ctrl, uint8_t expl_crd_ctrl,
                           ud_channel_t* linked_channel,
                           uint16_t crds_per_channel, uint16_t num_channels,
                           uint8_t channel_id,
                           // Toggles
                           uint8_t stats_on, uint8_t prints_on);

// Takes an array of ud_c_num channel pointers.
// NOTE(review): presumably creates their QPs and posts the initial receives
// -- confirm against the definition
void wings_setup_channel_qps_and_recvs(ud_channel_t** ud_c_array,
                                       uint16_t ud_c_num,
                                       dbit_vector_t* shared_rdy_var,
                                       uint16_t worker_lid);

/// Main functions (static inline: declared here, defined elsewhere)

// NOTE(review): presumably polls up to max_pkts_to_poll pkts into
// recv_buff_space and returns the number polled -- confirm against the
// definition
static inline uint16_t wings_poll_buff_and_post_recvs(ud_channel_t* ud_c,
                                                      uint16_t max_pkts_to_poll,
                                                      uint8_t* recv_buff_space);

// Issues pkts for the elements of input_array_of_elems, driven by the three
// application callbacks
static inline uint8_t wings_issue_pkts(
    ud_channel_t* ud_c, bit_vector_t* membership, uint8_t* input_array_of_elems,
    uint16_t input_array_len, uint16_t size_of_input_elems,
    uint16_t* input_array_rolling_idx,
    skip_input_elem_or_get_dst_id_t skip_or_get_sender_id_func_ptr,
    modify_input_elem_after_send_t modify_elem_after_send,
    copy_and_modify_input_elem_t copy_and_modify_elem);

// Issues explicit credits (for channels of type CRD) for the elements of
// input_array_of_elems
static inline void wings_issue_credits(
    ud_channel_t* ud_c, bit_vector_t* membership, uint8_t* input_array_of_elems,
    uint16_t input_array_len, uint16_t size_of_input_elems,
    skip_input_elem_or_get_dst_id_t skip_or_get_sender_id_func_ptr,
    modify_input_elem_after_send_t modify_elem_after_send);

#endif  // WINGS_API_H


================================================
FILE: src/CR/crKV.c
================================================
//
// Created by akatsarakis on 07/03/19.
//

#include <spacetime.h>
#include <util.h>

//////////////////////////////////////////////////
////////////////////  Chain Replication / CRAQ KVS
//////////////////////////////////////////////////

//////////////////////////////////////////////////
//////////// Helper functions ////////////////////
// Returns the id of the head of the chain, which is always node 0.
// Declared with (void) so the function has a proper prototype.
static inline uint8_t
head_id(void)
{
  return 0;
}

// Returns the id of the tail of the chain, i.e. the last node
// (machine_num - 1).
// Declared with (void) so the function has a proper prototype.
static inline uint8_t
tail_id(void)
{
  return machine_num - 1;
}

//////////// Assertion functions
// Sanity checks on an invalidation: its version must be even, its opcode must
// be ST_OP_INV or ST_OP_MEMBERSHIP_CHANGE, and its value length must equal
// ST_VALUE_SIZE
static inline void
cr_assertions_inv(spacetime_inv_t* inv_ptr)
{
  assert(inv_ptr->op_meta.ts.version % 2 == 0);
  assert(inv_ptr->op_meta.opcode == ST_OP_INV ||
         inv_ptr->op_meta.opcode == ST_OP_MEMBERSHIP_CHANGE);
  assert(inv_ptr->op_meta.val_len == ST_VALUE_SIZE);
}

//////////// Skip functions

// Local-op filter: returns 1 (skip) when the op state is ST_PUT_SUCCESS,
// ST_IN_PROGRESS_GET or ST_IN_PROGRESS_PUT, and 0 otherwise
static inline uint8_t
cr_skip_op(spacetime_op_t* op_ptr)
{
  if (op_ptr->op_meta.state == ST_PUT_SUCCESS) return 1;
  if (op_ptr->op_meta.state == ST_IN_PROGRESS_GET) return 1;
  if (op_ptr->op_meta.state == ST_IN_PROGRESS_PUT) return 1;
  return 0;
}

// Invalidation filter: returns 1 (skip) only for membership-change messages
static inline uint8_t
cr_skip_inv(spacetime_inv_t* inv_ptr)
{
  if (inv_ptr->op_meta.opcode == ST_OP_MEMBERSHIP_CHANGE) return 1;
  return 0;
}

// Ack filter: returns 1 (skip) only for membership-change messages
static inline uint8_t
cr_skip_ack(spacetime_ack_t* ack_ptr)
{
  if (ack_ptr->opcode == ST_OP_MEMBERSHIP_CHANGE) return 1;
  return 0;
}

// Remote-read filter: returns 1 (skip) for empty slots, 0 otherwise
static inline uint8_t
cr_skip_remote_reads(spacetime_op_t* op_ptr)
{
  if (op_ptr->op_meta.state == ST_EMPTY) return 1;
  return 0;
}

// Remote-write filter: returns 1 (skip) when the op state is ST_EMPTY,
// ST_PUT_SUCCESS or ST_IN_PROGRESS_PUT, and 0 otherwise
static inline uint8_t
cr_skip_remote_writes(spacetime_op_t* op_ptr)
{
  if (op_ptr->op_meta.state == ST_EMPTY) return 1;
  if (op_ptr->op_meta.state == ST_PUT_SUCCESS) return 1;
  if (op_ptr->op_meta.state == ST_IN_PROGRESS_PUT) return 1;
  return 0;
}

//////////// Exec functions
// Executes a PUT against the KV entry kv_ptr (asserted to run only on the
// head). On success the entry becomes INVALID_STATE, its value is overwritten
// and the op is marked ST_PUT_SUCCESS; if the entry is already invalid and
// CR_ENABLE_BLOCKING_INVALID_WRITES_ON_HEAD is set, the op is marked
// ST_PUT_STALL instead and the entry is left unchanged.
static inline void
cr_exec_write(spacetime_op_t* op_ptr, struct mica_op* kv_ptr)
{
  // The object meta sits at the start of the stored value; the actual value
  // bytes follow right after it
  spacetime_object_meta* curr_meta = (spacetime_object_meta*)kv_ptr->value;
  uint8_t* kv_value_ptr = (uint8_t*)&curr_meta[1];

  if (ENABLE_ASSERTIONS) {
    assert(machine_id == head_id());  // Only head must exec writes
    assert(op_ptr->op_meta.opcode == ST_OP_PUT);
    assert(op_ptr->op_meta.val_len == ST_VALUE_SIZE);
  }

  op_ptr->op_meta.state = ST_EMPTY;

  cctrl_lock(&curr_meta->cctrl);
  switch (curr_meta->state) {
    case INVALID_STATE:
      // Do not initiate a new write until you get to valid state
      if (CR_ENABLE_BLOCKING_INVALID_WRITES_ON_HEAD) {
        cctrl_unlock_dec_version(&curr_meta->cctrl);
        op_ptr->op_meta.state = ST_PUT_STALL;
        break;
      }
      // Intentional fall-through: when blocking is disabled, a write on an
      // invalid entry is handled exactly like a write on a valid one
    case VALID_STATE:
      curr_meta->state = INVALID_STATE;
      memcpy(kv_value_ptr, op_ptr->value, ST_VALUE_SIZE);
      kv_ptr->val_len = op_ptr->op_meta.val_len + sizeof(spacetime_object_meta);

      // NOTE(review): presumably stores the new version into the op's ts
      // while releasing the lock -- confirm in concur_ctrl.h
      cctrl_unlock_inc_version(&curr_meta->cctrl, (uint8_t)machine_id,
                               (uint32_t*)&(op_ptr->op_meta.ts.version));

      op_ptr->op_meta.state = ST_PUT_SUCCESS;
      op_ptr->op_meta.ts.tie_breaker_id = (uint8_t)machine_id;
      break;
    default:
      assert(0);
  }
}

// Serves a GET that was forwarded to this node (asserted to be the tail):
// copies the stored value into the op and marks it ST_GET_COMPLETE. The entry
// is expected to be in VALID_STATE; any other state trips assert(0).
static inline void
cr_exec_remote_reads(spacetime_op_t* op_ptr, struct mica_op* kv_ptr)
{
  if (ENABLE_ASSERTIONS) {
    assert(machine_id == tail_id());
    assert(op_ptr->op_meta.opcode == ST_OP_GET);
  }

  // the following variables are used to validate the atomicity of a lock-free
  // read of an object
  spacetime_object_meta prev_meta;
  spacetime_object_meta* curr_meta = (spacetime_object_meta*)kv_ptr->value;
  uint8_t* kv_value_ptr = (uint8_t*)&curr_meta[1];

  // Retry until the meta snapshot taken before the copy still matches (and is
  // valid) after it
  do {
    prev_meta = *curr_meta;
    // switch template with all states
    switch (curr_meta->state) {
      case VALID_STATE:
        memcpy(op_ptr->value, kv_value_ptr, ST_VALUE_SIZE);
        op_ptr->op_meta.state = ST_GET_COMPLETE;
        op_ptr->op_meta.val_len =
            kv_ptr->val_len - sizeof(spacetime_object_meta);
        break;
      case INVALID_STATE:
      default:
        assert(0);
    }
  } while (
      !cctrl_timestamp_is_same_and_valid(&prev_meta.cctrl, &curr_meta->cctrl));
}

// Executes a local op against the KV entry kv_ptr.
//  - GET: lock-free read; completes with ST_GET_COMPLETE when the entry is
//    valid, otherwise the op is marked ST_GET_STALL.
//  - PUT: executed through cr_exec_write on the head; on any other node the
//    op is only marked ST_PUT_SUCCESS.
// idx is the position of the op in its buffer and is recorded in buff_idx for
// stalled GETs and successful PUTs.
static inline void
cr_exec_op(spacetime_op_t* op_ptr, struct mica_op* kv_ptr, uint8_t idx)
{
  if (ENABLE_ASSERTIONS) assert(idx < max_batch_size);

  // the following variables are used to validate the atomicity of a lock-free
  // read of an object
  spacetime_object_meta prev_meta;
  spacetime_object_meta* curr_meta = (spacetime_object_meta*)kv_ptr->value;
  uint8_t* kv_value_ptr = (uint8_t*)&curr_meta[1];

  if (op_ptr->op_meta.opcode == ST_OP_GET) {
    // Lock free reads through versioning (successful when version is even)
    op_ptr->op_meta.state = ST_EMPTY;

    do {
      prev_meta = *curr_meta;
      // switch template with all states
      switch (curr_meta->state) {
        case VALID_STATE:
          memcpy(op_ptr->value, kv_value_ptr, ST_VALUE_SIZE);
          op_ptr->op_meta.state = ST_GET_COMPLETE;
          op_ptr->op_meta.val_len =
              kv_ptr->val_len - sizeof(spacetime_object_meta);
          break;
        case INVALID_STATE:
          if (ENABLE_ASSERTIONS)
            assert(machine_id != tail_id());  // tail should always be valid
          op_ptr->op_meta.state = ST_GET_STALL;
          break;
        default:
          assert(0);
      }
    } while (!cctrl_timestamp_is_same_and_valid(&prev_meta.cctrl,
                                                &curr_meta->cctrl));

    // Remember the buffer slot of a stalled GET
    if (op_ptr->op_meta.state == ST_GET_STALL) op_ptr->buff_idx = idx;

  }

  else if (op_ptr->op_meta.opcode == ST_OP_PUT) {
    if (machine_id == head_id())  // if it is head
      cr_exec_write(op_ptr, kv_ptr);
    else
      op_ptr->op_meta.state = ST_PUT_SUCCESS;

    if (op_ptr->op_meta.state == ST_PUT_SUCCESS)
      // Set idx that we cannot set while dispatching the req
      op_ptr->buff_idx = idx;
  }
}

// Marks the in-progress local PUT stored at read_write_op[idx] as completed
// (ST_PUT_COMPLETE). key is only used for the sanity check that the slot
// holds the expected key.
static inline void
cr_complete_local_write(spacetime_op_t* read_write_op, uint8_t idx,
                        const uint64_t* key)
{
  /// completed read / write --> remove it from the ops buffer
  if (ENABLE_ASSERTIONS) {
    assert(read_write_op[idx].op_meta.state == ST_IN_PROGRESS_PUT);
    assert(((uint64_t*)&read_write_op[idx].op_meta.key)[0] == key[0]);
  }

  spacetime_op_t* finished_op = &read_write_op[idx];

  // Only PUTs are expected to be completed through this path
  if (finished_op->op_meta.opcode != ST_OP_PUT) {
    assert(0);
    return;
  }

  finished_op->op_meta.state = ST_PUT_COMPLETE;
}

// Applies a received invalidation to the KV entry kv_ptr. If the inv carries
// a timestamp larger than the local one, the value and timestamp of the entry
// are overwritten and, on any node but the tail, the entry is marked
// INVALID_STATE. The inv opcode is always set to ST_INV_SUCCESS, and when this
// node is both the initiator and the tail the matching local write in
// read_write_op is completed.
static inline void
cr_exec_inv(spacetime_inv_t* inv_ptr, struct mica_op* kv_ptr,
            spacetime_op_t* read_write_op)
{
  // the following variables are used to validate the atomicity of a lock-free
  // read of an object
  spacetime_object_meta lock_free_meta;
  spacetime_object_meta* curr_meta = (spacetime_object_meta*)kv_ptr->value;
  uint8_t* kv_value_ptr = (uint8_t*)&curr_meta[1];
  if (ENABLE_ASSERTIONS) assert(inv_ptr->op_meta.opcode == ST_OP_INV);

  uint32_t debug_cntr = 0;
  do {  // Lock free read of keys meta
    if (ENABLE_ASSERTIONS) {
      debug_cntr++;
      if (debug_cntr == M_4) {
        printf("Worker stuck on a lock-free read (for INV)\n");
        debug_cntr = 0;
      }
    }
    lock_free_meta = *curr_meta;
  } while (!cctrl_timestamp_is_same_and_valid(&lock_free_meta.cctrl,
                                              &curr_meta->cctrl));

  // lock and proceed iff remote.TS >= local.TS
  // inv TS >= local timestamp
  if (!timestamp_is_smaller(inv_ptr->op_meta.ts.version,
                            inv_ptr->op_meta.ts.tie_breaker_id,
                            lock_free_meta.cctrl.ts.version,
                            lock_free_meta.cctrl.ts.tie_breaker_id)) {
    // Lock and check again if inv TS > local timestamp
    cctrl_lock(&curr_meta->cctrl);
    /// Warning: curr_meta version - 1 is used below since (per the original
    /// note) locking increases curr_meta->version by 1
    if (timestamp_is_smaller(
            curr_meta->cctrl.ts.version - 1, curr_meta->cctrl.ts.tie_breaker_id,
            inv_ptr->op_meta.ts.version, inv_ptr->op_meta.ts.tie_breaker_id)) {
      /// Update Value and TS
      kv_ptr->val_len =
          inv_ptr->op_meta.val_len + sizeof(spacetime_object_meta);
      if (ENABLE_ASSERTIONS) {
        assert(inv_ptr->op_meta.val_len == ST_VALUE_SIZE >> SHIFT_BITS);
      }
      memcpy(kv_value_ptr, inv_ptr->value, ST_VALUE_SIZE);
      /// Update state

      switch (curr_meta->state) {
        case VALID_STATE:
          if (machine_id != tail_id())  // Tail never gets invalid
            curr_meta->state = INVALID_STATE;
          break;
        case INVALID_STATE:
          break;
        default:
          assert(0);
      }
      // Release the lock while adopting the timestamp of the inv
      cctrl_unlock_custom_version(&curr_meta->cctrl,
                                  inv_ptr->op_meta.ts.tie_breaker_id,
                                  inv_ptr->op_meta.ts.version);
    } else if (timestamp_is_equal(curr_meta->cctrl.ts.version - 1,
                                  curr_meta->cctrl.ts.tie_breaker_id,
                                  inv_ptr->op_meta.ts.version,
                                  inv_ptr->op_meta.ts.tie_breaker_id))
      // NOTE(review): an inv with a timestamp equal to the local one is
      // treated as impossible; no unlock is issued on this branch, so if
      // assert() is compiled out the lock appears to stay held -- confirm
      assert(0);
    else
      // Local timestamp is larger: drop the lock without changing the entry
      cctrl_unlock_dec_version(&curr_meta->cctrl);
  }
  inv_ptr->op_meta.opcode = ST_INV_SUCCESS;

  if (inv_ptr->op_meta.initiator == machine_id && machine_id == tail_id())
    cr_complete_local_write(read_write_op, inv_ptr->buff_idx,
                            (uint64_t*)&inv_ptr->op_meta.key);

  if (ENABLE_ASSERTIONS) assert(inv_ptr->op_meta.opcode == ST_INV_SUCCESS);
}

// Applies a received ack to the KV entry kv_ptr (never executed on the tail).
// If the ack carries the timestamp of the entry, the entry moves from
// INVALID_STATE back to VALID_STATE. When this node initiated the write, the
// matching local write in read_write_op is completed. The ack opcode always
// ends up as ST_LAST_ACK_SUCCESS.
static inline void
cr_exec_ack(spacetime_ack_t* ack_ptr, struct mica_op* kv_ptr,
            spacetime_op_t* read_write_op)
{
  if (ENABLE_ASSERTIONS) assert(machine_id != tail_id());

  // the following variables are used to validate the atomicity of a lock-free
  // read of an object
  spacetime_object_meta lock_free_read_meta;
  spacetime_object_meta* curr_meta = (spacetime_object_meta*)kv_ptr->value;
  if (ack_ptr->opcode != ST_OP_ACK) assert(0);

  uint32_t debug_cntr = 0;
  do {  // Lock free read of keys meta
    if (ENABLE_ASSERTIONS) {
      debug_cntr++;
      if (debug_cntr == M_4) {
        printf("Worker stuck on a lock-free read (for ACK)\n");
        debug_cntr = 0;
      }
    }
    lock_free_read_meta = *curr_meta;
  } while (!cctrl_timestamp_is_same_and_valid(&lock_free_read_meta.cctrl,
                                              &curr_meta->cctrl));

  // The local timestamp must never be smaller than the one being acked
  if (ENABLE_ASSERTIONS)
    assert(!timestamp_is_smaller(lock_free_read_meta.cctrl.ts.version,
                                 lock_free_read_meta.cctrl.ts.tie_breaker_id,
                                 ack_ptr->ts.version,
                                 ack_ptr->ts.tie_breaker_id));

  if (timestamp_is_equal(ack_ptr->ts.version, ack_ptr->ts.tie_breaker_id,
                         lock_free_read_meta.cctrl.ts.version,
                         lock_free_read_meta.cctrl.ts.tie_breaker_id)) {
    /// Lock and check again if ack TS == last local write
    /// (version - 1 is compared, matching the convention used after
    /// cctrl_lock in cr_exec_inv)
    cctrl_lock(&curr_meta->cctrl);
    if (timestamp_is_equal(ack_ptr->ts.version, ack_ptr->ts.tie_breaker_id,
                           curr_meta->cctrl.ts.version - 1,
                           curr_meta->cctrl.ts.tie_breaker_id)) {
      switch (curr_meta->state) {
        case INVALID_STATE:
          curr_meta->state = VALID_STATE;
          ack_ptr->opcode = ST_LAST_ACK_SUCCESS;
          break;
        case VALID_STATE:
        default:
          assert(0);
      }
    }
    cctrl_unlock_dec_version(&curr_meta->cctrl);
  }

  if (machine_id == ack_ptr->initiator)
    cr_complete_local_write(read_write_op, ack_ptr->buff_idx,
                            (uint64_t*)&ack_ptr->key);

  // Set unconditionally, also when the timestamps did not match above
  ack_ptr->opcode = ST_LAST_ACK_SUCCESS;
}

//////////// Dispatcher functions

// Routes ptr to the skip predicate that matches cr_type and returns its
// result (1 = skip the element, 0 = process it).
// An unknown cr_type trips assert(0); when assertions are compiled out the
// element is reported as skipped, so the function never falls off its end
// without returning a value.
static inline uint8_t
cr_skip_dispatcher(enum cr_type_t cr_type, void* ptr)
{
  switch (cr_type) {
    case Local_ops:
      return cr_skip_op(ptr);
    case Invs:
      return cr_skip_inv(ptr);
    case Acks:
      return cr_skip_ack(ptr);
    case Remote_reads:
      return cr_skip_remote_reads(ptr);
    case Remote_writes:
      return cr_skip_remote_writes(ptr);
    default:
      assert(0);
      return 1;  // unknown type: never process the element
  }
}

// Runs the per-type sanity checks on ptr when ENABLE_ASSERTIONS is set.
// Only invalidations have dedicated checks; the remaining known types are
// accepted as-is and an unknown type trips assert(0).
static inline void
cr_assertions_dispatcher(enum cr_type_t cr_type, void* ptr)
{
  if (!ENABLE_ASSERTIONS) return;

  switch (cr_type) {
    case Invs:
      cr_assertions_inv(ptr);
      break;
    case Acks:
    case Remote_writes:
    case Local_ops:
    case Remote_reads:
      // nothing to verify for these types
      break;
    default:
      assert(0);
  }
}

// Routes an op / message to the exec function that matches cr_type.
// idx is only forwarded to local ops and read_write_op only to invs and acks;
// an unknown cr_type trips assert(0).
static inline void
cr_exec_dispatcher(enum cr_type_t cr_type, void* op_ptr, struct mica_op* kv_ptr,
                   uint8_t idx, spacetime_op_t* read_write_op)
{
  if (cr_type == Invs)
    cr_exec_inv(op_ptr, kv_ptr, read_write_op);
  else if (cr_type == Acks)
    cr_exec_ack(op_ptr, kv_ptr, read_write_op);
  else if (cr_type == Remote_writes)
    cr_exec_write(op_ptr, kv_ptr);
  else if (cr_type == Local_ops)
    cr_exec_op(op_ptr, kv_ptr, idx);
  else if (cr_type == Remote_reads)
    cr_exec_remote_reads(op_ptr, kv_ptr);
  else
    assert(0);
}

//////////////////////////////////////////////////
//////////// Batch function //////////////////////
// Executes a batch of op_num elements (each sizeof_op_elem bytes, laid out
// back to back in op_array) of kind cr_type against the KVS.
// The batch is processed in three passes so that memory accesses can be
// prefetched: (1) compute and prefetch the bucket of every key, (2) find the
// slot with a matching tag and prefetch its log entry, (3) compare the full
// key and dispatch to the matching exec function. Elements for which
// cr_skip_dispatcher returns non-zero are ignored in all passes; elements
// whose key is not found get state ST_MISS.
// read_write_op is the buffer of local ops, handed to the inv / ack exec
// functions.
void
cr_batch_ops_to_KVS(enum cr_type_t cr_type, uint8_t* op_array, int op_num,
                    uint16_t sizeof_op_elem, spacetime_op_t* read_write_op)
{
  // NOTE(review): the two SPACETIME_DEBUG blocks below reference names that
  // are not declared in this function (`resp`, `I`) -- they look stale and
  // would presumably not compile if enabled; confirm before turning them on
#if SPACETIME_DEBUG == 1
  // assert(kv.hash_table != NULL);
  assert(op_array != NULL);
  assert(op_num > 0 && op_num <= CACHE_BATCH_SIZE);
  assert(resp != NULL);
#endif

#if SPACETIME_DEBUG == 2
  for (I = 0; I < op_num; I++)
    mica_print_op(&(*op_array)[I]);
#endif
  // Per-element lookup state; NOTE(review): op_num is assumed to be
  // <= CR_MAX_BATCH_SIZE, nothing in this function enforces it
  int key_in_store[CR_MAX_BATCH_SIZE];  // Is this key in the datastore?
  unsigned int tag[CR_MAX_BATCH_SIZE];
  uint64_t bkt[CR_MAX_BATCH_SIZE];
  struct mica_bkt* bkt_ptr[CR_MAX_BATCH_SIZE];
  struct mica_op* kv_ptr[CR_MAX_BATCH_SIZE];  // Ptr to KV item in log

  if (ENABLE_ASSERTIONS) assert(read_write_op != NULL || cr_type != Acks);

  // We first lookup the key in the datastore.
  // The first two @I loops work for both GETs and PUTs.
  // Pass 1: locate and prefetch the bucket of every non-skipped element
  for (int I = 0; I < op_num; I++) {
    spacetime_op_meta_t* op_ptr =
        (spacetime_op_meta_t*)&op_array[sizeof_op_elem * I];
    cr_assertions_dispatcher(cr_type, op_ptr);
    if (cr_skip_dispatcher(cr_type, op_ptr)) continue;

    bkt[I] = op_ptr->key.bkt & kv.hash_table.bkt_mask;
    bkt_ptr[I] = &kv.hash_table.ht_index[bkt[I]];
    __builtin_prefetch(bkt_ptr[I], 0, 0);
    tag[I] = op_ptr->key.tag;

    key_in_store[I] = 0;
    kv_ptr[I] = NULL;
  }

  // Pass 2: search the 8 slots of the bucket for a matching tag
  for (int I = 0; I < op_num; I++) {
    spacetime_op_meta_t* op_ptr =
        (spacetime_op_meta_t*)&op_array[sizeof_op_elem * I];
    if (cr_skip_dispatcher(cr_type, op_ptr)) continue;
    for (int j = 0; j < 8; j++) {
      if (bkt_ptr[I]->slots[j].in_use == 1 &&
          bkt_ptr[I]->slots[j].tag == tag[I]) {
        uint64_t log_offset =
            bkt_ptr[I]->slots[j].offset & kv.hash_table.log_mask;
        // We can interpret the log entry as mica_op, even though it
        // may not contain the full MICA_MAX_VALUE value.
        kv_ptr[I] = (struct mica_op*)&kv.hash_table.ht_log[log_offset];

        // Small values (1--64 bytes) can span 2 cache lines
        __builtin_prefetch(kv_ptr[I], 0, 0);
        __builtin_prefetch((uint8_t*)kv_ptr[I] + 64, 0, 0);

        // Detect if the head has wrapped around for this index entry
        if (kv.hash_table.log_head - bkt_ptr[I]->slots[j].offset >=
            kv.hash_table.log_cap) {
          kv_ptr[I] = NULL;  // If so, we mark it "not found"
        }

        break;
      }
    }
  }

  // Pass 3: confirm the key in the log entry and execute the element
  for (int I = 0; I < op_num; I++) {
    spacetime_op_meta_t* op_ptr =
        (spacetime_op_meta_t*)&op_array[sizeof_op_elem * I];
    if (cr_skip_dispatcher(cr_type, op_ptr)) continue;
    if (kv_ptr[I] != NULL) {
      // We had a tag match earlier. Now compare log entry.
      long long* key_ptr_log = (long long*)kv_ptr[I];
      long long* key_ptr_req = (long long*)&op_ptr->key;

      if (key_ptr_log[1] == key_ptr_req[0]) {  // Key Found 8 Byte keys
        key_in_store[I] = 1;
        cr_exec_dispatcher(cr_type, op_ptr, kv_ptr[I], (uint8_t)I,
                           read_write_op);
      }
    }

    if (key_in_store[I] ==
        0)  // KVS miss --> We get here if either tag or log key match failed
      op_ptr->state = ST_MISS;
  }

  // After a batch of acks every local op must be a GET or be in one of the
  // states listed below
  if (ENABLE_ASSERTIONS)
    if (cr_type == Acks)
      for (int I = 0; I < max_batch_size; I++)
        assert(read_write_op[I].op_meta.opcode == ST_OP_GET ||
               read_write_op[I].op_meta.state == ST_MISS ||
               read_write_op[I].op_meta.state == ST_EMPTY ||
               read_write_op[I].op_meta.state == ST_PUT_STALL ||
               read_write_op[I].op_meta.state == ST_PUT_SUCCESS ||
               read_write_op[I].op_meta.state == ST_PUT_COMPLETE ||
               read_write_op[I].op_meta.state == ST_IN_PROGRESS_PUT ||
               read_write_op[I].op_meta.state ==
                   ST_OP_MEMBERSHIP_CHANGE ||  /// TODO check this
               read_write_op[I].op_meta.state == ST_IN_PROGRESS_REPLAY);
}


================================================
FILE: src/CR/cr_worker.c
================================================
#include <spacetime.h>
#include <time.h>
#include "../../include/utils/concur_ctrl.h"
#include "inline-util.h"
#include "util.h"

///
#include "../../include/utils/time_rdtsc.h"
#include "../../include/wings/wings.h"
///

// Id of the first node (head) of the chain: always 0
static inline uint8_t
head_id(void)
{
  const uint8_t chain_head = 0;
  return chain_head;
}

// Id of the last node (tail) of the chain: machine_num - 1
static inline uint8_t
tail_id(void)
{
  const uint8_t chain_tail = (uint8_t)(machine_num - 1);
  return chain_tail;
}

// Id of the node that follows this one in the chain (wraps around after the
// last node)
static inline uint8_t
next_node_in_chain(void)
{
  const uint8_t successor = (uint8_t)((machine_id + 1) % machine_num);
  return successor;
}

// Id of this node's predecessor in the chain. Node 0 wraps around to the tail.
static inline uint8_t
prev_node_in_chain(void)
{
  if (machine_id == 0) return tail_id();

  return (uint8_t)(machine_id - 1);
}

// Destination selector used when forwarding received INVs.
// Returns the id of the next node in the chain when the INV's opcode is
// ST_INV_SUCCESS, and -1 otherwise (presumably "skip this element", as the
// function name suggests).
int
inv_skip_or_fwd_to_next_node(uint8_t* req)
{
  const spacetime_inv_t* inv = (const spacetime_inv_t*)req;

  if (inv->op_meta.opcode != ST_INV_SUCCESS) return -1;

  // INVs are only ever forwarded to the successor
  return next_node_in_chain();
}

// Post-send hook for forwarded INVs: marks the received INV slot as
// ST_EMPTY. Any opcode other than ST_INV_SUCCESS or ST_OP_MEMBERSHIP_CHANGE
// is unexpected here and trips an assertion.
void
inv_fwd_modify_elem_after_send(uint8_t* req)
{
  spacetime_inv_t* inv = (spacetime_inv_t*)req;
  const int can_be_emptied =
      inv->op_meta.opcode == ST_INV_SUCCESS ||
      inv->op_meta.opcode == ST_OP_MEMBERSHIP_CHANGE;

  if (!can_be_emptied) {
    assert(0);
    return;
  }

  inv->op_meta.opcode = ST_EMPTY;
}

void
inv_fwd_copy_and_modify_elem(uint8_t* msg_to_send, uint8_t* triggering_req)
{
  spacetime_inv_t* inv_recv = (spacetime_inv_t*)triggering_req;
  spacetime_inv_t* inv_to_send = (spacetime_inv_t*)msg_to_send;

  // Copy op to inv and set opcode
  memcpy(inv_to_send, inv_recv, sizeof(spacetime_inv_t));
  inv_to_send->op_meta.opcode = ST_OP_INV;
}

// Destination selector used when the head turns local ops into INVs.
// Returns the next node in the chain for ops in state ST_PUT_SUCCESS and -1
// for every other state (presumably "skip", as the name suggests).
int
inv_skip_or_get_sender_id(uint8_t* req)
{
  spacetime_op_t* op = (spacetime_op_t*)req;

  if (ENABLE_ASSERTIONS) {
    assert(is_input_code(op->op_meta.opcode));
    assert(is_response_code(op->op_meta.state) ||
           is_bucket_state_code(op->op_meta.state));
  }

  if (op->op_meta.state != ST_PUT_SUCCESS) return -1;

  // INVs are only sent to the successor in the chain
  return next_node_in_chain();
}

// Post-send hook for local writes: moves the op from ST_PUT_SUCCESS to
// ST_IN_PROGRESS_PUT. Any other state is unexpected and trips an assertion.
void
inv_modify_elem_after_send(uint8_t* req)
{
  spacetime_op_t* op = (spacetime_op_t*)req;

  if (op->op_meta.state != ST_PUT_SUCCESS) {
    assert(0);
    return;
  }

  op->op_meta.state = ST_IN_PROGRESS_PUT;
}

// Builds an INV out of a local op (asserted to run on the head only): copies
// sizeof(spacetime_inv_t) bytes from the op, then stamps the INV opcode and
// this machine as the initiator.
void
inv_copy_and_modify_elem(uint8_t* msg_to_send, uint8_t* triggering_req)
{
  if (ENABLE_ASSERTIONS) assert(machine_id == head_id());

  spacetime_inv_t* out_inv = (spacetime_inv_t*)msg_to_send;

  memcpy(out_inv, triggering_req, sizeof(*out_inv));

  out_inv->op_meta.initiator = (uint8_t)machine_id;
  out_inv->op_meta.opcode = ST_OP_INV;
}

// Destination selector for remote writes issued by non-head nodes.
// Returns the head's id for ops in state ST_PUT_SUCCESS, -1 for all others.
int
remote_write_skip_or_get_sender_id(uint8_t* req)
{
  spacetime_op_t* op = (spacetime_op_t*)req;

  if (ENABLE_ASSERTIONS) {
    assert(is_input_code(op->op_meta.opcode));
    assert(is_response_code(op->op_meta.state) ||
           is_bucket_state_code(op->op_meta.state));
  }

  if (op->op_meta.state != ST_PUT_SUCCESS) return -1;

  // remote writes always go to the head
  return head_id();
}

// Builds a remote-write message out of a local op (asserted to run on a
// non-head node). The op bytes are copied, then the message is reset to a
// fresh PUT (state ST_NEW, opcode ST_OP_PUT) carrying this machine as
// initiator in both initiator fields.
void
remote_write_copy_and_modify_elem(uint8_t* msg_to_send, uint8_t* triggering_req)
{
  if (ENABLE_ASSERTIONS) assert(machine_id != head_id());

  spacetime_inv_t* out_msg = (spacetime_inv_t*)msg_to_send;
  const uint8_t self_id = (uint8_t)machine_id;

  memcpy(out_msg, triggering_req, sizeof(*out_msg));

  out_msg->op_meta.state = ST_NEW;
  out_msg->op_meta.opcode = ST_OP_PUT;
  out_msg->initiator = self_id;
  out_msg->op_meta.initiator = self_id;
}

// Destination selector used by the head when turning received remote writes
// into INVs. Returns the next node in the chain for writes in state
// ST_PUT_SUCCESS, -1 for all others.
int
remote_write_head_skip_or_get_sender_id(uint8_t* req)
{
  spacetime_op_t* rem_write = (spacetime_op_t*)req;

  if (ENABLE_ASSERTIONS) {
    assert(machine_id == head_id());
    assert(is_input_code(rem_write->op_meta.opcode) ||
           rem_write->op_meta.opcode == ST_EMPTY);
    assert(is_response_code(rem_write->op_meta.state) ||
           is_bucket_state_code(rem_write->op_meta.state));
  }

  if (rem_write->op_meta.state != ST_PUT_SUCCESS) return -1;

  // the resulting INV is sent from the head to its successor
  return next_node_in_chain();
}

// Builds the INV for a remote write received by the head: copies the op
// bytes, sets the INV opcode and carries over the original initiator stored
// in the received op.
void
remote_write_head_copy_and_modify_elem(uint8_t* msg_to_send,
                                       uint8_t* triggering_req)
{
  const spacetime_op_t* rem_write = (const spacetime_op_t*)triggering_req;
  spacetime_inv_t* out_inv = (spacetime_inv_t*)msg_to_send;

  memcpy(out_inv, rem_write, sizeof(*out_inv));

  out_inv->op_meta.opcode = ST_OP_INV;
  out_inv->op_meta.initiator = rem_write->initiator;
}

// Post-send hook for remote writes on the head: once the INV is out, the
// write moves from ST_PUT_SUCCESS to ST_SEND_CRD. Any other state is
// unexpected and trips an assertion.
void
remote_write_head_modify_elem_after_send(uint8_t* req)
{
  spacetime_op_t* rem_write = (spacetime_op_t*)req;

  if (rem_write->op_meta.state != ST_PUT_SUCCESS) {
    assert(0);
    return;
  }

  rem_write->op_meta.state = ST_SEND_CRD;
}

// Post-send hook for forwarded ACKs: the received ACK slot (asserted to hold
// ST_LAST_ACK_SUCCESS when assertions are on) is marked ST_EMPTY.
void
ack_fwd_modify_elem_after_send(uint8_t* req)
{
  spacetime_ack_t* ack = (spacetime_ack_t*)req;

  if (ENABLE_ASSERTIONS) {
    assert(ack->opcode == ST_LAST_ACK_SUCCESS);
  }

  ack->opcode = ST_EMPTY;
}

// Destination selector used when forwarding received ACKs up the chain.
//  - ST_EMPTY slots return -1 untouched.
//  - ST_ACK_SUCCESS slots are emptied here and return -1 (not forwarded).
//  - Remaining slots (asserted to be ST_LAST_ACK_SUCCESS) are forwarded to
//    the previous node in the chain.
int
ack_fwd_skip_or_get_sender_id(uint8_t* req)
{
  spacetime_ack_t* ack = (spacetime_ack_t*)req;

  if (ack->opcode == ST_EMPTY) return -1;

  if (ack->opcode == ST_ACK_SUCCESS) {
    ack->opcode = ST_EMPTY;
    return -1;
  }

  if (ENABLE_ASSERTIONS) assert(ack->opcode == ST_LAST_ACK_SUCCESS);

  return prev_node_in_chain();
}

// Builds the ACK to forward: a byte copy of the received ACK with its opcode
// rewritten to ST_OP_ACK.
void
ack_fwd_copy_and_modify_elem(uint8_t* msg_to_send, uint8_t* triggering_req)
{
  spacetime_ack_t* out_ack = (spacetime_ack_t*)msg_to_send;

  memcpy(out_ack, triggering_req, sizeof(*out_ack));
  out_ack->opcode = ST_OP_ACK;
}

// Destination selector used when ACKs are generated from received INVs.
// Never skips: every element is addressed to the previous node in the chain.
// With assertions on, the INV must be ST_INV_SUCCESS or ST_EMPTY.
int
ack_skip_or_get_sender_id(uint8_t* req)
{
  if (ENABLE_ASSERTIONS) {
    const spacetime_inv_t* inv = (const spacetime_inv_t*)req;
    assert(inv->op_meta.opcode == ST_INV_SUCCESS ||
           inv->op_meta.opcode == ST_EMPTY);
  }

  return prev_node_in_chain();
}

// Post-send hook for ACKs generated from INVs: marks the INV slot ST_EMPTY.
// Only ST_INV_SUCCESS and ST_OP_MEMBERSHIP_CHANGE are expected here; any
// other opcode trips an assertion.
void
ack_modify_elem_after_send(uint8_t* req)
{
  spacetime_inv_t* inv = (spacetime_inv_t*)req;
  const int can_be_emptied =
      inv->op_meta.opcode == ST_INV_SUCCESS ||
      inv->op_meta.opcode == ST_OP_MEMBERSHIP_CHANGE;

  if (!can_be_emptied) {
    assert(0);
    return;
  }

  inv->op_meta.opcode = ST_EMPTY;
}

// Builds an ACK out of a received INV: copies sizeof(spacetime_ack_t) bytes
// from the INV, then sets the ACK opcode and carries over the INV's buff_idx.
void
ack_copy_and_modify_elem(uint8_t* msg_to_send, uint8_t* triggering_req)
{
  const spacetime_inv_t* inv = (const spacetime_inv_t*)triggering_req;
  spacetime_ack_t* out_ack = (spacetime_ack_t*)msg_to_send;

  memcpy(out_ack, inv, sizeof(*out_ack));

  out_ack->opcode = ST_OP_ACK;
  out_ack->buff_idx = inv->buff_idx;
}

// Destination selector for remote-write credits issued by the head.
// Returns the write's initiator when the slot is in state ST_SEND_CRD and -1
// for every other state.
int
rem_write_crd_skip_or_get_sender_id(uint8_t* req)
{
  spacetime_op_t* rem_write = (spacetime_op_t*)req;

  if (ENABLE_ASSERTIONS)
    assert(rem_write->op_meta.state == ST_EMPTY ||
           rem_write->op_meta.state == ST_SEND_CRD ||
           rem_write->op_meta.state == ST_PUT_STALL ||
           rem_write->op_meta.state == ST_PUT_SUCCESS);

  if (rem_write->op_meta.state != ST_SEND_CRD) return -1;

  return rem_write->initiator;
}

// Post-send hook for remote-write credits: frees the remote-write slot by
// moving it from ST_SEND_CRD to ST_EMPTY. Any other state trips an assertion.
void
rem_write_crd_modify_elem_after_send(uint8_t* req)
{
  spacetime_op_t* rem_write = (spacetime_op_t*)req;

  if (rem_write->op_meta.state != ST_SEND_CRD) {
    assert(0);
    return;
  }

  rem_write->op_meta.state = ST_EMPTY;
}

// Destination selector for early INV credits. Returns the previous node in
// the chain for INVs with opcode ST_INV_SUCCESS and -1 otherwise.
int
inv_crd_skip_or_get_sender_id(uint8_t* req)
{
  spacetime_inv_t* inv = (spacetime_inv_t*)req;

  if (ENABLE_ASSERTIONS)
    assert(inv->op_meta.opcode == ST_EMPTY ||
           inv->op_meta.opcode == ST_INV_SUCCESS);

  if (inv->op_meta.opcode != ST_INV_SUCCESS) return -1;

  return prev_node_in_chain();
}

// Post-send hook for early INV credits. It does not modify the element; with
// assertions enabled it only checks that the INV is still ST_INV_SUCCESS.
void
inv_crd_modify_elem_after_send(uint8_t* req)
{
  if (!ENABLE_ASSERTIONS) return;

  spacetime_inv_t* inv = (spacetime_inv_t*)req;
  assert(inv->op_meta.opcode == ST_INV_SUCCESS);
}

// Destination selector for remote reads issued by non-tail nodes.
// Returns the tail's id for ops in state ST_GET_STALL, -1 for all others.
int
remote_read_skip_or_get_sender_id(uint8_t* req)
{
  spacetime_op_t* op = (spacetime_op_t*)req;

  if (ENABLE_ASSERTIONS) {
    assert(is_input_code(op->op_meta.opcode));
    assert(is_response_code(op->op_meta.state) ||
           is_bucket_state_code(op->op_meta.state));
  }

  if (op->op_meta.state != ST_GET_STALL) return -1;

  // remote reads are always served by the tail
  return tail_id();
}

// Post-send hook for remote reads: moves the op from ST_GET_STALL to
// ST_IN_PROGRESS_GET. Any other state trips an assertion.
void
remote_read_modify_elem_after_send(uint8_t* req)
{
  spacetime_op_t* op = (spacetime_op_t*)req;

  if (op->op_meta.state != ST_GET_STALL) {
    assert(0);
    return;
  }

  op->op_meta.state = ST_IN_PROGRESS_GET;
}

void
remote_read_copy_and_modify_elem(uint8_t* msg_to_send, uint8_t* triggering_req)
{
  if (ENABLE_ASSERTIONS) assert(machine_id != tail_id());

  spacetime_op_t* op = (spacetime_op_t*)triggering_req;
  spacetime_op_t* op_to_send = (spacetime_op_t*)msg_to_send;

  // Copy op to inv, set sender and opcode
  memcpy(op_to_send, op, sizeof(spacetime_op_t));

  op_to_send->op_meta.state = ST_NEW;
  op_to_send->op_meta.opcode = ST_OP_GET;
  op_to_send->initiator = (uint8_t)machine_id;
  op_to_send->op_meta.initiator = (uint8_t)machine_id;
}

// Destination selector for remote-read responses. Never skips: the response
// is always addressed to the initiator recorded in the request. With
// assertions on, the request must be a completed GET (opcode ST_OP_GET, state
// ST_GET_COMPLETE); an unexpected opcode is printed before the assert fires.
int
remote_read_resp_skip_or_get_sender_id(uint8_t* req)
{
  spacetime_op_t* read_req = (spacetime_op_t*)req;

  if (ENABLE_ASSERTIONS) {
    const int is_get = read_req->op_meta.opcode == ST_OP_GET;

    if (!is_get) {
      printf("Opcode: %d, state: %d\n", read_req->op_meta.opcode,
             read_req->op_meta.state);
      printf("Opcode: %s, state: %s\n", code_to_str(read_req->op_meta.opcode),
             code_to_str(read_req->op_meta.state));
    }
    assert(read_req->op_meta.opcode == ST_OP_GET);
    assert(read_req->op_meta.state == ST_GET_COMPLETE);
  }

  // reply to whoever initiated the remote read
  return read_req->initiator;
}

// Post-send hook for remote-read responses: frees the slot by moving it from
// ST_GET_COMPLETE to ST_EMPTY. Any other state is printed and then trips an
// assertion.
void
remote_read_resp_modify_elem_after_send(uint8_t* req)
{
  spacetime_op_t* read_req = (spacetime_op_t*)req;

  if (read_req->op_meta.state != ST_GET_COMPLETE) {
    printf("St_opcode: %s\n", code_to_str(read_req->op_meta.state));
    assert(0);
    return;
  }

  read_req->op_meta.state = ST_EMPTY;
}

// Builds a remote-read response (asserted to run on the tail only): the
// response is an unmodified byte copy of the served read request.
void
remote_read_resp_copy_and_modify_elem(uint8_t* msg_to_send,
                                      uint8_t* triggering_req)
{
  if (ENABLE_ASSERTIONS) assert(machine_id == tail_id());

  spacetime_op_t* out_op = (spacetime_op_t*)msg_to_send;

  memcpy(out_op, triggering_req, sizeof(*out_op));
}

// Debug helper: prints state and the first 8 key bytes (as an integer) of
// each of the first max_batch_size entries of `ops`; on the head node it
// prints the same for `remote_writes`.
void
print_ops_and_remote_write_ops(spacetime_op_t* ops,
                               spacetime_op_t* remote_writes)
{
  int slot;

  for (slot = 0; slot < max_batch_size; ++slot) {
    printf("ops[%d]: state-> %s, key-> %lu \n", slot,
           code_to_str(ops[slot].op_meta.state),
           *((uint64_t*)&ops[slot].op_meta.key));
  }

  // the remote-writes buffer is only dumped on the head
  if (machine_id != head_id()) return;

  for (slot = 0; slot < max_batch_size; ++slot) {
    printf("remote_writes[%d]: state-> %s, key-> %lu \n", slot,
           code_to_str(remote_writes[slot].op_meta.state),
           *((uint64_t*)&remote_writes[slot].op_meta.key));
  }
}

// Debug helper: prints one counter per channel (INV, ACK, remote writes and,
// when CR_ENABLE_REMOTE_READS is set, remote reads) on a single line.
//
// NOTE(review): the line is labelled "CRD STALLs" but the value printed is
// stats.send_total_msgs -- confirm whether a dedicated stall counter was
// intended here.
void
print_total_stalls_due_to_credits(ud_channel_t* inv_ud_c,
                                  ud_channel_t* ack_ud_c,
                                  ud_channel_t* rem_writes_ud_c,
                                  ud_channel_t* rem_reads_ud_c)
{
  // The counters are cast and printed with %llu so the format specifier
  // always matches the argument width. The first part carries no trailing
  // comma: the optional remote-reads part supplies its own ", " separator.
  colored_printf(GREEN, "$$$ CRD STALLs : %s %llu, %s %llu, %s %llu",
                 inv_ud_c->qp_name,
                 (unsigned long long)inv_ud_c->stats.send_total_msgs,
                 ack_ud_c->qp_name,
                 (unsigned long long)ack_ud_c->stats.send_total_msgs,
                 rem_writes_ud_c->qp_name,
                 (unsigned long long)rem_writes_ud_c->stats.send_total_msgs);

  if (CR_ENABLE_REMOTE_READS)
    colored_printf(GREEN, ", %s %llu\n", rem_reads_ud_c->qp_name,
                   (unsigned long long)rem_reads_ud_c->stats.send_total_msgs);
  else
    printf("\n");
}

// Debug helper: prints, per channel, the total number of messages sent and
// received so far, followed by the currently available INV / ACK /
// remote-write credits.
//
// The INV-credit channel is only printed when CR_ENABLE_EARLY_INV_CRDS is set
// and the remote-read channels only when CR_ENABLE_REMOTE_READS is set.
// Message counters are cast and printed with %llu so the format specifier
// always matches the argument width.
void
print_total_send_recv_msgs_n_credits(
    ud_channel_t* inv_ud_c, ud_channel_t* inv_crd_ud_c, ud_channel_t* ack_ud_c,
    ud_channel_t* rem_writes_ud_c, ud_channel_t* crd_ud_c,
    ud_channel_t* rem_reads_ud_c, ud_channel_t* rem_read_resp_ud_c)
{
  // Sends
  colored_printf(GREEN, "--> Total Send: %s %llu", inv_ud_c->qp_name,
                 (unsigned long long)inv_ud_c->stats.send_total_msgs);
  if (CR_ENABLE_EARLY_INV_CRDS)
    colored_printf(GREEN, ", %s %llu", inv_crd_ud_c->qp_name,
                   (unsigned long long)inv_crd_ud_c->stats.send_total_msgs);
  colored_printf(GREEN, ", %s %llu, %s %llu, %s %llu", ack_ud_c->qp_name,
                 (unsigned long long)ack_ud_c->stats.send_total_msgs,
                 rem_writes_ud_c->qp_name,
                 (unsigned long long)rem_writes_ud_c->stats.send_total_msgs,
                 crd_ud_c->qp_name,
                 (unsigned long long)crd_ud_c->stats.send_total_msgs);
  if (CR_ENABLE_REMOTE_READS)
    colored_printf(
        GREEN, ", %s %llu, %s %llu\n", rem_reads_ud_c->qp_name,
        (unsigned long long)rem_reads_ud_c->stats.send_total_msgs,
        rem_read_resp_ud_c->qp_name,
        (unsigned long long)rem_read_resp_ud_c->stats.send_total_msgs);
  else
    printf("\n");

  // Receives
  colored_printf(GREEN, "vvv Total Recv: %s %llu", inv_ud_c->qp_name,
                 (unsigned long long)inv_ud_c->stats.recv_total_msgs);
  if (CR_ENABLE_EARLY_INV_CRDS)
    colored_printf(GREEN, ", %s %llu", inv_crd_ud_c->qp_name,
                   (unsigned long long)inv_crd_ud_c->stats.recv_total_msgs);
  colored_printf(GREEN, ", %s %llu, %s %llu, %s %llu", ack_ud_c->qp_name,
                 (unsigned long long)ack_ud_c->stats.recv_total_msgs,
                 rem_writes_ud_c->qp_name,
                 (unsigned long long)rem_writes_ud_c->stats.recv_total_msgs,
                 crd_ud_c->qp_name,
                 (unsigned long long)crd_ud_c->stats.recv_total_msgs);
  if (CR_ENABLE_REMOTE_READS)
    colored_printf(
        GREEN, ", %s %llu, %s %llu\n", rem_reads_ud_c->qp_name,
        (unsigned long long)rem_reads_ud_c->stats.recv_total_msgs,
        rem_read_resp_ud_c->qp_name,
        (unsigned long long)rem_read_resp_ud_c->stats.recv_total_msgs);
  else
    printf("\n");

  // Credits: INV/ACK credits are reported towards the successor when running
  // on the head and towards the head otherwise; remote-write credits are
  // always reported towards the head.
  uint8_t remote_node =
      (uint8_t)(machine_id == head_id() ? next_node_in_chain() : head_id());
  printf("Inv credits: %d, ack credits: %d, remote_write_crds: %d\n",
         (int)inv_ud_c->credits_per_channels[remote_node],
         (int)ack_ud_c->credits_per_channels[remote_node],
         (int)rem_writes_ud_c->credits_per_channels[head_id()]);
}

// Completes local GETs whose remote-read responses have arrived.
// Each response's buff_idx selects the entry in `ops` it answers; that entry
// (asserted to be an in-progress GET on the same key when assertions are on)
// is moved to ST_GET_COMPLETE. An entry whose opcode is not ST_OP_GET trips
// an assertion.
static inline void
cr_complete_local_reads(spacetime_op_t* remote_reads_resps,
                        uint16_t remote_read_resps_polled, spacetime_op_t* ops)
{
  for (int r = 0; r < remote_read_resps_polled; ++r) {
    spacetime_op_t* resp = &remote_reads_resps[r];
    uint16_t slot = resp->buff_idx;
    spacetime_op_t* pending = &ops[slot];

    if (ENABLE_ASSERTIONS) {
      assert(pending->op_meta.state == ST_IN_PROGRESS_GET);
      assert(((uint64_t*)&pending->op_meta.key)[0] ==
             ((uint64_t*)&resp->op_meta.key)[0]);
    }

    if (pending->op_meta.opcode != ST_OP_GET) {
      assert(0);
      continue;
    }

    pending->op_meta.state = ST_GET_COMPLETE;
  }
}

// Scans free_slot_array over the half-open range [start_pos, end_pos) and
// returns the index of the first entry equal to 1, or -1 when the range holds
// no such entry. With assertions on, the range must be non-empty.
static inline int
get_first_free_slot(const uint8_t* free_slot_array, uint16_t start_pos,
                    uint16_t end_pos)
{
  if (ENABLE_ASSERTIONS) assert(end_pos > start_pos);

  int pos = start_pos;
  while (pos < end_pos) {
    if (free_slot_array[pos] == 1) return pos;
    ++pos;
  }

  return -1;
}

// Compacts the remote-writes buffer: every pending write (ST_PUT_STALL or
// ST_PUT_SUCCESS) is moved to the lowest free slot before it, so that after
// the call all pending writes sit at the front of the buffer and all
// ST_EMPTY slots at the back (checked when assertions are on).
// Returns the number of ST_EMPTY slots found among the first max_batch_size
// entries.
static inline uint16_t
cr_move_stalled_writes_to_top_n_return_free_space(spacetime_op_t* remote_writes)
{
  uint8_t is_free[MAX_BATCH_KVS_OPS_SIZE] = {0};
  uint16_t empty_cnt = 0;
  // lower bound for the free-slot search; avoids rescanning slots that are
  // already known to be occupied
  uint16_t search_from = 0;

  for (int slot = 0; slot < max_batch_size; ++slot) {
    spacetime_op_t* cur = &remote_writes[slot];

    if (ENABLE_ASSERTIONS)
      assert(cur->op_meta.state == ST_EMPTY ||
             cur->op_meta.state == ST_PUT_STALL ||
             cur->op_meta.state == ST_PUT_SUCCESS);

    if (cur->op_meta.state == ST_EMPTY) {
      empty_cnt++;
      is_free[slot] = 1;
      continue;
    }

    // nothing to do while no free slot has been seen yet
    if (empty_cnt == 0) continue;

    if (cur->op_meta.state != ST_PUT_STALL &&
        cur->op_meta.state != ST_PUT_SUCCESS)
      continue;

    int dst = get_first_free_slot(is_free, search_from, (uint16_t)slot);
    if (dst < 0) continue;

    // the write moves into dst, so dst becomes occupied and slot becomes free
    is_free[slot] = 1;
    is_free[dst] = 0;
    search_from = (uint16_t)dst;

    memcpy(&remote_writes[dst], cur, sizeof(spacetime_op_t));

    cur->op_meta.state = ST_EMPTY;
    cur->op_meta.opcode = ST_EMPTY;
  }

  if (ENABLE_ASSERTIONS) {
    const int occupied = max_batch_size - empty_cnt;

    for (int slot = 0; slot < max_batch_size; ++slot) {
      if (slot < occupied)
        assert(remote_writes[slot].op_meta.state == ST_PUT_STALL ||
               remote_writes[slot].op_meta.state == ST_PUT_SUCCESS);
      else
        assert(remote_writes[slot].op_meta.state == ST_EMPTY);
    }
  }

  return empty_cnt;
}

// Debug helper: once the worker has completed at least one loop iteration,
// checks that every entry of `ops` holds either ST_OP_PUT or ST_OP_GET.
// Offending entries are printed (tagged with the caller-supplied line_no)
// before the assertion fires.
static inline void
debugg(spacetime_op_t* ops, uint16_t worker_lid, int line_no)
{
  if (!(w_stats[worker_lid].total_loops > 0)) return;

  for (int slot = 0; slot < max_batch_size; ++slot) {
    const int is_put_or_get = ops[slot].op_meta.opcode == ST_OP_PUT ||
                              ops[slot].op_meta.opcode == ST_OP_GET;

    if (!is_put_or_get)
      printf("Line[%d]--> Op[%d]: %s, loop iter: %llu\n", line_no, slot,
             code_to_str(ops[slot].op_meta.opcode),
             w_stats[worker_lid].total_loops);

    assert(ops[slot].op_meta.opcode == ST_OP_PUT ||
           ops[slot].op_meta.opcode == ST_OP_GET);
  }
}

// Entry point of a chain-replication (CR) worker thread.
//
// `arg` points to a struct thread_params; only its `id` field (the local
// worker id) is read here.
//
// The function
//   1. checks the run-time configuration against the compile-time limits,
//   2. initializes the Wings UD channels (INV, ACK, remote writes and, when
//      CR_ENABLE_REMOTE_READS is set, remote reads / read responses) and
//      sets up their QPs and receives,
//   3. allocates the intermediate op buffers and loads the trace,
//   4. loops forever; each iteration refills and batches local ops to the
//      KVS and then, depending on this node's position in the chain (head /
//      middle / tail), issues, forwards and polls INVs, ACKs, remote writes,
//      remote reads and credits.
//
// The main loop never exits, so the trailing `return NULL` is not reached.
void*
run_worker(void* arg)
{
  // Preconditions of the CR worker
  assert(rmw_ratio == 0);
  assert(is_CR == 1);
  assert(credits_num % machine_num == 0);  // CR ONLY
  assert(ENABLE_COALESCE_OF_HOT_REQS == 0);

  /// WARNING: only defines (no dynamically passed cli arguments) work for cr
  /// worker
  assert(max_coalesce <= MAX_REQ_COALESCE);
  assert(num_workers <= MAX_WORKERS_PER_MACHINE);
  assert(max_batch_size <= MAX_BATCH_KVS_OPS_SIZE);
  assert(credits_num <= MAX_CREDITS_PER_REMOTE_WORKER_CR);
  // NOTE: the channels below are sized with this compile-time maximum
  // (credit_num), not with the run-time credits_num asserted above.
  const uint16_t credit_num = MAX_CREDITS_PER_REMOTE_WORKER_CR;

  struct thread_params params = *(struct thread_params*)arg;
  uint16_t worker_lid = (uint16_t)params.id;  // Local ID of this worker thread
  uint16_t worker_gid =
      (uint16_t)(machine_id * num_workers +
                 params.id);  // Global ID of this worker thread
  // TODO check if the previous assignment (below is the correct one)
  //    uint16_t worker_gid = (uint16_t) (machine_id * MAX_WORKERS_PER_MACHINE +
  //    params.id);	// Global ID of this worker thread

  /* --------------------------------------------------------
  ------------------- RDMA WINGS DECLARATIONS---------------
  ---------------------------------------------------------*/
  ud_channel_t ud_channels[CR_TOTAL_WORKER_UD_QPs];
  ud_channel_t* ud_channel_ptrs[CR_TOTAL_WORKER_UD_QPs];

  for (int i = 0; i < CR_TOTAL_WORKER_UD_QPs; ++i)
    ud_channel_ptrs[i] = &ud_channels[i];

  // Named aliases into ud_channels, one per channel id
  ud_channel_t* inv_ud_c = ud_channel_ptrs[CR_INV_UD_QP_ID];
  ud_channel_t* inv_crd_ud_c = ud_channel_ptrs[CR_INV_CRD_UD_QP_ID];
  ud_channel_t* ack_ud_c = ud_channel_ptrs[CR_ACK_UD_QP_ID];
  ud_channel_t* rem_reads_ud_c = ud_channel_ptrs[CR_REMOTE_READS_UD_QP_ID];
  ud_channel_t* rem_read_resp_ud_c =
      ud_channel_ptrs[CR_REMOTE_READS_RESP_UD_QP_ID];
  ud_channel_t* rem_writes_ud_c = ud_channel_ptrs[CR_REMOTE_WRITES_UD_QP_ID];
  ud_channel_t* rem_writes_crd_ud_c =
      ud_channel_ptrs[CR_REMOTE_WRITE_CRD_UD_QP_ID];

  // Flags passed to wings_ud_channel_init below
  const uint8_t is_bcast = 0;
  const uint8_t stats_on = 1;
  const uint8_t prints_on = 1;
  const uint8_t is_hdr_only = 0;
  const uint8_t expl_crd_ctrl = 0;
  const uint8_t disable_crd_ctrl = 0;

  // Channel names: an ANSI-colored label plus the local worker id
  char inv_qp_name[200], ack_qp_name[200], rem_writes_qp_name[200],
      rem_reads_qp_name[200], rem_read_resps_qp_name[200];
  sprintf(inv_qp_name, "%s[%d]", "\033[31mINV\033[0m", worker_lid);
  sprintf(ack_qp_name, "%s[%d]", "\033[33mACK\033[0m", worker_lid);
  sprintf(rem_writes_qp_name, "%s[%d]", "\033[1m\033[32mREMOTE_WRITES\033[0m",
          worker_lid);
  sprintf(rem_reads_qp_name, "%s[%d]", "\033[1m\033[32mREMOTE_READS\033[0m",
          worker_lid);
  sprintf(rem_read_resps_qp_name, "%s[%d]",
          "\033[1m\033[32mREMOTE_READ_RESPS\033[0m", worker_lid);

  // Inlining is enabled only when it is not globally disabled and a fully
  // coalesced packet stays below WINGS_MAX_SUPPORTED_INLINING
  uint8_t inv_inlining =
      (DISABLE_INLINING == 0 &&
       max_coalesce * sizeof(spacetime_inv_t) < WINGS_MAX_SUPPORTED_INLINING)
          ? 1
          : 0;
  uint8_t ack_inlining =
      (DISABLE_INLINING == 0 &&
       max_coalesce * sizeof(spacetime_ack_t) < WINGS_MAX_SUPPORTED_INLINING)
          ? 1
          : 0;
  uint8_t rem_writes_inlining = inv_inlining;
  uint8_t rem_reads_inlining = inv_inlining;

  if (CR_ENABLE_EARLY_INV_CRDS) {
    // Early INV credits: the INV channel is linked to the dedicated
    // inv_crd_ud_c channel, and the ACK channel is initialized without a
    // linked channel.
    wings_ud_channel_init(inv_ud_c, inv_qp_name, REQ, MAX_REQ_COALESCE,
                          sizeof(spacetime_inv_t), 0, inv_inlining, is_hdr_only,
                          is_bcast, disable_crd_ctrl, 1, inv_crd_ud_c,
                          credit_num, machine_num, (uint8_t)machine_id,
                          stats_on, prints_on);

    wings_ud_channel_init(
        ack_ud_c, ack_qp_name, RESP, MAX_REQ_COALESCE, sizeof(spacetime_ack_t),
        0, ack_inlining, is_hdr_only, is_bcast, 1, expl_crd_ctrl, NULL,
        CR_ACK_CREDITS, machine_num, (uint8_t)machine_id, stats_on, prints_on);
  } else {
    // Otherwise the INV and ACK channels are linked to each other
    wings_ud_channel_init(inv_ud_c, inv_qp_name, REQ, MAX_REQ_COALESCE,
                          sizeof(spacetime_inv_t), 0, inv_inlining, is_hdr_only,
                          is_bcast, disable_crd_ctrl, expl_crd_ctrl, ack_ud_c,
                          credit_num, machine_num, (uint8_t)machine_id,
                          stats_on, prints_on);

    wings_ud_channel_init(ack_ud_c, ack_qp_name, RESP, MAX_REQ_COALESCE,
                          sizeof(spacetime_ack_t), 0, ack_inlining, is_hdr_only,
                          is_bcast, disable_crd_ctrl, expl_crd_ctrl, inv_ud_c,
                          credit_num, machine_num, (uint8_t)machine_id,
                          stats_on, prints_on);
  }

  // Remote writes get an equal share of the credits per machine and are
  // linked to the dedicated rem_writes_crd_ud_c channel
  const uint16_t cr_remote_write_credits = credit_num / machine_num;
  wings_ud_channel_init(
      rem_writes_ud_c, rem_writes_qp_name, REQ, MAX_REQ_COALESCE,
      sizeof(spacetime_op_t), 0, rem_writes_inlining, is_hdr_only, is_bcast,
      disable_crd_ctrl, 1, rem_writes_crd_ud_c, cr_remote_write_credits,
      machine_num, (uint8_t)machine_id, stats_on, prints_on);

  ///////////////
  ///< 4th stage>
  // Remote reads and their responses are linked to each other
  if (CR_ENABLE_REMOTE_READS) {
    wings_ud_channel_init(rem_reads_ud_c, rem_reads_qp_name, REQ,
                          MAX_REQ_COALESCE, sizeof(spacetime_op_t), 0,
                          rem_reads_inlining, is_hdr_only, is_bcast,
                          disable_crd_ctrl, expl_crd_ctrl, rem_read_resp_ud_c,
                          CR_REMOTE_READS_CREDITS, machine_num,
                          (uint8_t)machine_id, stats_on, prints_on);

    wings_ud_channel_init(rem_read_resp_ud_c, rem_read_resps_qp_name, RESP,
                          MAX_REQ_COALESCE, sizeof(spacetime_op_t), 0,
                          rem_reads_inlining, is_hdr_only, is_bcast,
                          disable_crd_ctrl, expl_crd_ctrl, rem_reads_ud_c,
                          CR_REMOTE_READS_CREDITS, machine_num,
                          (uint8_t)machine_id, stats_on, prints_on);
  }
  ///</4th stage>
  ///////////////

  wings_setup_channel_qps_and_recvs(ud_channel_ptrs, CR_TOTAL_WORKER_UD_QPs,
                                    g_share_qs_barrier, worker_lid);

  /* -------------------------------------------------------
  ------------------- OTHER DECLARATIONS--------------------
  ---------------------------------------------------------*/
  // Intermediate buffs where reqs are copied from incoming_* buffs in order to
  // get passed to the KVS
  spacetime_op_t* ops;
  spacetime_inv_t* inv_recv_ops;
  spacetime_ack_t* ack_recv_ops;
  spacetime_val_t* val_recv_ops;  // UNUSED!
  uint32_t coh_ops_len =
      (uint32_t)(credits_num * machine_num *
                 max_coalesce);  // credits * remote_machines * max_req_coalesce

  setup_kvs_buffs(&ops, &inv_recv_ops, &ack_recv_ops, &val_recv_ops);

  // Remote writes init: max_batch_size slots, all marked empty
  // NOTE(review): the memalign result is used without a NULL check
  spacetime_op_t* remote_writes =
      memalign(4096, max_batch_size * (sizeof(spacetime_op_t)));
  memset(remote_writes, 0, max_batch_size * (sizeof(spacetime_op_t)));
  for (int i = 0; i < max_batch_size; ++i) {
    remote_writes[i].op_meta.state = ST_EMPTY;
    remote_writes[i].op_meta.opcode = ST_EMPTY;
  }

  ///////////////
  ///< 4th stage>
  // Remote reads buffer: used for polling remote reads on tail & remote read
  // responses on the rest nodes
  spacetime_op_t* remote_reads =
      memalign(4096, max_batch_size * (sizeof(spacetime_op_t)));
  memset(remote_reads, 0, max_batch_size * (sizeof(spacetime_op_t)));
  for (int i = 0; i < max_batch_size; ++i) {
    remote_reads[i].op_meta.state = ST_EMPTY;
    remote_reads[i].op_meta.opcode = ST_EMPTY;
  }
  ///</4th stage>
  ///////////////

  struct spacetime_trace_command* trace;
  trace_init(&trace, worker_gid);

  //// <UNUSED>
  // Only passed through to refill_ops; hot-request coalescing is asserted to
  // be disabled at the top of this function.
  spacetime_op_t* n_hottest_keys_in_ops_get[COALESCE_N_HOTTEST_KEYS];
  spacetime_op_t* n_hottest_keys_in_ops_put[COALESCE_N_HOTTEST_KEYS];
  for (int i = 0; i < COALESCE_N_HOTTEST_KEYS; ++i) {
    n_hottest_keys_in_ops_get[i] = NULL;
    n_hottest_keys_in_ops_put[i] = NULL;
  }
  ////</UNUSED>

  // has_outstanding_invs: while non-zero, no new INVs are polled and the
  // previously polled batch is re-issued instead (see the INV section below).
  // has_outstanding_rem_writes and remote_writes_polled are declared but not
  // used in this function.
  uint8_t has_outstanding_invs = 0;
  uint8_t has_outstanding_rem_writes = 0;
  uint32_t trace_iter = 0;
  uint16_t rolling_idx = 0, remote_reads_rolling_idx = 0;
  uint16_t invs_polled = 0, acks_polled = 0, remote_writes_polled = 0;
  uint32_t num_of_iters_serving_op[MAX_BATCH_KVS_OPS_SIZE] = {0};

  // Number of ST_EMPTY slots at the back of remote_writes
  uint16_t free_rem_write_slots = max_batch_size;
  /// Spawn stats thread (only from the first local worker)
  if (worker_lid == 0)
    if (spawn_stats_thread() != 0)
      colored_printf(RED, "Stats thread was not successfully spawned \n");

  struct timespec stopwatch_for_req_latency;
  /* -----------------------------------------------------
 ------------------------Main Loop--------------------
     ----------------------------------------------------- */
  while (true) {
    if (unlikely(w_stats[worker_lid].total_loops % M_16 == 0)) {
      // Hook for periodic debugging (every M_16 iterations). The helpers
      // print_total_stalls_due_to_credits,
      // print_total_send_recv_msgs_n_credits and
      // print_ops_and_remote_write_ops can be called from here.
    }

    /// DONE
    // 1st stage: head only initiate requests
    // [DONE] 2nd stage: + rest nodes initiate (local) reads
    // [DONE] 3rd stage: + rest nodes initiate (remote) writes via head [DONE]
    // 4th stage: + rest nodes initiate remote reads when invalid    [DONE]
    // 5th stage: + add early INV credits to pipeline more reqs      [DONE]
    // 6th stage: + poll for remote writes even though stalled exist [DONE]
    // 7th stage: + poll for messages instead of pkts (ie if you have
    //              empty space buff slots < max_coalesce poll pkt
    //              and buffer additional packets                    [DONE]
    // 8th stage: + Do not stall writes that found Invalid on head   [DONE]

    // Refill the local ops buffer from the trace and batch it to the KVS
    // (on every node, or on the head only when CR_ENABLE_ONLY_HEAD_REQS)
    if (!CR_ENABLE_ONLY_HEAD_REQS || machine_id == head_id()) {
      refill_ops(&trace_iter, worker_lid, trace, ops, num_of_iters_serving_op,
                 &stopwatch_for_req_latency, n_hottest_keys_in_ops_get,
                 n_hottest_keys_in_ops_put);
      cr_batch_ops_to_KVS(Local_ops, (uint8_t*)ops, max_batch_size,
                          sizeof(spacetime_op_t), NULL);

      // TODO: moved
      stop_latency_of_completed_reads(ops, worker_lid,
                                      &stopwatch_for_req_latency);
    }

    // Everything below (including the remote-read handling) runs only when
    // the workload contains updates
    if (update_ratio > 0) {
      if (machine_id == head_id()) {
        const uint16_t max_outstanding_writes =
            (machine_num - 1) * CR_ACK_CREDITS;

        // With early INV credits, new INVs are only issued while the number
        // of sent-but-not-yet-acked INVs is within max_outstanding_writes
        if (!CR_ENABLE_EARLY_INV_CRDS ||
            inv_ud_c->stats.send_total_msgs - ack_ud_c->stats.recv_total_msgs <=
                max_outstanding_writes) {  /// Initiate INVs for head writes
          wings_issue_pkts(
              inv_ud_c, NULL, (uint8_t*)ops, max_batch_size,
              sizeof(spacetime_op_t), &rolling_idx, inv_skip_or_get_sender_id,
              inv_modify_elem_after_send, inv_copy_and_modify_elem);
        }

        ///////////////
        ///< 3rd stage>
        if (!CR_ENABLE_ONLY_HEAD_REQS) {
          // Poll new remote writes into the free slots at the back of the
          // remote_writes buffer
          wings_poll_buff_and_post_recvs(
              rem_writes_ud_c, free_rem_write_slots,
              (uint8_t*)&remote_writes[max_batch_size - free_rem_write_slots]);

          cr_batch_ops_to_KVS(Remote_writes, (uint8_t*)remote_writes,
                              max_batch_size, sizeof(spacetime_op_t), NULL);

          if (!CR_ENABLE_EARLY_INV_CRDS ||
              inv_ud_c->stats.send_total_msgs -
                      ack_ud_c->stats.recv_total_msgs <=
                  max_outstanding_writes) {  /// Initiate INVs for remotes
                                             /// writes
            wings_issue_pkts(inv_ud_c, NULL, (uint8_t*)remote_writes,
                             max_batch_size, sizeof(spacetime_op_t), NULL,
                             remote_write_head_skip_or_get_sender_id,
                             remote_write_head_modify_elem_after_send,
                             remote_write_head_copy_and_modify_elem);

            /// Issue credits for remotes writes
            wings_issue_credits(rem_writes_crd_ud_c, NULL,
                                (uint8_t*)remote_writes, max_batch_size,
                                sizeof(spacetime_op_t),
                                rem_write_crd_skip_or_get_sender_id,
                                rem_write_crd_modify_elem_after_send);
          }

          // Compact still-pending writes to the front of the buffer and
          // remember how many slots are free for the next poll
          free_rem_write_slots =
              cr_move_stalled_writes_to_top_n_return_free_space(remote_writes);
        }

      } else if (!CR_ENABLE_ONLY_HEAD_REQS)
        /// Initiate Remote writes
        wings_issue_pkts(rem_writes_ud_c, NULL, (uint8_t*)ops, max_batch_size,
                         sizeof(spacetime_op_t), &rolling_idx,
                         remote_write_skip_or_get_sender_id,
                         inv_modify_elem_after_send,
                         remote_write_copy_and_modify_elem);

      ///</3rd stage>
      ///////////////

      ///////////////
      ///< 4th stage>
      if (CR_ENABLE_REMOTE_READS) {
        if (machine_id == tail_id()) {
          /// Poll Remote reads
          uint16_t remote_reads_polled = wings_poll_buff_and_post_recvs(
              rem_reads_ud_c, max_batch_size, (uint8_t*)remote_reads);

          /// Batch Remote reads to KVS
          cr_batch_ops_to_KVS(Remote_reads, (uint8_t*)remote_reads,
                              remote_reads_polled, sizeof(spacetime_op_t),
                              NULL);

          /// Issue responses of Remote reads
          wings_issue_pkts(rem_read_resp_ud_c, NULL, (uint8_t*)remote_reads,
                           remote_reads_polled, sizeof(spacetime_op_t), NULL,
                           remote_read_resp_skip_or_get_sender_id,
                           remote_read_resp_modify_elem_after_send,
                           remote_read_resp_copy_and_modify_elem);

        } else {
          /// Initiate Remote reads
          wings_issue_pkts(rem_reads_ud_c, NULL, (uint8_t*)ops, max_batch_size,
                           sizeof(spacetime_op_t), &remote_reads_rolling_idx,
                           remote_read_skip_or_get_sender_id,
                           remote_read_modify_elem_after_send,
                           remote_read_copy_and_modify_elem);

          // Sanity check (not guarded by ENABLE_ASSERTIONS): local ops must
          // still hold a PUT or GET opcode
          for (int i = 0; i < max_batch_size; i++)
            assert(ops[i].op_meta.opcode == ST_OP_PUT ||
                   ops[i].op_meta.opcode == ST_OP_GET);

          /// Poll responses of Remote reads
          uint16_t remote_read_resps_polled = wings_poll_buff_and_post_recvs(
              rem_read_resp_ud_c, max_batch_size, (uint8_t*)remote_reads);
          /// Complete Remote reads
          cr_complete_local_reads(remote_reads, remote_read_resps_polled, ops);
          stop_latency_of_completed_reads(ops, worker_lid,
                                          &stopwatch_for_req_latency);

          for (int i = 0; i < max_batch_size; i++)
            assert(ops[i].op_meta.opcode == ST_OP_PUT ||
                   ops[i].op_meta.opcode == ST_OP_GET);
        }
      }
      ///</4th stage>
      ///////////////

      if (machine_id != head_id()) {
        /// Poll for INVs (only when no previously polled batch is pending)
        if (has_outstanding_invs == 0) {
          invs_polled = wings_poll_buff_and_post_recvs(inv_ud_c, coh_ops_len,
                                                       (uint8_t*)inv_recv_ops);

          if (invs_polled > 0) {
            /// Batch INVs to KVS
            cr_batch_ops_to_KVS(Invs, (uint8_t*)inv_recv_ops, invs_polled,
                                sizeof(spacetime_inv_t), ops);

            if (CR_ENABLE_EARLY_INV_CRDS)
              /// Issue credits for INVs to previous node in chain
              wings_issue_credits(inv_crd_ud_c, NULL, (uint8_t*)inv_recv_ops,
                                  invs_polled, sizeof(spacetime_inv_t),
                                  inv_crd_skip_or_get_sender_id,
                                  inv_crd_modify_elem_after_send);
          }
        }

        if (invs_polled > 0) {
          // The return value of wings_issue_pkts is stored in
          // has_outstanding_invs, which gates the INV polling above
          if (machine_id != tail_id() && machine_id != head_id())
            /// Forward INVS to next node in chain
            has_outstanding_invs = wings_issue_pkts(
                inv_ud_c, NULL, (uint8_t*)inv_recv_ops, invs_polled,
                sizeof(spacetime_inv_t), NULL, inv_skip_or_fwd_to_next_node,
                inv_fwd_modify_elem_after_send, inv_fwd_copy_and_modify_elem);

          else if (machine_id == tail_id()) {
            /// Initiate ACKS (forward to prev)
            has_outstanding_invs = wings_issue_pkts(
                ack_ud_c, NULL, (uint8_t*)inv_recv_ops, invs_polled,
                sizeof(spacetime_inv_t), NULL, ack_skip_or_get_sender_id,
                ack_modify_elem_after_send, ack_copy_and_modify_elem);
            if (ENABLE_ASSERTIONS)
              assert(ack_ud_c->stats.send_total_msgs ==
                     inv_ud_c->stats.recv_total_msgs -
                         inv_ud_c->num_overflow_msgs);
          }
        }
      }

      if (machine_id != tail_id()) {
        /// Poll for Acks
        acks_polled = wings_poll_buff_and_post_recvs(ack_ud_c, coh_ops_len,
                                                     (uint8_t*)ack_recv_ops);

        if (acks_polled > 0) {
          /// Batch ACKs to KVS
          cr_batch_ops_to_KVS(Acks, (uint8_t*)ack_recv_ops, acks_polled,
                              sizeof(spacetime_ack_t), ops);

          stop_latency_of_completed_writes(ops, worker_lid,
                                           &stopwatch_for_req_latency);
        }

        if (machine_id != head_id()) {
          /// FWD ACKs to previous node if not the Head
          wings_issue_pkts(
              ack_ud_c, NULL, (uint8_t*)ack_recv_ops, acks_polled,
              sizeof(spacetime_ack_t), NULL, ack_fwd_skip_or_get_sender_id,
              ack_fwd_modify_elem_after_send, ack_fwd_copy_and_modify_elem);
          if (ENABLE_ASSERTIONS)
            assert(ack_ud_c->stats.send_total_msgs ==
                   ack_ud_c->stats.recv_total_msgs -
                       ack_ud_c->num_overflow_msgs);

        } else  /// empty ack_rcv_ops in head node
          for (int i = 0; i < coh_ops_len; ++i)
            ack_recv_ops[i].opcode = ST_EMPTY;
      }
    }
    w_stats[worker_lid].total_loops++;
  }

  return NULL;
}


================================================
FILE: src/hades/hades.c
================================================
//
// Created by akatsarakis on 12/02/19.
//

#include "../../include/hades/hades.h"
#include <getopt.h>

// Request element handed to wings_issue_pkts() when sending a heartbeat:
// bundles a pointer to the view to send with the node id it is destined for.
typedef struct {
  // View that hades_copy_and_modify_elem() copies into the outgoing message
  hades_view_t* ctx_last_local_view;
  // Destination node id, returned by hades_skip_or_get_dst_id()
  uint8_t dst_id;
} hades_view_wrapper_w_dst_id_t;

int
hades_skip_or_get_dst_id(uint8_t* req)
{
  return ((hades_view_wrapper_w_dst_id_t*)req)->dst_id;
}

// wings "copy and modify" callback for heartbeats: fills the outgoing message
// slot with the view referenced by the triggering request.
//   msg_to_send:    start of the message slot to fill (holds one hades_view_t)
//   triggering_req: the hades_view_wrapper_w_dst_id_t that triggered the send
void
hades_copy_and_modify_elem(uint8_t* msg_to_send, uint8_t* triggering_req)
{
  hades_view_wrapper_w_dst_id_t* last_local_view =
      (hades_view_wrapper_w_dst_id_t*)triggering_req;
  // The view must start exactly at msg_to_send: any offset would write outside
  // of the slot and shift the payload seen by the receiver
  hades_view_t* send_hbt = (hades_view_t*)msg_to_send;

  *send_hbt = *last_local_view->ctx_last_local_view;
}

// wings callback for credits: req points to a received hades_view_t; returns
// the id of the node that sent it.
int
hades_crd_skip_or_get_sender_id(uint8_t* req)
{
  // A credit is always sent back, i.e. a request is never skipped
  return ((hades_view_t*)req)->node_id;
}

// Debug helper: prints the send counter of the heartbeat channel, the node and
// epoch ids of the intermediate local view and the current group membership.
static inline void
print_send_hbt(ud_channel_t* hbeat_c, hades_ctx_t* ctx)
{
  hades_view_t* my_view = &ctx->intermediate_local_view;

  colored_printf(YELLOW, "Send view[%lu]: {node %d, epoch_id %d} ",
                 hbeat_c->stats.send_total_msgs, my_view->node_id,
                 my_view->epoch_id);
  bv_print_enhanced(ctx->curr_g_membership);
  printf("\n");
}

// Debug helper: prints the first no_hbts views of hbt_array (node id, epoch id
// and view bit vector), each prefixed with the channel's receive counter.
static inline void
print_recved_hbts(ud_channel_t* hbeat_c, hades_view_t* hbt_array,
                  uint16_t no_hbts)
{
  for (hades_view_t* hbt = hbt_array; hbt < hbt_array + no_hbts; ++hbt) {
    colored_printf(GREEN, "Recved view[%lu]: {node %d, epoch_id %d} ",
                   hbeat_c->stats.recv_total_msgs, hbt->node_id,
                   hbt->epoch_id);
    bv_print_enhanced(hbt->view);
    printf("\n");
  }
}

// Returns the number of nodes that form a majority out of ctx->max_num_nodes
// (floor(n / 2) + 1, which is 2 for a two-node deployment).
// Asserts that there is more than one node.
static inline uint8_t
majority_of_nodes(hades_ctx_t* ctx)
{
  assert(ctx->max_num_nodes > 1);

  const uint8_t half_of_nodes = (uint8_t)(ctx->max_num_nodes / 2);
  return (uint8_t)(half_of_nodes + 1);
}

// Prints a warning when the last local view still contained a majority of the
// nodes but the intermediate (currently gathered) local view does not.
static inline void
check_if_majority_is_rechable(hades_ctx_t* h_ctx)
{
  const uint8_t quorum = majority_of_nodes(h_ctx);

  // Nothing to report unless we just dropped below a majority
  if (bv_no_setted_bits(h_ctx->last_local_view.view) < quorum) return;
  if (bv_no_setted_bits(h_ctx->intermediate_local_view.view) >= quorum) return;

  colored_printf(RED, "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n");
  colored_printf(RED, "~ [HADES WARNING]: I cannot reach a majority ! ~\n");
  colored_printf(RED, "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n");

  colored_printf(YELLOW, "Last membership (epoch %d): ",
                 h_ctx->intermediate_local_view.epoch_id);
  bv_print_enhanced(h_ctx->curr_g_membership);

  colored_printf(YELLOW, "My current view: ");
  bv_print_enhanced(h_ctx->intermediate_local_view.view);

  colored_printf(RED, "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n");
}

// Fault-injection helper: returns 1 when the heartbeat towards node_id must be
// suppressed to emulate a failure of the link between
// FAKE_LINK_FAILURE_NODE_A and FAKE_LINK_FAILURE_NODE_B, and 0 otherwise.
// The failure is active only while the time elapsed since the first call on an
// affected link lies strictly between FAKE_LINK_FAILURE_AFTER_SEC and
// STOP_FAKE_LINK_FAILURE_AFTER_SEC. State is kept in function-local statics.
static inline uint8_t
skip_to_apply_fake_link_failure(uint8_t node_id)
{
  static uint8_t timer_started = 0;
  static uint8_t failure_announced = 0;
  static struct timespec ts_first_call;

  // Direction A --> B always fails; B --> A only for a two-way failure
  const int from_a_to_b = machine_id == FAKE_LINK_FAILURE_NODE_A &&
                          node_id == FAKE_LINK_FAILURE_NODE_B;
  const int from_b_to_a = !FAKE_ONE_WAY_LINK_FAILURE &&
                          node_id == FAKE_LINK_FAILURE_NODE_A &&
                          machine_id == FAKE_LINK_FAILURE_NODE_B;
  if (!from_a_to_b && !from_b_to_a) return 0;

  if (!timer_started) {
    get_rdtsc_timespec(&ts_first_call);
    timer_started = 1;
  }

  // Outside of the failure window the link behaves normally
  if (!(time_elapsed_in_sec(ts_first_call) > FAKE_LINK_FAILURE_AFTER_SEC &&
        time_elapsed_in_sec(ts_first_call) < STOP_FAKE_LINK_FAILURE_AFTER_SEC))
    return 0;

  // Announce the failure only the first time it is applied
  if (!failure_announced) {
    colored_printf(RED, "%sLink failure between node %d and %d\n",
                   FAKE_ONE_WAY_LINK_FAILURE ? "One-way " : "",
                   FAKE_LINK_FAILURE_NODE_A, FAKE_LINK_FAILURE_NODE_B);
    failure_announced = 1;
  }
  return 1;
}

// Returns the bit of node_id in the current group membership bit vector.
static inline uint8_t
is_in_membership(hades_ctx_t* h_ctx, uint8_t node_id)
{
  const uint8_t membership_bit = bv_bit_get(h_ctx->curr_g_membership, node_id);
  return membership_bit;
}

// Returns 1 if node i must be skipped during arbitration, 0 otherwise.
// A node is skipped when any of the following holds (checked in this order):
//   1. it is the local machine
//   2. no view has been received from it
//   3. its received view has have_ostracised_for_dst_node set
//   4. the local machine id is not set in its received view
// (A check that i is in the current group membership is intentionally not
// performed.)
static inline uint8_t
skip_arbitration(hades_ctx_t* h_ctx, uint8_t i)
{
  return (uint8_t)(
      i == machine_id ||
      !h_ctx->recved_views_flag[i] ||
      h_ctx->remote_recved_views[i].have_ostracised_for_dst_node == 1 ||
      !bv_bit_get(h_ctx->remote_recved_views[i].view, machine_id));
}

// In case of a link failure (either two-way or one-way) between nodes A and B,
// the rest of the nodes are able to detect the failure using the views they
// receive and resolve it deterministically by choosing the node with the
// highest node id to be expelled from the group membership. Once a node is
// voted to be expelled by a majority of nodes it is removed from the
// membership. This method is inspired by the "ostracism" procedure of the
// Athenian democracy, in which any citizen could be expelled from the city of
// Athens for ten years.

// If a node has ostracised somebody for me, I cannot ostracise somebody for it
// Arbitrates between remote nodes that disagree about each other's liveness.
// For every pair (i, j), i < j, of nodes that pass skip_arbitration(): if at
// least one of them is missing from the other's received view, one of the two
// is "ostracised", i.e. its received-view flag is cleared and its bit is reset
// in the intermediate local view. have_ostracized_for[] is rebuilt from
// scratch on every call and records for which node an ostracism was performed.
static inline void
view_arbitration_via_ostracism(hades_ctx_t* h_ctx)
{
  // Start from a clean slate: nobody has been ostracised for anyone yet
  for (uint8_t i = 0; i < h_ctx->max_num_nodes; ++i)
    h_ctx->have_ostracized_for[i] = 0;

  for (uint8_t i = 0; i < h_ctx->max_num_nodes; ++i) {
    if (skip_arbitration(h_ctx, i)) continue;

    for (uint8_t j = 0; j < h_ctx->max_num_nodes; ++j) {
      if (i >= j) continue;  // for efficiency we do not need to check those
      // NOTE: recved_views_flag[] may have been cleared by an earlier
      // ostracism in this very call, which makes skip_arbitration() skip
      // already-ostracised nodes
      if (skip_arbitration(h_ctx, j)) continue;

      // Does each of the two nodes include the other one in its view?
      uint8_t i_view_of_j = bv_bit_get(h_ctx->remote_recved_views[i].view, j);
      uint8_t j_view_of_i = bv_bit_get(h_ctx->remote_recved_views[j].view, i);

      if (i_view_of_j == 0 || j_view_of_i == 0) {
        // by default always ostracise this to the Max(i, j) --> j is always > i
        // unless it's an one way failure from the opposite side where we have
        // to ostracise i
        uint8_t node_to_ostracise = i_view_of_j == 1 ? i : j;
        uint8_t node_to_ostracised_for = i_view_of_j == 1 ? j : i;

        // Drop the ostracised node from the local view and remember on whose
        // behalf this was done
        h_ctx->recved_views_flag[node_to_ostracise] = 0;
        h_ctx->have_ostracized_for[node_to_ostracised_for] = 1;
        bv_bit_reset(&h_ctx->intermediate_local_view.view, node_to_ostracise);

        //                yellow_printf("Ostracism: between nodes %d-%d --> %d
        //                is ostracized\n", i, j, node_to_ostracise); printf("My
        //                view: (epoch %d)\n",
        //                h_ctx->intermediate_local_view.epoch_id);
        //                bv_print_enhanced(h_ctx->intermediate_local_view.view);
      }
    }
  }
}

// Returns the highest epoch id among the remote views whose received flag is
// set, or 0 if there is none.
static inline uint8_t
get_max_received_epoch_id(hades_ctx_t* h_ctx)
{
  uint8_t highest_epoch = 0;

  for (int node = 0; node < h_ctx->max_num_nodes; ++node) {
    if (h_ctx->recved_views_flag[node] != 1) continue;

    if (h_ctx->remote_recved_views[node].epoch_id > highest_epoch)
      highest_epoch = h_ctx->remote_recved_views[node].epoch_id;
  }

  return highest_epoch;
}

// Periodic view / membership update. Does nothing unless more than
// update_local_view_every_ms milliseconds elapsed since the last update.
// Otherwise it:
//   1. optionally arbitrates conflicting views (ENABLE_ARBITRATION)
//   2. if the intermediate local view differs from the current membership, or
//      a higher epoch id was received, counts the received views that equal
//      the intermediate local view and, when they form a majority, installs
//      that view as the new group membership
//   3. warns if a majority is no longer reachable
//   4. publishes the intermediate view as the last local view (the one sent in
//      heartbeats) and resets the intermediate view to contain only this node
static inline void
update_view_n_membership(hades_ctx_t* h_ctx)
{
  if (time_elapsed_in_ms(h_ctx->ts_last_view_change) >
      h_ctx->update_local_view_every_ms) {
    get_rdtsc_timespec(&h_ctx->ts_last_view_change);  // Reset timer

    uint8_t views_aggreeing = 1;  // (always agree with my local view)
    uint8_t same_w_local_membership = 0;
    uint16_t max_epoch_id = h_ctx->intermediate_local_view.epoch_id;

    if (ENABLE_ARBITRATION) view_arbitration_via_ostracism(h_ctx);

    // if view has changed update ctx
    if (!bv_are_equal(h_ctx->intermediate_local_view.view,
                      h_ctx->curr_g_membership) ||
        get_max_received_epoch_id(h_ctx) >
            h_ctx->intermediate_local_view.epoch_id) {
      for (int i = 0; i < h_ctx->max_num_nodes; ++i) {
        if (i == machine_id) continue;
        if (h_ctx->recved_views_flag[i] == 0) continue;

        // Count the remote views that are identical to mine and track the
        // highest epoch id (and its same_w_local_membership flag) among them
        if (bv_are_equal(h_ctx->intermediate_local_view.view,
                         h_ctx->remote_recved_views[i].view)) {
          views_aggreeing++;
          if (max_epoch_id < h_ctx->remote_recved_views[i].epoch_id) {
            max_epoch_id = h_ctx->remote_recved_views[i].epoch_id;
            same_w_local_membership =
                h_ctx->remote_recved_views[i].same_w_local_membership;
          }
        }
        h_ctx->recved_views_flag[i] = 0;  // reset the received flag
      }

      if (views_aggreeing >= majority_of_nodes(h_ctx)) {
        // Adopt the highest agreeing epoch id as is when its sender flagged
        // its view as same_w_local_membership; otherwise move one epoch past it
        h_ctx->intermediate_local_view.epoch_id =
            (uint8_t)(max_epoch_id + (same_w_local_membership == 1 ? 0 : 1));
        bv_copy(&h_ctx->curr_g_membership, h_ctx->intermediate_local_view.view);

        //                printf("Max epoch id: %d, same_w_local_membership:
        //                %d\n",
        //                        max_epoch_id, same_w_local_membership);
        colored_printf(YELLOW, "[HADES] MEMBERSHIP CHANGE --> [epoch %d], ",
                       h_ctx->intermediate_local_view.epoch_id);
        bv_print(h_ctx->curr_g_membership);
        printf("\n");
        //                bv_print_enhanced(h_ctx->curr_g_membership);
      }
    }

    // Must run before the intermediate view is reset below, since it compares
    // the last and the intermediate local views
    check_if_majority_is_rechable(h_ctx);

    // update last local view
    h_ctx->last_local_view = h_ctx->intermediate_local_view;
    h_ctx->last_local_view.same_w_local_membership =
        bv_are_equal(h_ctx->last_local_view.view, h_ctx->curr_g_membership);

    // Reset local view
    bv_reset_all(&h_ctx->intermediate_local_view.view);
    bv_bit_set(&h_ctx->intermediate_local_view.view, (uint8_t)machine_id);
  }
}

// Sends the last local view (heartbeat) to every other node whose per-node
// send timer has expired (more than send_view_every_us microseconds since the
// last successful send). The send timestamp of a node is only advanced when
// wings_issue_pkts() reports success (returns 0).
static inline void
issue_heartbeats(hades_wings_ctx_t* hw_ctx)
{
  hades_ctx_t* ctx = &hw_ctx->ctx;

  hades_view_wrapper_w_dst_id_t hbt_req;
  hbt_req.ctx_last_local_view = &ctx->last_local_view;

  for (uint8_t dst = 0; dst < ctx->max_num_nodes; ++dst) {
    // The view to send carries a per-destination ostracism flag
    ctx->last_local_view.have_ostracised_for_dst_node =
        ctx->have_ostracized_for[dst];

    if (dst == machine_id) continue;
    if (FAKE_LINK_FAILURE && skip_to_apply_fake_link_failure(dst)) continue;

    hbt_req.dst_id = dst;

    // Too early to send to this destination again
    if (!(time_elapsed_in_us(ctx->ts_last_send[dst]) > ctx->send_view_every_us))
      continue;

    // Take the timestamp before sending, but commit it only on success so that
    // a send failing due to not enough credits is retried
    struct timespec ts_before_send;
    get_rdtsc_timespec(&ts_before_send);

    uint8_t send_failed = wings_issue_pkts(
        hw_ctx->hviews_c, NULL, (uint8_t*)&hbt_req, 1,
        sizeof(hades_view_wrapper_w_dst_id_t), NULL, hades_skip_or_get_dst_id,
        wings_NOP_modify_elem_after_send, hades_copy_and_modify_elem);

    if (!send_failed) ctx->ts_last_send[dst] = ts_before_send;
  }
}

// One step of the failure detector's sending side: first refresh the local
// view / group membership, then send heartbeats carrying the refreshed view.
void
update_view_and_issue_hbs(hades_wings_ctx_t* hw_ctx)
{
  hades_ctx_t* ctx = &hw_ctx->ctx;

  update_view_n_membership(ctx);
  issue_heartbeats(hw_ctx);
}

// Polls the views channel for received views, records each of them in the
// hades context (received flag, stored remote view, bit in the intermediate
// local view) and sends a credit back for every polled view.
// Returns the number of views polled.
uint16_t
poll_for_remote_views(hades_wings_ctx_t* hw_ctx)
{
  hades_ctx_t* ctx = &hw_ctx->ctx;

  const uint16_t num_polled = wings_poll_buff_and_post_recvs(
      hw_ctx->hviews_c, ctx->max_views_to_poll, (uint8_t*)ctx->poll_buff);

  for (uint16_t v = 0; v < num_polled; ++v) {
    const uint8_t from = ctx->poll_buff[v].node_id;

    ctx->recved_views_flag[from] = 1;
    ctx->remote_recved_views[from] = ctx->poll_buff[v];
    bv_bit_set(&ctx->intermediate_local_view.view, from);

    // A sender announcing epoch 0 while we are past epoch 1 and hold no
    // credits for it is treated as a node trying to rejoin
    const int sender_rejoins =
        ctx->last_local_view.epoch_id > 1 &&
        ctx->poll_buff[v].epoch_id == 0 &&
        hw_ctx->hviews_c->credits_per_channels[from] == 0;
    if (!sender_rejoins) continue;

    /// Its credits must be reset and the qps reconfigured so that views can be
    /// sent to it again.
    /// Warning: qp info is currently shared via memcache, so if the node
    /// storing memcache (e.g. houston) fails it cannot re-join (its previous
    /// qp info is lost)
    printf("Resetting credits and reconfiguring ibv_qps for channel: %d\n",
           from);
    wings_reset_credits(hw_ctx->hviews_c, from);
    wings_reconfigure_wrs_ah(hw_ctx->hviews_c, from);
  }

  wings_issue_credits(hw_ctx->hviews_crd_c, NULL, (uint8_t*)ctx->poll_buff,
                      num_polled, sizeof(hades_view_t),
                      hades_crd_skip_or_get_sender_id,
                      wings_NOP_modify_elem_after_send);

  return num_polled;
}

// Thread body running the failure detector forever on an already initialised
// hades_wings_ctx_t (passed as void*). Each iteration updates the view, issues
// heartbeats and polls for remote views. Never returns.
void*
hades_loop_only_thread(void* hades_wings_ctx)
{
  hades_wings_ctx_t* hw_ctx = hades_wings_ctx;

  for (uint64_t iteration = 1;; ++iteration) {
    /// Hook for periodic debug printing (currently prints nothing)
    if (iteration % M_32 == 0) {
      //            printf("My view: (epoch %d)\n",
      //            hw_ctx->ctx.intermediate_local_view.epoch_id);
      //            bv_print_enhanced(hw_ctx->ctx.intermediate_local_view.view);
    }

    /// Main loop
    update_view_and_issue_hbs(hw_ctx);
    poll_for_remote_views(hw_ctx);
  }
}

// Thread body that sets up a stand-alone failure detector and runs it:
// initialises the hades/wings context with hard-coded parameters, sets up the
// two UD channels (views and their credits) and enters the hades loop.
//   node_id: pointer to a uint8_t holding the id of the local node
void*
hades_full_thread(void* node_id)
{
  /// Wings (rdma communication) channels: [0] views, [1] credits for views
  ud_channel_t channels[2];
  ud_channel_t* channel_ptrs[2] = {&channels[0], &channels[1]};
  ud_channel_t* views_channel = channel_ptrs[0];
  ud_channel_t* credits_channel = channel_ptrs[1];

  /// Failure detector parameters
  const uint8_t total_nodes = 3;
  const uint16_t worker_local_id = 0;
  const uint16_t views_per_poll = 10;
  const uint32_t send_period_us = 100;
  const uint32_t update_period_ms = 10;

  const uint8_t my_node_id = *((uint8_t*)node_id);

  hades_wings_ctx_t hw_ctx;
  hades_wings_ctx_init(&hw_ctx, my_node_id, total_nodes, views_per_poll,
                       send_period_us, update_period_ms, views_channel,
                       credits_channel, worker_local_id);

  wings_setup_channel_qps_and_recvs(channel_ptrs, 2, NULL, 0);

  hades_loop_only_thread(&hw_ctx);

  return NULL;
}


================================================
FILE: src/hades/test.c
================================================
//
// Created by akatsarakis on 21/05/19.
//

#include <getopt.h>
#include "../../include/hades/hades.h"

int
main(int argc, char* argv[])
{
  machine_id = -1;

  static struct option opts[] = {
      {.name = "machine-id", .has_arg = 1, .val = 'm'},
      {.name = "dev-name", .has_arg = 1, .val = 'd'},
      {0}};

  /* Parse and check arguments */
  while (1) {
    int c = getopt_long(argc, argv, "m:d:", opts, NULL);
    if (c == -1) {
      break;
    }
    switch (c) {
      case 'm':
        machine_id = atoi(optarg);
        break;
      case 'd':
        memcpy(dev_name, optarg, strlen(optarg));
        break;
      default:
        printf("Invalid argument %d\n", c);
        assert(false);
    }
  }

  hades_full_thread(&machine_id);
}


================================================
FILE: src/hermes/hermesKV.c
================================================
//
// Created by akatsarakis on 07/03/19.
//

#include <inline-util.h>
#include <spacetime.h>

//////////////////////////////////////////////////
/////////////////////// HERMES KVS (SPACETIME)
//////////////////////////////////////////////////

//////////// Assertion functions

// Sanity checks on a received invalidation: even timestamp version, opcode is
// INV or MEMBERSHIP_CHANGE, full-size value and, when there is exactly one
// remote machine, sender and tie-breaker ids equal to
// remote_machine_num - machine_id.
static inline void
hermes_assertions_begin_inv(spacetime_inv_t* inv_ptr)
{
  assert(inv_ptr->op_meta.ts.version % 2 == 0);
  assert(inv_ptr->op_meta.opcode == ST_OP_INV ||
         inv_ptr->op_meta.opcode == ST_OP_MEMBERSHIP_CHANGE);
  assert(inv_ptr->op_meta.val_len == (ST_VALUE_SIZE >> SHIFT_BITS));

  // With a single remote machine the origin of the message is known
  if (remote_machine_num == 1) {
    assert(inv_ptr->op_meta.sender == remote_machine_num - machine_id);
    assert(inv_ptr->op_meta.ts.tie_breaker_id ==
           remote_machine_num - machine_id);
  }
}

// Sanity checks on a received ack: even timestamp version, sender equal to
// remote_machine_num - machine_id when there is exactly one remote machine,
// and opcode being one of ACK, INV_ABORT or MEMBERSHIP_CHANGE.
static inline void
hermes_assertions_begin_ack(spacetime_ack_t* ack_ptr)
{
  assert(ack_ptr->ts.version % 2 == 0);

  // With a single remote machine the origin of the message is known
  if (remote_machine_num == 1)
    assert(ack_ptr->sender == remote_machine_num - machine_id);

  assert(ack_ptr->opcode == ST_OP_ACK || ack_ptr->opcode == ST_OP_INV_ABORT ||
         ack_ptr->opcode == ST_OP_MEMBERSHIP_CHANGE);

  /// WARNING: the tie_breaker_id of an ack must NOT be asserted against
  /// machine_id here; the following assertion is incorrect for write replays
  //	assert(group_membership.num_of_alive_remotes != MAX_REMOTE_MACHINES ||
  //	       ack_ptr->opcode == ST_OP_INV_ABORT ||
  //		   ack_ptr->ts.tie_breaker_id == machine_id ||
  //		   (ENABLE_VIRTUAL_NODE_IDS && ack_ptr->ts.tie_breaker_id  %
  // MAX_MACHINE_NUM == machine_id));
}

// Sanity checks on a received validation: even timestamp version, VAL opcode
// and, when there is exactly one remote machine, sender and tie-breaker ids
// equal to remote_machine_num - machine_id.
static inline void
hermes_assertions_begin_val(spacetime_val_t* val_ptr)
{
  assert(val_ptr->ts.version % 2 == 0);
  assert(val_ptr->opcode == ST_OP_VAL);

  // With a single remote machine the origin of the message is known
  if (remote_machine_num == 1) {
    assert(val_ptr->sender == remote_machine_num - machine_id);
    assert(val_ptr->ts.tie_breaker_id == remote_machine_num - machine_id);
  }
}

// Asserts, for each of the first max_batch_size ops, that it is either a GET
// or that its state is one of the states accepted below.
static inline void
hermes_assertions_end_read_write_ops(spacetime_op_t* read_write_op)
{
  for (spacetime_op_t* op = read_write_op; op < read_write_op + max_batch_size;
       ++op)
    assert(op->op_meta.opcode == ST_OP_GET ||
           op->op_meta.state == ST_MISS ||
           op->op_meta.state == ST_PUT_STALL ||
           op->op_meta.state == ST_PUT_SUCCESS ||
           op->op_meta.state == ST_PUT_COMPLETE ||
           op->op_meta.state == ST_IN_PROGRESS_PUT ||
           op->op_meta.state == ST_RMW_STALL ||
           op->op_meta.state == ST_RMW_ABORT ||
           op->op_meta.state == ST_RMW_SUCCESS ||
           op->op_meta.state == ST_RMW_COMPLETE ||
           op->op_meta.state == ST_IN_PROGRESS_RMW ||
           op->op_meta.state == ST_OP_MEMBERSHIP_CHANGE ||  /// TODO check this
           op->op_meta.state == ST_IN_PROGRESS_REPLAY);
}

/// Helper functions

// TODO inlining this function by hand can give higher xPut ~5% on 20% write
// rate
//
// Copies *curr_meta into *lock_free_read_meta without taking the lock,
// retrying until cctrl_timestamp_is_same_and_valid() accepts the copied cctrl
// against the live one. With ENABLE_ASSERTIONS a message is printed every M_4
// attempts.
static inline __attribute__((always_inline)) void
hermes_lock_free_read_obj_meta(spacetime_object_meta* lock_free_read_meta,
                               spacetime_object_meta* curr_meta)
{
  uint32_t attempts = 0;

  for (;;) {
    if (ENABLE_ASSERTIONS && ++attempts == M_4) {
      printf("Worker stuck on a lock-free read (for ACK)\n");
      attempts = 0;
    }

    // Take the snapshot, then validate it against the live metadata
    *lock_free_read_meta = *curr_meta;
    if (cctrl_timestamp_is_same_and_valid(&lock_free_read_meta->cctrl,
                                          &curr_meta->cctrl))
      break;
  }
}

// Seed of the pseudo-random generator used to pick a virtual node id
static uint64_t g_seed = 0xdeadbeef;

// Applies a local write (RMW_flag == 0) or RMW (RMW_flag == 1) to a key and
// releases the key's lock. The visible callers invoke it right after
// cctrl_lock(); the lock is released here by one of the cctrl_unlock_inc_*
// calls.
//   op_ptr:          the op being executed; its value is stored and its state,
//                    RMW flag and timestamp are updated
//   kv_ptr:          KVS entry of the key (its val_len is updated)
//   curr_meta:       metadata of the key; the value is stored right after it
//   idx:             index of op_ptr in the op buffer, recorded in the metadata
//   curr_membership: membership used to initialise the key's ack bit vector
//   RMW_flag:        1 for an RMW, 0 for a plain write
static inline void
hermes_update_actions_n_unlock(spacetime_op_t* op_ptr, struct mica_op* kv_ptr,
                               spacetime_object_meta* curr_meta, uint8_t idx,
                               spacetime_group_membership curr_membership,
                               uint8_t RMW_flag)
{
  if (ENABLE_ASSERTIONS) {
    assert(RMW_flag == 0 || ENABLE_RMWs);
    assert(idx < ST_OP_BUFFER_INDEX_EMPTY);
  }

  /// Copy value and update len
  uint8_t* kv_value_ptr = (uint8_t*)&curr_meta[1];
  memcpy(kv_value_ptr, op_ptr->value, ST_VALUE_SIZE);
  kv_ptr->val_len = set_val_len(&op_ptr->op_meta);

  /// update keys metadata and unlock
  curr_meta->RMW_flag = RMW_flag;
  curr_meta->state = WRITE_STATE;
  curr_meta->op_buffer_index = (uint8_t)idx;
  // Same increment (+1 or +3) as the one applied by the unlock call below
  curr_meta->last_local_write_ts.version =
      curr_meta->cctrl.ts.version + (!ENABLE_RMWs || RMW_flag == 1 ? 1 : 3);
  // update group membership mask
  bv_copy((bit_vector_t*)&curr_meta->ack_bv, curr_membership.w_ack_init);

  // Tie breaker of the write: the machine id or, with virtual node ids, a
  // randomly picked virtual id of this machine
  uint8_t v_node_id =
      (uint8_t)(!ENABLE_VIRTUAL_NODE_IDS
                    ? machine_id
                    : machine_id + machine_num * (hrd_fastrand(&g_seed) %
                                                  VIRTUAL_NODE_IDS_PER_NODE));
  curr_meta->last_local_write_ts.tie_breaker_id = v_node_id;

  // Release the lock; the resulting version is also reported through
  // op_ptr->op_meta.ts.version
  if (!ENABLE_RMWs || RMW_flag == 1)
    cctrl_unlock_inc_version(&curr_meta->cctrl, v_node_id,
                             (uint32_t*)&(op_ptr->op_meta.ts.version));
  else
    cctrl_unlock_inc_version_by_three(&curr_meta->cctrl, v_node_id,
                                      (uint32_t*)&(op_ptr->op_meta.ts.version));

  /// update op_ptr metadata
  op_ptr->RMW_flag = RMW_flag;
  op_ptr->op_meta.state = RMW_flag == 1 ? ST_RMW_SUCCESS : ST_PUT_SUCCESS;
  op_ptr->op_meta.ts.tie_breaker_id = v_node_id;
}

// Fills op_ptr from the locally stored state of a key: RMW flag, timestamp
// (stored version minus one, stored tie breaker), full-size value length and
// the value itself (located right after keys_meta). Marks the op as
// ST_REPLAY_SUCCESS.
static inline void
hermes_local_state_to_op(spacetime_op_t* op_ptr,
                         spacetime_object_meta* keys_meta)
{
  const uint8_t* stored_value = (const uint8_t*)(keys_meta + 1);

  /// op flags / state
  op_ptr->RMW_flag = keys_meta->RMW_flag;
  op_ptr->op_meta.state = ST_REPLAY_SUCCESS;

  /// timestamp (NOTE(review): the -1 presumably undoes the version increment
  /// done while the key is locked -- confirm against callers)
  op_ptr->op_meta.ts.tie_breaker_id = keys_meta->cctrl.ts.tie_breaker_id;
  op_ptr->op_meta.ts.version = keys_meta->cctrl.ts.version - 1;

  /// value
  op_ptr->op_meta.val_len = ST_VALUE_SIZE >> SHIFT_BITS;
  memcpy(op_ptr->value, stored_value, ST_VALUE_SIZE);
}

// Starts a write replay for a key: moves the key to REPLAY_STATE, records the
// op buffer index and the replayed timestamp in the key's metadata, re-arms
// the ack bit vector from curr_membership and loads the local state of the
// key into op_ptr.
static inline void
hermes_write_replay_actions(spacetime_op_t* op_ptr, uint8_t idx,
                            spacetime_object_meta* keys_meta,
                            spacetime_group_membership curr_membership)
{
  if (ENABLE_ASSERTIONS) {
    assert(idx < ST_OP_BUFFER_INDEX_EMPTY);
  }

  colored_printf(YELLOW, "Write replay for i: %d\n", idx);

  /// keys metadata: state and owning op
  keys_meta->op_buffer_index = (uint8_t)idx;
  keys_meta->state = REPLAY_STATE;

  /// keys metadata: timestamp of the replayed write
  keys_meta->last_local_write_ts.tie_breaker_id =
      keys_meta->cctrl.ts.tie_breaker_id;
  keys_meta->last_local_write_ts.version = keys_meta->cctrl.ts.version - 1;

  /// group membership mask for the acks of the replay
  bv_copy((bit_vector_t*)&keys_meta->ack_bv, curr_membership.w_ack_init);

  /// op_ptr metadata
  hermes_local_state_to_op(op_ptr, keys_meta);
}

// Decides what to do with an op that found its key invalidated:
//  - if the last writer of the key is still in the membership the op's state
//    is set to ST_GET_STALL
//  - otherwise, if no op currently owns the key (op_buffer_index is empty), a
//    write replay is started
//  - otherwise nothing is done
static inline void
hermes_check_membership_n_write_replay_actions(
    spacetime_op_t* op_ptr, uint8_t idx, spacetime_object_meta* keys_meta,
    spacetime_group_membership curr_membership)
{
  // Map a virtual node id back to the machine that owns it
  uint8_t writer_node = (uint8_t)keys_meta->last_writer_id;
  if (ENABLE_VIRTUAL_NODE_IDS)
    writer_node = (uint8_t)(keys_meta->last_writer_id % machine_num);

  if (node_is_in_membership(curr_membership, writer_node)) {
    op_ptr->op_meta.state = ST_GET_STALL;
    return;
  }

  /// stall replay: until all acks from last write arrive
  /// on multiple threads we can't complete writes / replays on VAL
  if (keys_meta->op_buffer_index != ST_OP_BUFFER_INDEX_EMPTY) return;

  hermes_write_replay_actions(op_ptr, idx, keys_meta, curr_membership);
}

// Write-coalescing bookkeeping for a stalled PUT: when the optimisation is
// enabled and the op has no version stored yet (version == 0), stores
// curr_ts_version in the op and marks it ST_IN_PROGRESS_PUT.
static inline void
hermes_marshal_write_coalesce_optimization(spacetime_op_t* op_ptr,
                                           uint16_t curr_ts_version)
{
  if (ENABLE_ASSERTIONS) {
    assert(op_ptr->op_meta.opcode == ST_OP_PUT);
  }

  if (!ENABLE_WRITE_COALESCE_TO_THE_SAME_KEY_IN_SAME_NODE) return;

  // Only the first stall of this op records the timestamp
  if (op_ptr->op_meta.ts.version != 0) return;

  op_ptr->op_meta.ts.version = curr_ts_version;
  op_ptr->op_meta.state = ST_IN_PROGRESS_PUT;
}

// Completes a stalled PUT that was coalesced with another local write: when
// the optimisation is enabled, the op is in ST_PUT_STALL, has a stored
// (non-zero) version and that version + 1 is smaller than curr_ts, the op is
// marked ST_PUT_COMPLETE.
static inline void
hermes_complete_coalesced_write(spacetime_op_t* op_ptr, uint16_t curr_ts)
{
  if (ENABLE_ASSERTIONS) {
    assert(op_ptr->op_meta.opcode == ST_OP_PUT);
  }

  if (!ENABLE_WRITE_COALESCE_TO_THE_SAME_KEY_IN_SAME_NODE) return;
  if (op_ptr->op_meta.state != ST_PUT_STALL) return;
  if (!(op_ptr->op_meta.ts.version > 0)) return;

  // The key advanced past the version seen when the op first stalled, which
  // means that the local write we coalesced with is completed
  if (op_ptr->op_meta.ts.version + 1 < curr_ts)
    op_ptr->op_meta.state = ST_PUT_COMPLETE;
}

// Hot-read optimisation for a stalled GET (only when enabled and the op is in
// ST_GET_STALL): on the first stall (op timestamp is all zero) the given
// timestamp is stored in the op; on later calls the read is completed once the
// stored version + 1 is smaller than ts.version.
static inline void
hermes_complete_hot_read_optimization(spacetime_op_t* op_ptr, timestamp_t ts)
{
  if (!ENABLE_READ_COMPLETE_AFTER_VAL_RECV_OF_HOT_REQS) return;
  if (op_ptr->op_meta.state != ST_GET_STALL) return;

  const int first_stall = op_ptr->op_meta.ts.version == 0 &&
                          op_ptr->op_meta.ts.tie_breaker_id == 0;
  if (first_stall) {
    // remember the timestamp observed when the read first stalled
    op_ptr->op_meta.ts.version = ts.version;
    op_ptr->op_meta.ts.tie_breaker_id = ts.tie_breaker_id;
    return;
  }

  // TODO we also need to get the value here
  if (op_ptr->op_meta.ts.version + 1 < ts.version)
    op_ptr->op_meta.state = ST_GET_COMPLETE;
}

// Completes a GET: copies ST_VALUE_SIZE bytes of the stored value into the op,
// sets the op's value length from the KVS entry and marks it ST_GET_COMPLETE.
static inline void
hermes_read_actions(spacetime_op_t* op_ptr, struct mica_op* kv_ptr,
                    uint8_t* kv_value_ptr)
{
  /// value
  memcpy(op_ptr->value, kv_value_ptr, ST_VALUE_SIZE);

  /// metadata
  op_ptr->op_meta.state = ST_GET_COMPLETE;
  op_ptr->op_meta.val_len = get_val_len(kv_ptr);
}

//////////// Exec op functions
// Executes a GET against the KVS entry kv_ptr.
// The key is first read lock-free: the read is repeated until
// cctrl_timestamp_is_same_and_valid() accepts the metadata snapshot taken
// before the read. A key in any state other than VALID / INVALID_WRITE / WRITE
// / REPLAY is instead handled under the key's lock, where an INVALID key may
// trigger a write replay. On return op_ptr->op_meta.state holds the outcome.
//   idx:             index of op_ptr in the op buffer (used for replays)
//   curr_membership: membership used to decide on / initialise a replay
static inline void
hermes_exec_read(spacetime_op_t* op_ptr, struct mica_op* kv_ptr, uint8_t idx,
                 spacetime_group_membership curr_membership)
{
  if (ENABLE_ASSERTIONS) assert(op_ptr->op_meta.opcode == ST_OP_GET);

  timestamp_t curr_ts;
  spacetime_object_meta prev_meta;
  spacetime_object_meta* keys_meta = (spacetime_object_meta*)kv_ptr->value;
  // The value is stored right after the key's metadata
  uint8_t* kv_value_ptr = (uint8_t*)&keys_meta[1];

  // Lock free reads through versioning (successful when version is even)
  uint8_t was_locked_read = 0;
  op_ptr->op_meta.state = ST_EMPTY;
  do {
    // Snapshot of the metadata, validated in the loop condition
    prev_meta = *keys_meta;
    curr_ts = keys_meta->cctrl.ts;
    // switch template with all states
    switch (keys_meta->state) {
      case VALID_STATE:
        hermes_read_actions(op_ptr, kv_ptr, kv_value_ptr);
        break;

      case INVALID_WRITE_STATE:
      case WRITE_STATE:
      case REPLAY_STATE:
        op_ptr->op_meta.state = ST_GET_STALL;
        break;

      default:
        // Remaining states are handled under the lock; the state is checked
        // again since it may have changed before the lock was acquired
        was_locked_read = 1;
        cctrl_lock(&keys_meta->cctrl);
        curr_ts = keys_meta->cctrl.ts;
        curr_ts.version -= 1;  // WARNING: when locking we do version++

        switch (keys_meta->state) {
          case VALID_STATE:
            hermes_read_actions(op_ptr, kv_ptr, kv_value_ptr);
            break;

          case INVALID_WRITE_STATE:
          case WRITE_STATE:
          case REPLAY_STATE:
            op_ptr->op_meta.state = ST_GET_STALL;
            break;

          case INVALID_STATE:
            hermes_check_membership_n_write_replay_actions(
                op_ptr, idx, keys_meta, curr_membership);
            break;

          default:
            assert(0);
        }
        cctrl_unlock_dec_version(&keys_meta->cctrl);
        break;
    }
    // Retry only lock-free reads whose snapshot failed validation; a read
    // performed under the lock is never retried
  } while (
      !cctrl_timestamp_is_same_and_valid(&prev_meta.cctrl, &keys_meta->cctrl) &&
      was_locked_read == 0);

  hermes_complete_hot_read_optimization(op_ptr, curr_ts);
}

// Executes a PUT against the KVS entry kv_ptr under the key's lock.
// The write is applied (ST_PUT_SUCCESS) only if the key is in VALID / INVALID
// state and no other op currently owns it; in every other case the lock is
// released without applying it and the op ends up in ST_PUT_STALL, unless the
// write-coalescing optimisation completes it (ST_PUT_COMPLETE).
//   idx:             index of op_ptr in the op buffer
//   curr_membership: membership used to initialise the key's ack bit vector
static inline void
hermes_exec_write(spacetime_op_t* op_ptr, struct mica_op* kv_ptr, uint8_t idx,
                  spacetime_group_membership curr_membership)
{
  if (ENABLE_ASSERTIONS) {
    assert(op_ptr->op_meta.opcode == ST_OP_PUT);
    assert(op_ptr->op_meta.val_len == (ST_VALUE_SIZE >> SHIFT_BITS));
  }

  spacetime_object_meta* keys_meta = (spacetime_object_meta*)kv_ptr->value;

  op_ptr->op_meta.state = ST_EMPTY;
  cctrl_lock(&keys_meta->cctrl);
  // Version of the key as read right after locking, minus one
  uint16_t curr_version = (uint16_t)(keys_meta->cctrl.ts.version - 1);
  switch (keys_meta->state) {
    case VALID_STATE:
    case INVALID_STATE:
      if (keys_meta->op_buffer_index != ST_OP_BUFFER_INDEX_EMPTY) {
        /// stall write: until all acks from last write arrive
        /// on multiple threads we can't complete writes / replays on VAL
        cctrl_unlock_dec_version(&keys_meta->cctrl);
        hermes_marshal_write_coalesce_optimization(op_ptr, curr_version);

      } else
        // Applies the write and releases the lock
        hermes_update_actions_n_unlock(op_ptr, kv_ptr, keys_meta, idx,
                                       curr_membership, 0);
      break;

    case INVALID_WRITE_STATE:
    case WRITE_STATE:
      hermes_marshal_write_coalesce_optimization(op_ptr, curr_version);
      // Intentional fall-through: the lock is released below
    case REPLAY_STATE:
      cctrl_unlock_dec_version(&keys_meta->cctrl);
      break;
    default:
      assert(0);
  }

  // Fill this deterministic stuff after releasing the lock
  if (op_ptr->op_meta.state != ST_PUT_SUCCESS)
    op_ptr->op_meta.state = ST_PUT_STALL;

  hermes_complete_coalesced_write(op_ptr, curr_version);
}

// Executes an RMW against the KVS entry kv_ptr.
//  - For an RMW that is already in progress (ST_IN_PROGRESS_RMW) it checks,
//    via a lock-free read of the metadata, whether the key's timestamp has
//    moved past the RMW's timestamp; if so the RMW is aborted (ST_RMW_ABORT)
//    and, when the RMW is the key's last local write, the key's op buffer
//    index is released under the lock.
//  - Otherwise the RMW is attempted under the key's lock: it is applied
//    (ST_RMW_SUCCESS) only on a VALID key that no other op owns; an INVALID
//    key may trigger a write replay (ST_REPLAY_SUCCESS); every other outcome
//    ends in ST_RMW_STALL.
//   idx:             index of op_ptr in the op buffer
//   curr_membership: membership used for the ack bit vector / replay decision
static inline void
hermes_exec_rmw(spacetime_op_t* op_ptr, struct mica_op* kv_ptr, uint8_t idx,
                spacetime_group_membership curr_membership)
{
  spacetime_object_meta* keys_meta = (spacetime_object_meta*)kv_ptr->value;

  if (ENABLE_ASSERTIONS) {
    assert(op_ptr->op_meta.opcode == ST_OP_RMW);
    assert(op_ptr->op_meta.state == ST_NEW ||
           op_ptr->op_meta.state == ST_RMW_STALL ||
           op_ptr->op_meta.state == ST_IN_PROGRESS_RMW);
    assert(op_ptr->op_meta.val_len == (ST_VALUE_SIZE >> SHIFT_BITS));
  }

  if (op_ptr->op_meta.state == ST_IN_PROGRESS_RMW) {
    // curr_meta aliases keys_meta (both point to the key's metadata)
    spacetime_object_meta* curr_meta = (spacetime_object_meta*)kv_ptr->value;
    spacetime_object_meta lock_free_meta;
    hermes_lock_free_read_obj_meta(&lock_free_meta, curr_meta);
    if (timestamp_is_smaller(op_ptr->op_meta.ts.version,
                             op_ptr->op_meta.ts.tie_breaker_id,
                             lock_free_meta.cctrl.ts.version,
                             lock_free_meta.cctrl.ts.tie_breaker_id)) {
      // Abort RMW --> we saw higher TS before gathering all of its acks
      op_ptr->op_meta.state = ST_RMW_ABORT;
      cctrl_lock(&keys_meta->cctrl);
      // Release the op buffer index only if this RMW is the last local write
      // recorded in the (lock-free) snapshot of the key's metadata
      if (timestamp_is_equal(
              op_ptr->op_meta.ts.version, op_ptr->op_meta.ts.tie_breaker_id,
              lock_free_meta.last_local_write_ts.version,
              lock_free_meta.last_local_write_ts.tie_breaker_id)) {
        if (ENABLE_ASSERTIONS) assert(idx == curr_meta->op_buffer_index);
        curr_meta->op_buffer_index = ST_OP_BUFFER_INDEX_EMPTY;
      }
      cctrl_unlock_dec_version(&keys_meta->cctrl);
    }
  } else {
    op_ptr->op_meta.state = ST_EMPTY;

    cctrl_lock(&keys_meta->cctrl);

    switch (keys_meta->state) {
      case VALID_STATE:
        if (keys_meta->op_buffer_index != ST_OP_BUFFER_INDEX_EMPTY)
          /// stall write: until all acks from last write arrive
          /// on multiple threads we can't complete writes / replays on VAL
          cctrl_unlock_dec_version(&keys_meta->cctrl);
        else
          // Applies the RMW and releases the lock
          hermes_update_actions_n_unlock(op_ptr, kv_ptr, keys_meta, idx,
                                         curr_membership, 1);
        break;

      case INVALID_STATE:
        hermes_check_membership_n_write_replay_actions(op_ptr, idx, keys_meta,
                                                       curr_membership);
        // Warning: Do not break
        // (intentional fall-through: the lock is released below)
      case INVALID_WRITE_STATE:
      case WRITE_STATE:
      case REPLAY_STATE:
        cctrl_unlock_dec_version(&keys_meta->cctrl);
        break;
      default:
        assert(0);
        break;
    }

    // Fill this deterministic stuff after releasing the lock
    if (op_ptr->op_meta.state != ST_RMW_SUCCESS &&
        op_ptr->op_meta.state != ST_REPLAY_SUCCESS)
      op_ptr->op_meta.state = ST_RMW_STALL;
  }
}

// Re-checks an in-flight update (PUT, RMW or replay) and completes it if
// is_last_ack() reports the object's ack bit-vector as complete under
// curr_membership. Dispatched for the local_ops_after_membership_change batch
// type.
//   op_ptr          : buffered local op; its state (and, when VALs have to be
//                     sent, its ts) is overwritten on completion
//   kv_ptr          : KVS entry; its value starts with the spacetime_object_meta
//   idx             : slot of op_ptr in the op buffer (asserted to match the
//                     object's op_buffer_index)
//   curr_membership : membership passed to is_last_ack()
static inline void
hermes_exec_check_update_completion(spacetime_op_t* op_ptr,
                                    struct mica_op* kv_ptr, uint8_t idx,
                                    spacetime_group_membership curr_membership)
{
  spacetime_object_meta lock_free_read_meta;
  spacetime_object_meta* curr_meta = (spacetime_object_meta*)kv_ptr->value;
  // Take an unlocked snapshot of the metadata for the cheap pre-check below
  hermes_lock_free_read_obj_meta(&lock_free_read_meta, curr_meta);

  if (ENABLE_ASSERTIONS) {
    // Only updates (or ops that are replaying) may reach this function
    assert(op_ptr->op_meta.opcode == ST_OP_PUT ||
           op_ptr->op_meta.opcode == ST_OP_RMW ||
           op_ptr->op_meta.state == ST_IN_PROGRESS_REPLAY);

    // The stored TS must not be smaller than the TS of the pending op
    assert(!timestamp_is_smaller(lock_free_read_meta.cctrl.ts.version,
                                 lock_free_read_meta.cctrl.ts.tie_breaker_id,
                                 op_ptr->op_meta.ts.version,
                                 op_ptr->op_meta.ts.tie_breaker_id));
  }

  if (is_last_ack(lock_free_read_meta.ack_bv,
                  curr_membership)) {  // if last local write completed
    // Re-validate under the lock: the snapshot above may be stale
    cctrl_lock(&curr_meta->cctrl);
    if (is_last_ack(curr_meta->ack_bv, curr_membership)) {
      if (ENABLE_ASSERTIONS) assert(curr_meta->op_buffer_index == idx);
      curr_meta->op_buffer_index =
          ST_OP_BUFFER_INDEX_EMPTY;  // reset the write buff index
      switch (curr_meta->state) {
        case INVALID_WRITE_STATE:
          curr_meta->state = INVALID_STATE;
          /// Warning break omitted intentionally
        case VALID_STATE:
        case INVALID_STATE:
          // Nothing to broadcast: just mark the op as completed
          op_ptr->op_meta.state = op_ptr->op_meta.opcode == ST_OP_PUT
                                      ? ST_PUT_COMPLETE
                                      : ST_RMW_COMPLETE;
          break;
        case WRITE_STATE:
        case REPLAY_STATE:
          // Stamp the op with the object's TS so that the VAL built from it
          // carries the right timestamp
          op_ptr->op_meta.ts.version =
              curr_meta->cctrl.ts.version -
              1;  // -1 because the seqlock does version + 1 while locked
          op_ptr->op_meta.ts.tie_breaker_id =
              curr_meta->cctrl.ts.tie_breaker_id;
          if (curr_meta->state == WRITE_STATE) {
            op_ptr->op_meta.state = op_ptr->op_meta.opcode == ST_OP_PUT
                                        ? ST_PUT_COMPLETE_SEND_VALS
                                        : ST_RMW_COMPLETE_SEND_VALS;
          } else {
            // REPLAY_STATE: the buffered op must be the one doing the replay
            if (ENABLE_ASSERTIONS)
              assert(op_ptr->op_meta.state == ST_IN_PROGRESS_REPLAY);
            op_ptr->op_meta.state = DISABLE_VALS_FOR_DEBUGGING == 1
                                        ? ST_GET_COMPLETE
                                        : ST_REPLAY_COMPLETE_SEND_VALS;
          }
          curr_meta->state = VALID_STATE;
          break;
        default:
          assert(0);
      }
    }
    // Released on both paths, i.e. also when the locked re-check failed
    cctrl_unlock_dec_version(&curr_meta->cctrl);
  }
}

//////////// Exec protocol action functions
// Applies a received invalidation (INV) to the local copy of a key.
// The outcome is reported through inv_ptr->op_meta.opcode, which on return is
// one of ST_INV_SUCCESS, ST_OP_INV_ABORT or ST_INV_OUT_OF_GROUP.
//   inv_ptr       : received INV (opcode ST_OP_INV or ST_OP_INV_ABORT); it is
//                   modified in place to carry the response
//   kv_ptr        : KVS entry; its value starts with the spacetime_object_meta,
//                   immediately followed by the object's value bytes
//   read_write_op : local op buffer; not referenced by the live code below
static inline void
hermes_exec_inv(spacetime_inv_t* inv_ptr, struct mica_op* kv_ptr,
                spacetime_op_t* read_write_op)
{
  if (ENABLE_ASSERTIONS)
    assert(inv_ptr->op_meta.opcode == ST_OP_INV ||
           inv_ptr->op_meta.opcode == ST_OP_INV_ABORT);

  spacetime_object_meta* curr_meta = (spacetime_object_meta*)kv_ptr->value;
  // The value bytes are stored right after the metadata header
  uint8_t* kv_value_ptr = (uint8_t*)&curr_meta[1];
  spacetime_object_meta lock_free_meta;
  // Unlocked snapshot used for the cheap pre-check
  hermes_lock_free_read_obj_meta(&lock_free_meta, curr_meta);

  // proceed iff remote.TS >= local.TS || inv is for an RMW to respond with an
  // INV-abort
  if (!timestamp_is_smaller(inv_ptr->op_meta.ts.version,
                            inv_ptr->op_meta.ts.tie_breaker_id,
                            lock_free_meta.cctrl.ts.version,
                            lock_free_meta.cctrl.ts.tie_breaker_id) ||
      (ENABLE_RMWs && inv_ptr->RMW_flag == 1)) {
    // Lock and check again if inv TS > local timestamp
    cctrl_lock(&curr_meta->cctrl);
    /// Warning: use curr_meta->ts.version - 1 below since the seqlock increases
    /// curr_meta->ts.version by 1 while the lock is held
    if (timestamp_is_smaller(
            curr_meta->cctrl.ts.version - 1, curr_meta->cctrl.ts.tie_breaker_id,
            inv_ptr->op_meta.ts.version, inv_ptr->op_meta.ts.tie_breaker_id)) {
      /// Case 1: the INV carries a strictly higher TS --> adopt it
      /// Update state
      switch (curr_meta->state) {
        case VALID_STATE:
          curr_meta->state = INVALID_STATE;
          // fallthrough: nothing more to do once the state is invalid
        case INVALID_STATE:
        case INVALID_WRITE_STATE:
          break;

        case WRITE_STATE:
        case REPLAY_STATE:
          // A local write / replay is pending on this key
          curr_meta->state = ENABLE_RMWs && curr_meta->RMW_flag == 1
                                 ? INVALID_STATE
                                 : INVALID_WRITE_STATE;
          break;
          // NOTE: a separate, disabled REPLAY_STATE path used to sit here as
          // commented-out code; it reset the buffered op to ST_NEW and cleared
          // op_buffer_index.
        default:
          assert(0);
      }

      if (ENABLE_ASSERTIONS)
        assert(inv_ptr->op_meta.val_len == (ST_VALUE_SIZE >> SHIFT_BITS));

      /// Update Value, TS, RMW_flag and last_writer_id
      kv_ptr->val_len = KVS_VALUE_SIZE;
      curr_meta->RMW_flag = inv_ptr->RMW_flag;
      curr_meta->last_writer_id = inv_ptr->op_meta.sender;
      memcpy(kv_value_ptr, inv_ptr->value, ST_VALUE_SIZE);

      // Unlock and install the TS of the INV in the same step
      cctrl_unlock_custom_version(&curr_meta->cctrl,
                                  inv_ptr->op_meta.ts.tie_breaker_id,
                                  inv_ptr->op_meta.ts.version);

    } else if (timestamp_is_equal(curr_meta->cctrl.ts.version - 1,
                                  curr_meta->cctrl.ts.tie_breaker_id,
                                  inv_ptr->op_meta.ts.version,
                                  inv_ptr->op_meta.ts.tie_breaker_id)) {
      /// Case 2: the INV carries exactly the local TS
      if (curr_meta->state == WRITE_STATE)
        inv_ptr->op_meta.opcode = ST_INV_OUT_OF_GROUP;

      curr_meta->last_writer_id = inv_ptr->op_meta.sender;
      cctrl_unlock_custom_version(&curr_meta->cctrl,
                                  inv_ptr->op_meta.ts.tie_breaker_id,
                                  inv_ptr->op_meta.ts.version);

    } else {  // TS is Smaller
      /// Case 3: respond with an inv-abort if it is an RMW
      if (ENABLE_RMWs && inv_ptr->RMW_flag == 1) {
        // hermes_local_state_to_op() rewrites inv_ptr from the local object,
        // so the original sender is saved and restored around the call
        uint8_t sender_id = inv_ptr->op_meta.sender;
        hermes_local_state_to_op(inv_ptr, curr_meta);
        inv_ptr->op_meta.sender = sender_id;
        inv_ptr->op_meta.opcode = ST_OP_INV_ABORT;
        colored_printf(RED, "Sending OP_INV_ABORT\n");
      }
      cctrl_unlock_dec_version(&curr_meta->cctrl);
    }
  }

  // Every path that did not pick a special response code is a plain success
  // (this includes INVs filtered out by the unlocked pre-check)
  if (inv_ptr->op_meta.opcode != ST_OP_INV_ABORT &&
      inv_ptr->op_meta.opcode != ST_INV_OUT_OF_GROUP)
    inv_ptr->op_meta.opcode = ST_INV_SUCCESS;

  if (ENABLE_ASSERTIONS)
    assert(inv_ptr->op_meta.opcode == ST_OP_INV_ABORT ||
           inv_ptr->op_meta.opcode == ST_INV_SUCCESS ||
           inv_ptr->op_meta.opcode == ST_INV_OUT_OF_GROUP);
}

// Applies a received acknowledgment (ACK) to the key it refers to.
// The ACK only counts if its TS equals the object's last_local_write_ts and an
// op is still registered for the object. Once is_last_ack() reports the ack
// bit-vector as complete, the pending local op is completed.
// On return ack_ptr->opcode is ST_LAST_ACK_SUCCESS (the object moved to
// VALID_STATE from WRITE_STATE / REPLAY_STATE) or ST_ACK_SUCCESS (every other
// case).
//   ack_ptr         : received ACK; its opcode is overwritten with the result
//   kv_ptr          : KVS entry; its value starts with the spacetime_object_meta
//   curr_membership : membership passed to is_last_ack()
//   read_write_op   : local op buffer, indexed by the object's op_buffer_index
static inline void
hermes_exec_ack(spacetime_ack_t* ack_ptr, struct mica_op* kv_ptr,
                spacetime_group_membership curr_membership,
                spacetime_op_t* read_write_op)
{
  // Buffer slot of the op completed by this ACK (if any)
  int op_buff_indx = ST_OP_BUFFER_INDEX_EMPTY;
  spacetime_object_meta lock_free_read_meta;
  spacetime_object_meta* curr_meta = (spacetime_object_meta*)kv_ptr->value;
  // Unlocked snapshot used for the cheap pre-check
  hermes_lock_free_read_obj_meta(&lock_free_read_meta, curr_meta);

  if (ENABLE_ASSERTIONS)
    // The stored TS must not be smaller than the acked TS
    assert(!timestamp_is_smaller(lock_free_read_meta.cctrl.ts.version,
                                 lock_free_read_meta.cctrl.ts.tie_breaker_id,
                                 ack_ptr->ts.version,
                                 ack_ptr->ts.tie_breaker_id));

  if (timestamp_is_equal(
          ack_ptr->ts.version, ack_ptr->ts.tie_breaker_id,
          lock_free_read_meta.last_local_write_ts.version,
          lock_free_read_meta.last_local_write_ts.tie_breaker_id)) {
    /// Lock and check again if ack TS == last local write
    cctrl_lock(&curr_meta->cctrl);
    if (curr_meta->op_buffer_index != ST_OP_BUFFER_INDEX_EMPTY &&
        timestamp_is_equal(ack_ptr->ts.version, ack_ptr->ts.tie_breaker_id,
                           curr_meta->last_local_write_ts.version,
                           curr_meta->last_local_write_ts.tie_breaker_id)) {
      // Record the ACK of this sender
      bv_bit_set((bit_vector_t*)&curr_meta->ack_bv, ack_ptr->sender);
      if (is_last_ack(curr_meta->ack_bv,
                      curr_membership)) {  // if last local write completed
        // Remember the slot before it gets cleared in the switch below
        op_buff_indx = curr_meta->op_buffer_index;
        switch (curr_meta->state) {
          case VALID_STATE:
          case INVALID_STATE:
            ack_ptr->opcode = ST_LAST_ACK_NO_BCAST_SUCCESS;
            curr_meta->op_buffer_index =
                ST_OP_BUFFER_INDEX_EMPTY;  // reset the write buff index
            break;
          case INVALID_WRITE_STATE:
            curr_meta->state = INVALID_STATE;
            ack_ptr->opcode = ST_LAST_ACK_NO_BCAST_SUCCESS;
            curr_meta->op_buffer_index =
                ST_OP_BUFFER_INDEX_EMPTY;  // reset the write buff index
            break;
          case WRITE_STATE:
          case REPLAY_STATE:
            curr_meta->state = VALID_STATE;
            ack_ptr->opcode = ST_LAST_ACK_SUCCESS;
            curr_meta->op_buffer_index =
                ST_OP_BUFFER_INDEX_EMPTY;  // reset the write buff index
            break;
          default:
            assert(0);
        }
      }
    }
    // Released on every path taken after the lock was acquired
    cctrl_unlock_dec_version(&curr_meta->cctrl);
  }

  if (ack_ptr->opcode == ST_LAST_ACK_SUCCESS ||
      ack_ptr->opcode == ST_LAST_ACK_NO_BCAST_SUCCESS) {
    /// completed read / write --> remove it from the ops buffer
    if (ENABLE_ASSERTIONS) {
      assert(op_buff_indx != ST_OP_BUFFER_INDEX_EMPTY);
      assert(read_write_op[op_buff_indx].op_meta.state == ST_IN_PROGRESS_PUT ||
             read_write_op[op_buff_indx].op_meta.state == ST_IN_PROGRESS_RMW ||
             read_write_op[op_buff_indx].op_meta.state ==
                 ST_OP_MEMBERSHIP_CHANGE ||
             read_write_op[op_buff_indx].op_meta.state ==
                 ST_IN_PROGRESS_REPLAY);
      // The buffered op and the ACK must refer to the same key (first 8 bytes)
      assert(((uint64_t*)&read_write_op[op_buff_indx].op_meta.key)[0] ==
             ((uint64_t*)&ack_ptr->key)[0]);
    }
    switch (read_write_op[op_buff_indx].op_meta.opcode) {
      case ST_OP_GET:
        // The buffered GET goes back to ST_NEW
        read_write_op[op_buff_indx].op_meta.state = ST_NEW;
        break;
      case ST_OP_PUT:
        read_write_op[op_buff_indx].op_meta.state = ST_PUT_COMPLETE;
        break;
      case ST_OP_RMW:
        read_write_op[op_buff_indx].op_meta.state = ST_RMW_COMPLETE;
        // TODO add an OP to differentiate between RMW-replay and RMW complete
        break;
      default:
        assert(0);
    }
  }

  // Everything except ST_LAST_ACK_SUCCESS (including
  // ST_LAST_ACK_NO_BCAST_SUCCESS) is reported as a plain ST_ACK_SUCCESS
  if (ack_ptr->opcode != ST_LAST_ACK_SUCCESS) ack_ptr->opcode = ST_ACK_SUCCESS;
}

// Applies a received validation (VAL): if the VAL carries exactly the TS that
// is currently stored for the key, the object moves to VALID_STATE.
// val_ptr->opcode is always set to ST_VAL_SUCCESS before returning.
static inline void
hermes_exec_val(spacetime_val_t* val_ptr, struct mica_op* kv_ptr)
{
  spacetime_object_meta* obj_meta = (spacetime_object_meta*)kv_ptr->value;
  spacetime_object_meta meta_snapshot;
  hermes_lock_free_read_obj_meta(&meta_snapshot, obj_meta);

  /// Unlocked pre-check: only bother locking when remote.TS == local.TS
  if (timestamp_is_equal(meta_snapshot.cctrl.ts.version,
                         meta_snapshot.cctrl.ts.tie_breaker_id,
                         val_ptr->ts.version, val_ptr->ts.tie_breaker_id)) {
    cctrl_lock(&obj_meta->cctrl);

    /// Warning: the stored version is compared as (version - 1) because taking
    /// the lock increases it by 1
    const int ts_still_matches = timestamp_is_equal(
        obj_meta->cctrl.ts.version - 1, obj_meta->cctrl.ts.tie_breaker_id,
        val_ptr->ts.version, val_ptr->ts.tie_breaker_id);

    if (ts_still_matches) {
      /// WARNING: WRITE_STATE should not be seen here unless this node was
      /// removed from the group
      if (ENABLE_ASSERTIONS) assert(obj_meta->state != WRITE_STATE);
      obj_meta->state = VALID_STATE;
    }

    cctrl_unlock_dec_version(&obj_meta->cctrl);
  }

  val_ptr->opcode = ST_VAL_SUCCESS;
}

//////////// Skip functions
// Skip predicate for the local_ops batch type.
// Returns 1 when the op-buffer entry is in one of the states listed below
// (and must therefore not be looked up / executed), 0 otherwise.
static inline uint8_t
hermes_skip_op(spacetime_op_t* op_ptr)
{
  // NOTE: ST_IN_PROGRESS_RMW is not part of this set (its check is disabled),
  // so entries in that state are not skipped.
  if (op_ptr->op_meta.state == ST_PUT_SUCCESS) return (uint8_t)1;
  if (op_ptr->op_meta.state == ST_RMW_SUCCESS) return (uint8_t)1;
  if (op_ptr->op_meta.state == ST_REPLAY_SUCCESS) return (uint8_t)1;
  if (op_ptr->op_meta.state == ST_IN_PROGRESS_PUT) return (uint8_t)1;
  if (op_ptr->op_meta.state == ST_IN_PROGRESS_REPLAY) return (uint8_t)1;
  if (op_ptr->op_meta.state == ST_OP_MEMBERSHIP_CHANGE) return (uint8_t)1;
  if (op_ptr->op_meta.state == ST_PUT_COMPLETE_SEND_VALS) return (uint8_t)1;

  return (uint8_t)0;
}

// Skip predicate for the local_ops_after_membership_change batch type.
// Only entries that are still in progress (PUT, RMW or replay) are processed;
// returns 0 for those and 1 (skip) for every other state.
static inline uint8_t
hermes_skip_op_after_membship_change(spacetime_op_t* op_ptr)
{
  const int still_in_progress =
      op_ptr->op_meta.state == ST_IN_PROGRESS_PUT ||
      op_ptr->op_meta.state == ST_IN_PROGRESS_RMW ||
      op_ptr->op_meta.state == ST_IN_PROGRESS_REPLAY;

  if (still_in_progress) return (uint8_t)0;
  return (uint8_t)1;
}

// Skip predicate for received INVs.
// An element whose opcode is ST_OP_MEMBERSHIP_CHANGE is not a real
// invalidation: the first byte of its value is stored in *node_suspected, a
// message is printed and 1 (skip) is returned. Any other opcode returns 0.
static inline uint8_t
hermes_skip_inv(spacetime_inv_t* inv_ptr, int* node_suspected)
{
  if (inv_ptr->op_meta.opcode != ST_OP_MEMBERSHIP_CHANGE) return 0;

  // TODO we need to do this only on the first skip
  *node_suspected = inv_ptr->value[0];
  printf("RECEIVED NODE SUSPICION: %d\n", *node_suspected);
  return 1;
}

// Skip predicate for received ACKs: returns 1 when the element's state is
// ST_OP_MEMBERSHIP_CHANGE, 0 otherwise.
static inline uint8_t
hermes_skip_ack(spacetime_ack_t* ack_ptr)
{
  if (ack_ptr->state == ST_OP_MEMBERSHIP_CHANGE) return (uint8_t)1;
  return (uint8_t)0;
}

//////////// Dispatcher functions

// Routes a batch element to the skip predicate of its batch type.
//   type           : kind of batch being processed
//   ptr            : the batch element (its real type depends on `type`)
//   node_suspected : out-parameter, only used by the invs predicate
// Returns 1 if the element must be skipped, 0 if it has to be executed.
// VALs are never skipped.
static inline uint8_t
hermes_skip_dispatcher(enum hermes_batch_type_t type, void* ptr,
                       int* node_suspected)
{
  switch (type) {
    case local_ops:
      return hermes_skip_op(ptr);
    case local_ops_after_membership_change:
      return hermes_skip_op_after_membship_change(ptr);
    case invs:
      return hermes_skip_inv(ptr, node_suspected);
    case acks:
      return hermes_skip_ack(ptr);
    case vals:
      return 0;
    default:
      assert(0);
      // With assertions compiled out (NDEBUG) control used to fall off the end
      // of this non-void function. Skip elements of an unknown batch type
      // instead of returning an indeterminate value.
      return 1;
  }
}

// Runs the per-element sanity checks that match the batch type, before the
// element is looked up in the KVS. Does nothing when ENABLE_ASSERTIONS is off
// and nothing for the two local-op batch types.
static inline void
hermes_assertions_begin_dispatcher(enum hermes_batch_type_t type, void* ptr)
{
  if (!(ENABLE_ASSERTIONS)) return;

  switch (type) {
    case local_ops:
    case local_ops_after_membership_change:
      return;

    case invs:
      hermes_assertions_begin_inv(ptr);
      return;

    case vals:
      hermes_assertions_begin_val(ptr);
      return;

    case acks: {
      spacetime_ack_t* ack_ptr = ptr;

      // Without RMWs every element of an ack batch is a plain ACK; with RMWs
      // the batch may also carry INV-aborts, which are checked as INVs.
      if (ENABLE_RMWs == 0 || ack_ptr->opcode == ST_OP_ACK) {
        hermes_assertions_begin_ack(ptr);
      } else if (ack_ptr->opcode == ST_OP_INV_ABORT) {
        printf("RECVED: inv abort\n");
        hermes_assertions_begin_inv(ptr);
      } else {
        printf("RECVED: %s\n", code_to_str(ack_ptr->opcode));
        assert(0);
      }
      return;
    }

    default:
      assert(0);
  }
}

// Optional debug trace announcing the batch that is about to be executed.
// Printing needs ENABLE_BATCH_OP_PRINTS plus the per-type switch
// (ENABLE_INV_PRINTS / ENABLE_ACK_PRINTS / ENABLE_VAL_PRINTS) and is limited
// to threads with thread_id < MAX_THREADS_TO_PRINT. Local-op batches are never
// printed.
//   type      : kind of batch
//   op_num    : number of elements in the batch
//   thread_id : id of the calling worker, shown as "[W<id>]"
static inline void
hermes_print_dispatcher(enum hermes_batch_type_t type, int op_num,
                        uint8_t thread_id)
{
  if (ENABLE_BATCH_OP_PRINTS) {
    switch (type) {
      case local_ops:
      case local_ops_after_membership_change:
        break;
      case invs:
        // Fixed: the format used to be "[W] ... %d" with two arguments, so the
        // thread id was printed as the op count and op_num was dropped.
        if (ENABLE_INV_PRINTS && thread_id < MAX_THREADS_TO_PRINT)
          colored_printf(RED, "[W%d] Batch INVs (op num: %d)!\n", thread_id,
                         op_num);
        break;
      case acks:
        if (ENABLE_ACK_PRINTS && thread_id < MAX_THREADS_TO_PRINT)
          colored_printf(RED, "[W%d] Batch ACKs (op num: %d)!\n", thread_id,
                         op_num);
        break;
      case vals:
        if (ENABLE_VAL_PRINTS && thread_id < MAX_THREADS_TO_PRINT)
          colored_printf(RED, "[W%d] Batch VALs (op num: %d)!\n", thread_id,
                         op_num);
        break;
      default:
        assert(0);
    }
  }
}

// Runs the sanity checks that follow the execution of a whole batch.
// Only ack batches have such a check (on the local op buffer); every other
// known batch type is a no-op. Does nothing when ENABLE_ASSERTIONS is off.
static inline void
hermes_assertions_end_dispatcher(enum hermes_batch_type_t type,
                                 spacetime_op_t* read_write_ops)
{
  if (!(ENABLE_ASSERTIONS)) return;

  switch (type) {
    case acks:
      hermes_assertions_end_read_write_ops(read_write_ops);
      break;

    case local_ops:
    case local_ops_after_membership_change:
    case invs:
    case vals:
      break;

    default:
      assert(0);
  }
}

// Executes one batch element against the KVS entry it hit, using the handler
// that matches the batch type (and, for local ops and acks, the element's
// opcode).
//   type            : kind of batch being processed
//   op_ptr          : the batch element (its real type depends on `type`)
//   kv_ptr          : KVS entry that matched the element's key
//   curr_membership : membership handed to the handlers that need it
//   idx             : position of the element in the batch
//   read_write_op   : local op buffer, forwarded to the inv / ack handlers
static inline void
hermes_exec_dispatcher(enum hermes_batch_type_t type, void* op_ptr,
                       struct mica_op* kv_ptr,
                       spacetime_group_membership curr_membership, uint8_t idx,
                       spacetime_op_t* read_write_op)
{
  switch (type) {
    case local_ops: {
      spacetime_op_t* local_op = (spacetime_op_t*)op_ptr;

      if (local_op->op_meta.opcode == ST_OP_GET) {
        hermes_exec_read(op_ptr, kv_ptr, idx, curr_membership);
      } else if (local_op->op_meta.opcode == ST_OP_PUT) {
        hermes_exec_write(op_ptr, kv_ptr, idx, curr_membership);
      } else if (ENABLE_RMWs && local_op->op_meta.opcode == ST_OP_RMW) {
        hermes_exec_rmw(op_ptr, kv_ptr, idx, curr_membership);
      } else {
        // Unknown opcode (or an RMW while RMWs are disabled)
        printf("Ops[%d]: %s\n", idx, code_to_str(local_op->op_meta.opcode));
        assert(0);
      }
      break;
    }

    case local_ops_after_membership_change: {
      spacetime_op_t* pending_op = (spacetime_op_t*)op_ptr;

      // Only updates and replaying ops are expected here
      if (pending_op->op_meta.opcode != ST_OP_PUT &&
          pending_op->op_meta.opcode != ST_OP_RMW &&
          pending_op->op_meta.state != ST_IN_PROGRESS_REPLAY) {
        assert(0);
        break;
      }
      hermes_exec_check_update_completion(op_ptr, kv_ptr, idx,
                                          curr_membership);
      break;
    }

    case invs:
      hermes_exec_inv(op_ptr, kv_ptr, read_write_op);
      break;

    case acks: {
      spacetime_ack_t* ack_ptr = op_ptr;

      // Without RMWs everything in an ack batch is handled as an ACK
      if (ENABLE_RMWs == 0 || ack_ptr->opcode == ST_OP_ACK) {
        hermes_exec_ack(op_ptr, kv_ptr, curr_membership, read_write_op);
      } else if (ack_ptr->opcode == ST_OP_INV_ABORT) {
        /// TODO RMW debugging
        printf("RECVED: inv abort\n");
        hermes_exec_inv(op_ptr, kv_ptr, read_write_op);
        ack_ptr->opcode = ST_ACK_SUCCESS;
      } else {
        assert(0);
      }
      break;
    }

    case vals:
      hermes_exec_val(op_ptr, kv_ptr);
      break;

    default:
      assert(0);
  }
}

//////////////////////////////////////////////
//////////// Main HermesKV function
//////////////////////////////////////////////

// Looks up every element of a batch in the KVS hash table and executes it
// through the handler that matches the batch type.
// The work is split into three passes over the batch so that the prefetches
// issued in one pass are consumed in the next one:
//   1) compute the bucket of each key and prefetch it
//   2) scan the bucket's slots for a matching tag and prefetch the log entry
//   3) compare the full key and, on a hit, dispatch to the exec handler;
//      on a miss the element's state is set to ST_MISS
// Elements for which hermes_skip_dispatcher() returns non-zero are left
// untouched in all three passes.
//   type            : kind of batch (local ops, invs, acks, vals, ...)
//   op_array        : raw array holding the batch elements
//   op_num          : number of elements in op_array
//   sizeof_op_elem  : stride in bytes between consecutive elements
//   curr_membership : membership forwarded to the exec handlers
//   node_suspected  : out-parameter of the invs skip predicate (asserted
//                     non-NULL for invs)
//   read_write_ops  : local op buffer (asserted non-NULL for acks)
//   thread_id       : id of the calling worker, used for debug prints only
void
hermes_batch_ops_to_KVS(enum hermes_batch_type_t type, uint8_t* op_array,
                        int op_num, uint16_t sizeof_op_elem,
                        spacetime_group_membership curr_membership,
                        int* node_suspected, spacetime_op_t* read_write_ops,
                        uint8_t thread_id)
{
  // NOTE(review): this debug block references `resp`, which is not declared in
  // this function, so it would not compile with SPACETIME_DEBUG == 1.
#if SPACETIME_DEBUG == 1
  // assert(kv.hash_table != NULL);
  assert(op_array != NULL);
  assert(op_num > 0 && op_num <= CACHE_BATCH_SIZE);
  assert(resp != NULL);
#endif

  // NOTE(review): `I` is not declared at this point and op_array is a plain
  // uint8_t*, so this block would not compile with SPACETIME_DEBUG == 2.
#if SPACETIME_DEBUG == 2
  for (I = 0; I < op_num; I++)
    mica_print_op(&(*op_array)[I]);
#endif
  // Per-element lookup state, indexed by the element's position in the batch
  int key_in_store[HERMES_MAX_BATCH_SIZE];  // Is this key in the datastore?
  unsigned int tag[HERMES_MAX_BATCH_SIZE];
  uint64_t bkt[HERMES_MAX_BATCH_SIZE];
  struct mica_bkt* bkt_ptr[HERMES_MAX_BATCH_SIZE];
  struct mica_op* kv_ptr[HERMES_MAX_BATCH_SIZE];  // Ptr to KV item in log

  if (ENABLE_ASSERTIONS) {
    assert(op_num <= HERMES_MAX_BATCH_SIZE);
    assert(read_write_ops != NULL || type != acks);
    assert(node_suspected != NULL || type != invs);
  }

  hermes_print_dispatcher(type, op_num, thread_id);
  // We first lookup the key in the datastore.
  // The first two @I loops work for both GETs and PUTs.
  // Pass 1: bucket computation + prefetch, and reset of the per-element state
  for (int I = 0; I < op_num; I++) {
    // Every element type is accessed through its leading op-meta header
    spacetime_op_meta_t* op_ptr =
        (spacetime_op_meta_t*)&op_array[sizeof_op_elem * I];
    hermes_assertions_begin_dispatcher(type, op_ptr);
    // NOTE: the skip predicate is re-evaluated in each of the three passes;
    // for invs it prints and writes *node_suspected every time it matches
    if (hermes_skip_dispatcher(type, op_ptr, node_suspected)) continue;

    bkt[I] = op_ptr->key.bkt & kv.hash_table.bkt_mask;
    bkt_ptr[I] = &kv.hash_table.ht_index[bkt[I]];
    __builtin_prefetch(bkt_ptr[I], 0, 0);
    tag[I] = op_ptr->key.tag;

    key_in_store[I] = 0;
    kv_ptr[I] = NULL;
  }

  // Pass 2: find the slot with a matching tag and prefetch its log entry
  for (int I = 0; I < op_num; I++) {
    spacetime_op_meta_t* op_ptr =
        (spacetime_op_meta_t*)&op_array[sizeof_op_elem * I];
    if (hermes_skip_dispatcher(type, op_ptr, node_suspected)) continue;
    // Scan the 8 slots of the bucket; the first in-use slot with a matching
    // tag wins
    for (int j = 0; j < 8; j++) {
      if (bkt_ptr[I]->slots[j].in_use == 1 &&
          bkt_ptr[I]->slots[j].tag == tag[I]) {
        uint64_t log_offset =
            bkt_ptr[I]->slots[j].offset & kv.hash_table.log_mask;
        // We can interpret the log entry as mica_op, even though it
        // may not contain the full MICA_MAX_VALUE value.
        kv_ptr[I] = (struct mica_op*)&kv.hash_table.ht_log[log_offset];

        // Small values (1--64 bytes) can span 2 cache lines
        __builtin_prefetch(kv_ptr[I], 0, 0);
        __builtin_prefetch((uint8_t*)kv_ptr[I] + 64, 0, 0);

        // Detect if the head has wrapped around for this index entry
        if (kv.hash_table.log_head - bkt_ptr[I]->slots[j].offset >=
            kv.hash_table.log_cap)
          kv_ptr[I] = NULL;  // If so, we mark it "not found"

        break;
      }
    }
  }

  // Pass 3: full key comparison and execution
  for (int I = 0; I < op_num; I++) {
    spacetime_op_meta_t* op_ptr =
        (spacetime_op_meta_t*)&op_array[sizeof_op_elem * I];
    if (hermes_skip_dispatcher(type, op_ptr, node_suspected)) continue;
    if (kv_ptr[I] != NULL) {
      // We had a tag match earlier. Now compare log entry.
      long long* key_ptr_log = (long long*)kv_ptr[I];
      long long* key_ptr_req = (long long*)&op_ptr->key;

      // The second 8-byte word of the log entry is compared against the first
      // 8-byte word of the requested key
      if (key_ptr_log[1] == key_ptr_req[0]) {  // Key Found 8 Byte keys
        key_in_store[I] = 1;
        hermes_exec_dispatcher(type, op_ptr, kv_ptr[I], curr_membership,
                               (uint8_t)I, read_write_ops);
      }
    }

    if (key_in_store[I] ==
        0)  // KVS miss --> We get here if either tag or log key match failed
      op_ptr->state = ST_MISS;
  }

  hermes_assertions_end_dispatcher(type, read_write_ops);
}


================================================
FILE: src/hermes/hermes_worker.c
================================================
#include <spacetime.h>
#include <time.h>
#include "../../include/utils/concur_ctrl.h"
#include "inline-util.h"
#include "util.h"

///
#include "../../include/hades/hades.h"
#include "../../include/wings/wings.h"
///

// Decides whether an op-buffer entry has to trigger an INV broadcast.
// Returns -1 (skip) unless the entry's state is ST_PUT_SUCCESS,
// ST_RMW_SUCCESS, ST_REPLAY_SUCCESS or ST_OP_MEMBERSHIP_CHANGE; for those it
// returns 0.
int
inv_skip_or_get_sender_id(uint8_t* req)
{
  spacetime_op_t* op_req = (spacetime_op_t*)req;

  if (ENABLE_ASSERTIONS) {
    assert(is_response_code(op_req->op_meta.state) ||
           is_bucket_state_code(op_req->op_meta.state));
    assert(is_input_code(op_req->op_meta.opcode));
  }

  switch (op_req->op_meta.state) {
    case ST_PUT_SUCCESS:
    case ST_RMW_SUCCESS:
    case ST_REPLAY_SUCCESS:
    case ST_OP_MEMBERSHIP_CHANGE:
      // since inv is a bcast we can return any int other than -1
      return 0;
    default:
      return -1;
  }
}

// Advances the state of an op-buffer entry once its INV has been sent:
//   ST_PUT_SUCCESS          -> ST_IN_PROGRESS_PUT
//   ST_RMW_SUCCESS          -> ST_IN_PROGRESS_RMW
//   ST_REPLAY_SUCCESS       -> ST_IN_PROGRESS_REPLAY
//   ST_OP_MEMBERSHIP_CHANGE -> ST_OP_MEMBERSHIP_COMPLETE
// Any other state trips an assertion.
void
inv_modify_elem_after_send(uint8_t* req)
{
  spacetime_op_t* sent_op = (spacetime_op_t*)req;

  if (sent_op->op_meta.state == ST_PUT_SUCCESS) {
    sent_op->op_meta.state = ST_IN_PROGRESS_PUT;
  } else if (sent_op->op_meta.state == ST_RMW_SUCCESS) {
    sent_op->op_meta.state = ST_IN_PROGRESS_RMW;
  } else if (sent_op->op_meta.state == ST_REPLAY_SUCCESS) {
    sent_op->op_meta.state = ST_IN_PROGRESS_REPLAY;
  } else if (sent_op->op_meta.state == ST_OP_MEMBERSHIP_CHANGE) {
    sent_op->op_meta.state = ST_OP_MEMBERSHIP_COMPLETE;
  } else {
    assert(0);
  }
}

// Builds the INV to send for a local op: the first sizeof(spacetime_inv_t)
// bytes of the triggering op are copied into the outgoing message, which is
// then stamped with opcode ST_OP_INV and this machine's id as sender.
void
inv_copy_and_modify_elem(uint8_t* msg_to_send, uint8_t* triggering_req)
{
  spacetime_inv_t* inv_to_send = (spacetime_inv_t*)msg_to_send;

  memcpy(msg_to_send, triggering_req, sizeof(spacetime_inv_t));

  // TODO change to include membership change, i.e. send
  // ST_OP_MEMBERSHIP_CHANGE instead of ST_OP_INV when the triggering op's
  // state is ST_OP_MEMBERSHIP_CHANGE
  inv_to_send->op_meta.opcode = ST_OP_INV;
  inv_to_send->op_meta.sender = (uint8_t)machine_id;
}

// Decides whether a processed INV needs a response and to whom it goes.
// Returns -1 (skip) for an ST_EMPTY slot. Otherwise returns the value built by
// wings_set_sender_id_n_msg_type() from the INV's sender and a flag telling
// whether the response is the small message (set for ST_INV_SUCCESS, cleared
// for ST_OP_INV_ABORT).
int
ack_skip_or_get_sender_id(uint8_t* req)
{
  spacetime_inv_t* inv_req = (spacetime_inv_t*)req;

  if (ENABLE_ASSERTIONS)
    assert(inv_req->op_meta.opcode == ST_INV_SUCCESS ||
           inv_req->op_meta.opcode == ST_OP_INV_ABORT ||
           inv_req->op_meta.opcode == ST_EMPTY);

  if (inv_req->op_meta.opcode == ST_EMPTY) return -1;

  uint8_t is_small_msg = 0;
  if (inv_req->op_meta.opcode == ST_INV_SUCCESS) is_small_msg = 1;

  return wings_set_sender_id_n_msg_type(inv_req->op_meta.sender, is_small_msg);
}

// Frees an INV slot after its response has been sent by marking it ST_EMPTY.
// The slot is expected to hold ST_INV_SUCCESS, ST_OP_INV_ABORT or
// ST_OP_MEMBERSHIP_CHANGE; anything else trips an assertion and is left as is.
void
ack_modify_elem_after_send(uint8_t* req)
{
  spacetime_inv_t* inv_req = (spacetime_inv_t*)req;

  const int is_answered_inv =
      inv_req->op_meta.opcode == ST_INV_SUCCESS ||
      inv_req->op_meta.opcode == ST_OP_INV_ABORT ||
      inv_req->op_meta.opcode == ST_OP_MEMBERSHIP_CHANGE;

  if (!is_answered_inv) {
    assert(0);
    return;
  }

  // empty inv buffer
  inv_req->op_meta.opcode = ST_EMPTY;
}

void
ack_copy_and_modify_elem(uint8_t* msg_to_send, uint8_t* triggering_req)
{
  spacetime_inv_t* inv_req = (spacetime_inv_t*)triggering_req;
  spacetime_ack_t* ack_to_send = (spacetime_ack_t*)msg_to_send;
  spacetime_inv_t* inv_to_send = (spacetime_inv_t*)msg_to_send;
  switch (inv_req->op_meta.opcode) {
    case ST_INV_SUCCESS:
      memcpy(ack_to_send, triggering_req,
             sizeof(spacetime_ack_t));  // copy req to next_req_ptr
      ack_to_send->sender = (uint8_t)machine_id;
      ack_to_send->opcode = ST_OP_ACK;
      break;
    case ST_OP_INV_ABORT:
      memcpy(inv_to_send, triggering_req, sizeof(spacetime_inv_t));
      inv_to_send->op_meta.sender = (uint8_t)machine_id;
      inv_to_send->op_meta.opcode = ST_OP_INV_ABORT;
      break;
    default:
      assert(0);
  }
}

// Decides whether a processed ACK has to trigger a VAL.
// Slots holding ST_ACK_SUCCESS or ST_OP_MEMBERSHIP_CHANGE are released (set to
// ST_EMPTY) and skipped, ST_EMPTY slots are skipped as well; both return -1.
// What remains is expected to be ST_LAST_ACK_SUCCESS, for which the ACK's
// sender is returned.
int
val_skip_or_get_sender_id(uint8_t* req)
{
  spacetime_ack_t* ack_req = (spacetime_ack_t*)req;

  if (ack_req->opcode == ST_EMPTY) return -1;

  if (ack_req->opcode == ST_ACK_SUCCESS ||
      ack_req->opcode == ST_OP_MEMBERSHIP_CHANGE) {
    // Nothing to send for this ACK: free its slot right away
    ack_req->opcode = ST_EMPTY;
    return -1;
  }

  if (ENABLE_ASSERTIONS) assert(ack_req->opcode == ST_LAST_ACK_SUCCESS);

  return ack_req->sender;
}

// Frees the ACK slot whose VAL has just been sent by marking it ST_EMPTY.
// The slot is asserted to hold ST_LAST_ACK_SUCCESS.
void
val_modify_elem_after_send(uint8_t* req)
{
  spacetime_ack_t* handled_ack = (spacetime_ack_t*)req;

  if (ENABLE_ASSERTIONS) {
    assert(handled_ack->opcode == ST_LAST_ACK_SUCCESS);
  }

  handled_ack->opcode = ST_EMPTY;
}

// Builds the VAL that follows the last ACK of a write: sizeof(spacetime_val_t)
// bytes of the triggering ACK are copied into the outgoing message, which is
// then stamped with this machine's id as sender and opcode ST_OP_VAL.
void
val_copy_and_modify_elem(uint8_t* msg_to_send, uint8_t* triggering_req)
{
  spacetime_val_t* val_to_send = (spacetime_val_t*)msg_to_send;

  // copy req to next_req_ptr
  memcpy(msg_to_send, triggering_req, sizeof(spacetime_val_t));

  val_to_send->sender = (uint8_t)machine_id;
  val_to_send->opcode = ST_OP_VAL;
}

// Decides whether an op-buffer entry has to trigger a VAL broadcast after a
// membership change. Returns 1 when the entry's state is one of the
// *_COMPLETE_SEND_VALS states (PUT, RMW or REPLAY) and -1 (skip) otherwise.
int
memb_change_skip_or_get_sender_id(uint8_t* req)
{
  spacetime_op_t* op_req = (spacetime_op_t*)req;

  switch (op_req->op_meta.state) {
    case ST_PUT_COMPLETE_SEND_VALS:
    case ST_RMW_COMPLETE_SEND_VALS:
    case ST_REPLAY_COMPLETE_SEND_VALS:
      // it is bcast so just return something greater than zero
      return 1;
    default:
      return -1;
  }
}

// Advances the state of an op-buffer entry once its VAL has been broadcast:
//   ST_PUT_COMPLETE_SEND_VALS    -> ST_PUT_COMPLETE
//   ST_RMW_COMPLETE_SEND_VALS    -> ST_RMW_COMPLETE
//   ST_REPLAY_COMPLETE_SEND_VALS -> ST_NEW
// Any other state trips an assertion.
void
memb_change_modify_elem_after_send(uint8_t* req)
{
  spacetime_op_t* sent_op = (spacetime_op_t*)req;

  if (sent_op->op_meta.state == ST_PUT_COMPLETE_SEND_VALS) {
    sent_op->op_meta.state = ST_PUT_COMPLETE;
  } else if (sent_op->op_meta.state == ST_RMW_COMPLETE_SEND_VALS) {
    sent_op->op_meta.state = ST_RMW_COMPLETE;
  } else if (sent_op->op_meta.state == ST_REPLAY_COMPLETE_SEND_VALS) {
    // The entry goes back to ST_NEW (rather than to ST_REPLAY_COMPLETE)
    sent_op->op_meta.state = ST_NEW;
  } else {
    assert(0);
  }
}

// Builds the VAL to broadcast for a local op that completed because of a
// membership change.
//   msg_to_send    : outgoing message buffer, filled in as a spacetime_val_t
//   triggering_req : the completed local op (spacetime_op_t)
// Fixed: only opcode, sender and ts used to be written, so the key of the
// outgoing VAL (which hermes_batch_ops_to_KVS needs to find the object at the
// receiver) kept whatever the send buffer held before. The header is now
// copied from the op first, as val_copy_and_modify_elem does from an ACK.
void
memb_change_copy_and_modify_elem(uint8_t* msg_to_send, uint8_t* triggering_req)
{
  spacetime_op_t* op_req = (spacetime_op_t*)triggering_req;
  spacetime_val_t* val_to_send = (spacetime_val_t*)msg_to_send;

  // NOTE(review): assumes a spacetime_val_t has the same layout as the leading
  // bytes of a spacetime_op_t (the batch code already reads every message
  // type through a spacetime_op_meta_t*) -- confirm against spacetime.h
  memcpy(val_to_send, triggering_req, sizeof(spacetime_val_t));

  val_to_send->opcode = ST_OP_VAL;
  val_to_send->sender = (uint8_t)machine_id;
  val_to_send->ts = op_req->op_meta.ts;
}

// Decides whether a processed VAL has to be answered and to whom.
// Returns -1 (skip) for an ST_EMPTY slot, otherwise the sender of the VAL.
// The slot is asserted to hold ST_VAL_SUCCESS or ST_EMPTY.
int
rem_write_crd_skip_or_get_sender_id(uint8_t* req)
{
  spacetime_val_t* val_ptr = (spacetime_val_t*)req;

  if (ENABLE_ASSERTIONS)
    assert(val_ptr->opcode == ST_VAL_SUCCESS || val_ptr->opcode == ST_EMPTY);

  if (val_ptr->opcode == ST_EMPTY) return -1;

  return val_ptr->sender;
}

// Frees a VAL slot after it has been answered by marking it ST_EMPTY.
// The slot is expected to hold ST_VAL_SUCCESS; anything else trips an
// assertion and is left as is.
void
rem_write_crd_modify_elem_after_send(uint8_t* req)
{
  spacetime_val_t* val_req = (spacetime_val_t*)req;

  if (val_req->opcode != ST_VAL_SUCCESS) {
    assert(0);
    return;
  }

  // empty the val buffer slot
  val_req->opcode = ST_EMPTY;
}

// Prints (in green) the total number of messages sent and received so far on
// each of the four channels, as kept in their stats counters.
//   inv_ud_c / ack_ud_c / val_ud_c / crd_ud_c : the INV, ACK, VAL and credit
//   channels whose counters are reported
// NOTE(review): the counters are printed with %d -- confirm that
// send_total_msgs / recv_total_msgs are int-sized.
void
print_total_send_recv_msgs(ud_channel_t* inv_ud_c, ud_channel_t* ack_ud_c,
                           ud_channel_t* val_ud_c, ud_channel_t* crd_ud_c)
{
  // Totals of sent messages
  colored_printf(
      GREEN, "Total Send: invs %d, acks %d, vals %d, crds %d\n",
      inv_ud_c->stats.send_total_msgs, ack_ud_c->stats.send_total_msgs,
      val_ud_c->stats.send_total_msgs, crd_ud_c->stats.send_total_msgs);
  // Totals of received messages
  colored_printf(
      GREEN, "Total Recv: invs %d, acks %d, vals %d, crds %d\n",
      inv_ud_c->stats.recv_total_msgs, ack_ud_c->stats.recv_total_msgs,
      val_ud_c->stats.recv_total_msgs, crd_ud_c->stats.recv_total_msgs);
}

// Busy-waits until the caller's copy of the group membership has machine_num
// bits set.
// The copy is cleared first and then refreshed from the global
// group_membership on every iteration. Only the worker whose local id is
// WORKER_WITH_FAILURE_DETECTOR drives the failure detector (heartbeats, view
// polling and membership updates) while spinning.
void
spin_until_all_nodes_are_in_membership(
    spacetime_group_membership* last_group_membership,
    hades_wings_ctx_t* hw_ctx, uint16_t worker_lid)
{
  const int runs_failure_detector =
      (worker_lid == WORKER_WITH_FAILURE_DETECTOR);
  bit_vector_t* membership_bv =
      (bit_vector_t*)&last_group_membership->g_membership;

  bv_reset_all(membership_bv);

  while (bv_no_setted_bits(*membership_bv) < machine_num) {
    if (runs_failure_detector) {
      update_view_and_issue_hbs(hw_ctx);

      // Publish a new group membership whenever the detector's view differs
      // from what this worker last saw
      if (!bv_are_equal(*membership_bv, hw_ctx->ctx.curr_g_membership)) {
        group_membership_update(hw_ctx->ctx);
      }

      poll_for_remote_views(hw_ctx);
    }

    // Pick up the (possibly updated) global membership
    *last_group_membership = group_membership;
  }
}

/*
 * One failure-detection step, executed only by the worker whose local id is
 * WORKER_WITH_FAILURE_DETECTOR (all other workers return immediately):
 *   1. update the local view and issue heartbeats,
 *   2. for every node that appears in the intermediate local view but not in
 *      the last one, reset credits and reconfigure address handles on all
 *      worker UD channels,
 *   3. update the group membership if it differs from *last_membership,
 *   4. poll for remote views.
 */
static inline void
failure_detection_n_membership(ud_channel_t** ud_channel_ptrs,
                               bit_vector_t* last_membership,
                               hades_wings_ctx_t* hw_ctx, uint16_t worker_lid)
{
  if (worker_lid != WORKER_WITH_FAILURE_DETECTOR)
    return;

  update_view_and_issue_hbs(hw_ctx);

  ///< TODO>: We need to fix recovery (RDMA side of wings)!! the following is
  ///< not fully correct
  /// Additionally, this handles only WORKER_WITH_FAILURE_DETECTOR thread
  /// instead of every thread
  if (!bv_are_equal(hw_ctx->ctx.last_local_view.view,
                    hw_ctx->ctx.intermediate_local_view.view)) {
    for (int node = 0; node < 8; ++node) {
      // Only nodes that just (re)joined the local view are of interest
      if (bv_bit_get(hw_ctx->ctx.last_local_view.view, node) != 0)
        continue;
      if (bv_bit_get(hw_ctx->ctx.intermediate_local_view.view, node) != 1)
        continue;

      printf("W[%d]: updates %d endpoint channels\n", worker_lid, node);
      for (int qp = 0; qp < TOTAL_WORKER_UD_QPs; ++qp) {
        wings_reset_credits(ud_channel_ptrs[qp], node);
        wings_reconfigure_wrs_ah(ud_channel_ptrs[qp], node);
      }
    }
  }
  //</TODO>

  if (!bv_are_equal(*last_membership, hw_ctx->ctx.curr_g_membership))
    group_membership_update(hw_ctx->ctx);

  poll_for_remote_views(hw_ctx);
}

/*
 * Entry point of a Hermes worker thread (asserts that CR mode is off).
 *
 * `arg` points to a struct thread_params whose `id` is the local worker id.
 *
 * Setup:
 *   - initializes the INV, ACK and VAL UD channels (VAL is linked to the CRD
 *     channel, INV and ACK are linked to each other),
 *   - if Hades failure detection is enabled and this is the worker with the
 *     failure detector, also initializes the Hades context and its two
 *     extra channels,
 *   - sets up QPs/receives, KVS staging buffers and the request trace,
 *   - worker 0 spawns the stats thread.
 *
 * Main loop (never returns):
 *   refill local ops -> batch them to the KVS -> (if update_ratio > 0)
 *   issue INVs, poll INVs and reply with ACKs, poll ACKs, issue VALs,
 *   poll VALs and return credits -> failure detection / membership handling.
 *   When Hades is enabled, request processing starts only after a warm-up
 *   period has elapsed.
 */
void*
run_worker(void* arg)
{
  assert(is_CR == 0);

  struct thread_params params = *(struct thread_params*)arg;
  uint16_t worker_lid = (uint16_t)params.id;  // Local ID of this worker thread
  uint16_t worker_gid =
      (uint16_t)(machine_id * num_workers +
                 params.id);  // Global ID of this worker thread

  /* --------------------------------------------------------
  ------------------- RDMA WINGS DECLARATIONS---------------
  ---------------------------------------------------------*/
  // The array is sized for the failure-detection channels as well, even
  // though only the worker running the failure detector uses the extra slots
  ud_channel_t ud_channels[TOTAL_WORKER_N_FAILURE_DETECTION_UD_QPs];
  ud_channel_t* ud_channel_ptrs[TOTAL_WORKER_N_FAILURE_DETECTION_UD_QPs];
  ud_channel_t* inv_ud_c = &ud_channels[INV_UD_QP_ID];
  ud_channel_t* ack_ud_c = &ud_channels[ACK_UD_QP_ID];
  ud_channel_t* val_ud_c = &ud_channels[VAL_UD_QP_ID];
  ud_channel_t* crd_ud_c = &ud_channels[CRD_UD_QP_ID];

  for (int i = 0; i < TOTAL_WORKER_N_FAILURE_DETECTION_UD_QPs; ++i)
    ud_channel_ptrs[i] = &ud_channels[i];

  // Flags passed to wings_ud_channel_init() below
  const uint8_t is_bcast = 1;
  const uint8_t stats_on = 1;
  const uint8_t prints_on = 1;
  const uint8_t is_hdr_only = 0;
  const uint8_t expl_crd_ctrl = 0;
  const uint8_t disable_crd_ctrl = 0;

  // Channel names: ANSI-colored tag followed by the local worker id
  char inv_qp_name[200], ack_qp_name[200], val_qp_name[200];
  sprintf(inv_qp_name, "%s%d", "\033[31mINV\033[0m", worker_lid);
  sprintf(ack_qp_name, "%s%d", "\033[33mACK\033[0m", worker_lid);
  sprintf(val_qp_name, "%s%d", "\033[1m\033[32mVAL\033[0m", worker_lid);

  // WARNING: We use the ack channel to send/recv both acks and rmw-invs if RMWs
  // are enabled
  uint16_t ack_size =
      ENABLE_RMWs ? sizeof(spacetime_inv_t) : sizeof(spacetime_ack_t);

  // Inline a channel's sends only if inlining is enabled and a fully
  // coalesced packet stays below the supported inlining limit
  uint8_t inv_inlining =
      (DISABLE_INLINING == 0 &&
       max_coalesce * sizeof(spacetime_inv_t) < WINGS_MAX_SUPPORTED_INLINING)
          ? 1
          : 0;
  uint8_t ack_inlining =
      (DISABLE_INLINING == 0 &&
       max_coalesce * ack_size < WINGS_MAX_SUPPORTED_INLINING)
          ? 1
          : 0;
  uint8_t val_inlining =
      (DISABLE_INLINING == 0 &&
       max_coalesce * sizeof(spacetime_val_t) < WINGS_MAX_SUPPORTED_INLINING)
          ? 1
          : 0;

  // INV (request, broadcast) is linked with ACK (response); VAL (request,
  // broadcast) is linked with the CRD channel
  wings_ud_channel_init(inv_ud_c, inv_qp_name, REQ, (uint8_t)max_coalesce,
                        sizeof(spacetime_inv_t), 0, inv_inlining, is_hdr_only,
                        is_bcast, disable_crd_ctrl, expl_crd_ctrl, ack_ud_c,
                        (uint8_t)credits_num, machine_num, (uint8_t)machine_id,
                        stats_on, prints_on);
  wings_ud_channel_init(ack_ud_c, ack_qp_name, RESP, (uint8_t)max_coalesce,
                        ack_size, sizeof(spacetime_ack_t), ack_inlining,
                        is_hdr_only, 0, disable_crd_ctrl, expl_crd_ctrl,
                        inv_ud_c, (uint8_t)credits_num, machine_num,
                        (uint8_t)machine_id, stats_on, prints_on);
  wings_ud_channel_init(val_ud_c, val_qp_name, REQ, (uint8_t)max_coalesce,
                        sizeof(spacetime_val_t), 0, val_inlining, is_hdr_only,
                        is_bcast, disable_crd_ctrl, 1, crd_ud_c,
                        (uint8_t)credits_num, machine_num, (uint8_t)machine_id,
                        stats_on, prints_on);

  ///< HADES> Failure Detector Init
  // NOTE: hw_ctx is initialized only on the worker running the failure
  // detector; the helpers it is passed to check worker_lid before using it
  hades_wings_ctx_t hw_ctx;
  uint16_t total_ud_qps = TOTAL_WORKER_UD_QPs;
  if (ENABLE_HADES_FAILURE_DETECTION &&
      worker_lid == WORKER_WITH_FAILURE_DETECTOR) {
    total_ud_qps = TOTAL_WORKER_N_FAILURE_DETECTION_UD_QPs;
    // The two Hades channels live right after the regular worker channels
    ud_channel_t* hviews_c = &ud_channels[TOTAL_WORKER_UD_QPs];
    ud_channel_t* hviews_crd_c = &ud_channels[TOTAL_WORKER_UD_QPs + 1];

    const uint16_t max_views_to_poll = 10;
    const uint32_t send_view_every_us = 100;
    const uint32_t update_local_view_ms = 10;

    hades_wings_ctx_init(&hw_ctx, machine_id, machine_num, max_views_to_poll,
                         send_view_every_us, update_local_view_ms, hviews_c,
                         hviews_crd_c, worker_lid);
  }
  ///</HADES>

  wings_setup_channel_qps_and_recvs(ud_channel_ptrs, total_ud_qps,
                                    g_share_qs_barrier, worker_lid);

  // Upper bound on the messages polled per call; the staging buffers must be
  // able to hold a full receive buffer of each channel
  uint16_t ops_len =
      (uint16_t)(credits_num * remote_machine_num *
                 max_coalesce);  // credits * remote_machines * max_req_coalesce
  assert(ops_len >= inv_ud_c->recv_pkt_buff_len);
  assert(ops_len >= ack_ud_c->recv_pkt_buff_len);
  assert(ops_len >= val_ud_c->recv_pkt_buff_len);

  /* -------------------------------------------------------
  ------------------- OTHER DECLARATIONS--------------------
  ---------------------------------------------------------*/
  // Intermediate buffs where reqs are copied from incoming_* buffs in order to
  // get passed to the KVS
  spacetime_op_t* ops;
  spacetime_inv_t* inv_recv_ops;
  spacetime_ack_t*
      ack_recv_ops;  // WARNING!! This can be spacetime_ack_t / spacetime_inv_t
                     // * depends if RMWs are disabled or not
  spacetime_val_t* val_recv_ops;

  setup_kvs_buffs(&ops, &inv_recv_ops, &ack_recv_ops, &val_recv_ops);

  struct spacetime_trace_command* trace;
  trace_init(&trace, worker_gid);

  // Per-hot-key slots handed to refill_ops(); all start out empty
  spacetime_op_t* n_hottest_keys_in_ops_get[COALESCE_N_HOTTEST_KEYS];
  spacetime_op_t* n_hottest_keys_in_ops_put[COALESCE_N_HOTTEST_KEYS];
  for (int i = 0; i < COALESCE_N_HOTTEST_KEYS; ++i) {
    n_hottest_keys_in_ops_get[i] = NULL;
    n_hottest_keys_in_ops_put[i] = NULL;
  }
  ////

  int node_suspected = -1;
  uint32_t trace_iter = 0;
  uint16_t rolling_inv_index = 0;
  uint16_t invs_polled = 0, acks_polled = 0, vals_polled = 0;
  // Non-zero while a previous wings_issue_pkts() call on the VAL channel
  // reported work left over (regular VALs / VALs after a membership change)
  uint8_t has_outstanding_vals = 0, has_remaining_vals_from_memb_change = 0;

  // One counter per op slot of the batch, passed to refill_ops()
  // NOTE(review): the malloc result is not checked
  uint32_t* num_of_iters_serving_op = malloc(max_batch_size * sizeof(uint32_t));
  for (int i = 0; i < max_batch_size; ++i)
    num_of_iters_serving_op[i] = 0;

  /// Spawn stats thread
  if (worker_lid == 0) {
    if (spawn_stats_thread() != 0)
      colored_printf(RED, "Stats thread was not successfully spawned \n");
  }

  struct timespec stopwatch_for_req_latency;

  // Membership init
  // Without Hades the channels are given a NULL membership pointer
  bit_vector_t* membership_ptr =
      ENABLE_HADES_FAILURE_DETECTION
          ? (bit_vector_t*)&group_membership.g_membership
          : NULL;
  if (ENABLE_HADES_FAILURE_DETECTION) {
    spin_until_all_nodes_are_in_membership(&group_membership, &hw_ctx,
                                           worker_lid);
    printf("~~~~~~~~~ Starting while ! ~~~~~~~~~\n");
  }

  /* -----------------------------------------------------
 ------------------------Main Loop--------------------
     ----------------------------------------------------- */

  // With Hades enabled, requests are served only after the warm-up below
  struct timespec stopwatch_for_fd_warmup;
  get_rdtsc_timespec(&stopwatch_for_fd_warmup);
  uint8_t fd_warmup_time_has_passed = 0;

  while (true) {
    // Check something periodically (e.g., stats)
    if (unlikely(w_stats[worker_lid].total_loops % M_16 == 0)) {
      //			print_total_send_recv_msgs_n_credits(&inv_ud_c,
      //&ack_ud_c, &val_ud_c, &crd_ud_c);
    }

    if (!ENABLE_HADES_FAILURE_DETECTION || fd_warmup_time_has_passed == 1) {
      // Pull new client requests into free slots of the ops batch
      node_suspected =
          refill_ops(&trace_iter, worker_lid, trace, ops,
                     num_of_iters_serving_op, &stopwatch_for_req_latency,
                     n_hottest_keys_in_ops_get, n_hottest_keys_in_ops_put);

      // Apply the local batch to the KVS
      hermes_batch_ops_to_KVS(local_ops, (uint8_t*)ops, max_batch_size,
                              sizeof(spacetime_op_t), group_membership, NULL,
                              NULL, (uint8_t)worker_lid);

      stop_latency_of_completed_reads(ops, worker_lid,
                                      &stopwatch_for_req_latency);

      // The coherence message exchange is needed only if there are updates
      if (update_ratio > 0) {
        ///~~~~~~~~~~~~~~~~~~~~~~INVS~~~~~~~~~~~~~~~~~~~~~~~~~~~
        wings_issue_pkts(inv_ud_c, membership_ptr, (uint8_t*)ops,
                         (uint16_t)max_batch_size, sizeof(spacetime_op_t),
                         &rolling_inv_index, inv_skip_or_get_sender_id,
                         inv_modify_elem_after_send, inv_copy_and_modify_elem);

        /// Poll for INVs
        invs_polled = wings_poll_buff_and_post_recvs(inv_ud_c, ops_len,
                                                     (uint8_t*)inv_recv_ops);

        if (invs_polled > 0) {
          hermes_batch_ops_to_KVS(invs, (uint8_t*)inv_recv_ops, invs_polled,
                                  sizeof(spacetime_inv_t), group_membership,
                                  &node_suspected, ops, (uint8_t)worker_lid);

          ///~~~~~~~~~~~~~~~~~~~~~~ACKS~~~~~~~~~~~~~~~~~~~~~~~~~~~
          // ACKs are built from the INVs that were just applied
          wings_issue_pkts(
              ack_ud_c, membership_ptr, (uint8_t*)inv_recv_ops, invs_polled,
              sizeof(spacetime_inv_t), NULL, ack_skip_or_get_sender_id,
              ack_modify_elem_after_send, ack_copy_and_modify_elem);

          // Every received INV must have been answered with an ACK
          if (ENABLE_ASSERTIONS)
            assert(inv_ud_c->stats.recv_total_msgs ==
                   ack_ud_c->stats.send_total_msgs);
        }

        // Poll new ACKs only when no VALs are still pending to be sent
        if (has_outstanding_vals == 0 &&
            has_remaining_vals_from_memb_change == 0) {
          /// Poll for Acks
          acks_polled = wings_poll_buff_and_post_recvs(ack_ud_c, ops_len,
                                                       (uint8_t*)ack_recv_ops);

          if (acks_polled > 0) {
            hermes_batch_ops_to_KVS(acks, (uint8_t*)ack_recv_ops, acks_polled,
                                    ack_size, group_membership, NULL, ops,
                                    (uint8_t)worker_lid);

            stop_latency_of_completed_writes(ops, worker_lid,
                                             &stopwatch_for_req_latency);
          }
        }

        if (!DISABLE_VALS_FOR_DEBUGGING) {
          ///~~~~~~~~~~~~~~~~~~~~~~ VALs ~~~~~~~~~~~~~~~~~~~~~~~~~~~
          // VALs left over from a membership change are sent first (built
          // from ops); otherwise VALs are built from the polled ACKs
          if (has_remaining_vals_from_memb_change > 0)
            has_remaining_vals_from_memb_change = wings_issue_pkts(
                val_ud_c, membership_ptr, (uint8_t*)ops, max_batch_size,
                sizeof(spacetime_op_t), NULL, memb_change_skip_or_get_sender_id,
                memb_change_modify_elem_after_send,
                memb_change_copy_and_modify_elem);
          else
            has_outstanding_vals = wings_issue_pkts(
                val_ud_c, membership_ptr, (uint8_t*)ack_recv_ops,
                ack_ud_c->recv_pkt_buff_len, ack_size, NULL,
                val_skip_or_get_sender_id, val_modify_elem_after_send,
                val_copy_and_modify_elem);

          /// Poll for Vals
          vals_polled = wings_poll_buff_and_post_recvs(val_ud_c, ops_len,
                                                       (uint8_t*)val_recv_ops);

          if (vals_polled > 0) {
            hermes_batch_ops_to_KVS(vals, (uint8_t*)val_recv_ops, vals_polled,
                                    sizeof(spacetime_val_t), group_membership,
                                    NULL, NULL, (uint8_t)worker_lid);

            ///~~~~~~~~~~~~~~~~~~~~~~CREDITS~~~~~~~~~~~~~~~~~~~~~~~~~~~
            // Return credits to the senders of the VALs just processed
            wings_issue_credits(
                crd_ud_c, membership_ptr, (uint8_t*)val_recv_ops, ops_len,
                sizeof(spacetime_val_t), rem_write_crd_skip_or_get_sender_id,
                rem_write_crd_modify_elem_after_send);
          }
        }
      }
    } else if (ENABLE_HADES_FAILURE_DETECTION &&
               time_elapsed_in_sec(stopwatch_for_fd_warmup) > 2) {
      // Warm-up is over: start serving requests from the next iteration
      fd_warmup_time_has_passed = 1;
      printf("~~~~~~~~~ Starting execution! ~~~~~~~~~\n");
    }

    // Failure Detection and Membership
    if (ENABLE_HADES_FAILURE_DETECTION) {
      failure_detection_n_membership(ud_channel_ptrs, membership_ptr, &hw_ctx,
                                     worker_lid);

      if (group_membership_has_changed(&group_membership, worker_lid)) {
        /// Complete inprogress updates/replays waiting for ACKS only from
        /// failed nodes
        hermes_batch_ops_to_KVS(local_ops_after_membership_change,
                                (uint8_t*)ops, max_batch_size,
                                sizeof(spacetime_op_t), group_membership, NULL,
                                NULL, (uint8_t)worker_lid);

        stop_latency_of_completed_writes(ops, worker_lid,
                                         &stopwatch_for_req_latency);

        if (!DISABLE_VALS_FOR_DEBUGGING)
          /// Bcast VAL msgs for those completed update/replays
          has_remaining_vals_from_memb_change = wings_issue_pkts(
              val_ud_c, membership_ptr, (uint8_t*)ops, max_batch_size,
              sizeof(spacetime_op_t), NULL, memb_change_skip_or_get_sender_id,
              memb_change_modify_elem_after_send,
              memb_change_copy_and_modify_elem);
      }
    }
    w_stats[worker_lid].total_loops++;
  }
}


================================================
FILE: src/hermes/main.c
================================================
#define _GNU_SOURCE
#include <getopt.h>
#include <infiniband/verbs.h>
#include <malloc.h>
#include <pthread.h>
#include <stdio.h>
#include "../../include/utils/bit_vector.h"
#include "../../include/utils/concur_ctrl.h"
#include "../../include/wings/wings_api.h"
#include "config.h"
#include "hrd.h"
#include "spacetime.h"
#include "util.h"

// Global vars
// Latency histogram counters; written to disk by dump_latency_stats()
struct latency_counters latency_count;
// Per-worker statistics, indexed by local worker id; updated by the workers
// and sampled by the stats thread
volatile struct worker_stats w_stats[MAX_WORKERS_PER_MACHINE];

// Handed to wings_setup_channel_qps_and_recvs() by every worker; main() sets
// it to NULL when a single worker is used
dbit_vector_t* g_share_qs_barrier;
// Current group membership; statically initialized by group_membership_init()
spacetime_group_membership group_membership;

// Global config vars
// All of the following are set from the command line in main(); arguments
// that are not passed fall back to the defaults of the header file
uint8_t is_CR;       // 1 by default, cleared by --hermes (-H)
int num_workers;     // --num-workers (-W)
int update_ratio;    // --write-ratio (-w)
int rmw_ratio;       // --rmw-ratio (-R)
int credits_num;     // --credits (-c)
int max_coalesce;    // --max-coalesce (-C)
int max_batch_size;  // for batches to KVS

int machine_num;               // --num-machines (-M)
int remote_machine_num;        // machine_num - 1 when -M is given
int worker_measuring_latency;  // --lat-worker (-l); -1 means none

// This is required only when Hades failure detection is disabled
void
group_membership_init(void)
{
  group_membership.num_of_alive_remotes = remote_machine_num;
  seqlock_init(&group_membership.lock);
  bv_init((bit_vector_t*)&group_membership.g_membership);

  for (uint8_t i = 0; i < machine_num; ++i)
    bv_bit_set((bit_vector_t*)&group_membership.g_membership, i);

  bv_copy((bit_vector_t*)&group_membership.w_ack_init,
          group_membership.g_membership);
  bv_reverse((bit_vector_t*)&group_membership.w_ack_init);
  bv_bit_set((bit_vector_t*)&group_membership.w_ack_init, (uint8_t)machine_id);
}

/*
 * Program entry point.
 *
 * Parses the command-line options, fills in defaults for anything that was
 * not passed, validates the resulting configuration, initializes the group
 * membership, the stats and the KVS, and finally spawns one pinned worker
 * thread per configured worker and waits for them.
 *
 * Options (long / short):
 *   --machine-id / -m, --lat-worker / -l, --is-roce / -r, --rmw-ratio / -R,
 *   --dev-name / -d, --write-ratio / -w, --num-workers / -W,
 *   --num-machines / -M, --credits / -c, --max-coalesce / -C,
 *   --max-batch-size / -b, --hermes / -H (no argument).
 */
int
main(int argc, char* argv[])
{
  int i, c;
  is_roce = -1;
  machine_id = -1;

  // config vars (-1 means "not passed"; resolved to defaults further below)
  is_CR = 1;
  num_workers = -1;
  update_ratio = -1;
  rmw_ratio = -1;
  credits_num = -1;
  max_coalesce = -1;
  max_batch_size = -1;
  remote_IP = (char*)malloc(16 * sizeof(char));

  machine_num = -1;
  remote_machine_num = -1;
  worker_measuring_latency = -1;

  //	green_printf("UD size: %d ibv_grh + crd size: %d \n",
  // sizeof(ud_req_crd_t), sizeof(struct ibv_grh) + sizeof(spacetime_crd_t));
  //	static_assert(sizeof(ud_req_crd_t) == sizeof(struct ibv_grh) +
  // sizeof(spacetime_crd_t), ""); ///CRD --> 48 Bytes instead of 43

  struct thread_params* param_arr;
  pthread_t* thread_arr;

  static struct option opts[] = {
      {.name = "machine-id", .has_arg = 1, .val = 'm'},
      {.name = "lat-worker", .has_arg = 1, .val = 'l'},
      {.name = "is-roce", .has_arg = 1, .val = 'r'},
      {.name = "rmw-ratio", .has_arg = 1, .val = 'R'},
      {.name = "dev-name", .has_arg = 1, .val = 'd'},
      {.name = "write-ratio", .has_arg = 1, .val = 'w'},
      {.name = "num-workers", .has_arg = 1, .val = 'W'},
      {.name = "num-machines", .has_arg = 1, .val = 'M'},
      {.name = "credits", .has_arg = 1, .val = 'c'},
      {.name = "max-coalesce", .has_arg = 1, .val = 'C'},
      {.name = "max-batch-size", .has_arg = 1, .val = 'b'},
      {.name = "hermes", .has_arg = 0, .val = 'H'},
      {0}};

  /* Parse and check arguments */
  while (1) {
    // "b:" is part of the short-option string so that -b is accepted in the
    // same way as --max-batch-size (it is handled by the switch below)
    c = getopt_long(argc, argv, "m:r:l:R:d:w:c:C:W:M:b:H", opts, NULL);
    if (c == -1) break;

    switch (c) {
      case 'm':
        machine_id = atoi(optarg);
        break;
      case 'r':
        is_roce = atoi(optarg);
        break;
      case 'l':
        worker_measuring_latency = atoi(optarg);
        break;
      case 'd':
        // NOTE(review): copies without the terminating NUL and without a
        // bounds check; relies on dev_name being large enough and already
        // zeroed -- confirm against its declaration
        memcpy(dev_name, optarg, strlen(optarg));
        break;
      // Config vars
      case 'w':
        update_ratio = atoi(optarg);
        break;
      case 'R':
        rmw_ratio = atoi(optarg);
        break;
      case 'W':
        num_workers = atoi(optarg);
        break;
      case 'c':
        credits_num = atoi(optarg);
        break;
      case 'C':
        max_coalesce = atoi(optarg);
        break;
      case 'b':
        max_batch_size = atoi(optarg);
        break;
      case 'H':
        is_CR = 0;
        break;
      case 'M':
        machine_num = atoi(optarg);
        remote_machine_num = machine_num - 1;
        break;
      default:
        printf("Invalid argument %d\n", c);
        assert(false);
    }
  }

  // If arguments not passed use the default values from header file
  if (update_ratio == -1) update_ratio = DEFAULT_UPDATE_RATIO;
  if (rmw_ratio == -1) rmw_ratio = ENABLE_RMWs ? DEFAULT_RMW_RATIO : 0;
  if (num_workers == -1) num_workers = DEFAULT_WORKERS_PER_MACHINE;
  if (max_coalesce == -1) max_coalesce = MAX_REQ_COALESCE;
  if (max_batch_size == -1) max_batch_size = MAX_BATCH_KVS_OPS_SIZE;
  if (credits_num == -1)
    credits_num = is_CR ? MAX_CREDITS_PER_REMOTE_WORKER_CR
                        : MAX_CREDITS_PER_REMOTE_WORKER;
  if (worker_measuring_latency == -1 && DEFAULT_MEASURE_LATENCY)
    worker_measuring_latency = DEFAULT_WORKER_MEASURING_LATENCY;
  if (machine_num == -1) {
    machine_num = MAX_MACHINE_NUM;
    remote_machine_num = MAX_REMOTE_MACHINES;
  }

  assert(ENABLE_RMWs || rmw_ratio == 0);
  assert(rmw_ratio != 0 || ENABLE_RMWs == 0);
  // WARNING: Some structs are statically allocated using
  // MAX_WORKERS_PER_MACHINE / MAX_BATCH_KVS_OPS_SIZE
  assert(num_workers <= MAX_WORKERS_PER_MACHINE);
  assert(max_batch_size <= MAX_BATCH_KVS_OPS_SIZE);
  assert(machine_num > 1 && machine_num <= MAX_MACHINE_NUM);
  assert(worker_measuring_latency == -1 ||
         worker_measuring_latency < num_workers);

  assert(!ENABLE_VIRTUAL_NODE_IDS || VIRTUAL_NODE_IDS_PER_NODE > machine_num);
  assert(!ENABLE_VIRTUAL_NODE_IDS ||
         machine_num * VIRTUAL_NODE_IDS_PER_NODE < 255);

  // The shared barrier is needed only when there is more than one worker
  if (num_workers > 1)
    dbv_init(&g_share_qs_barrier, (uint8_t)num_workers);
  else
    g_share_qs_barrier = NULL;

  printf(
      "update rate: %d (RMW rate %d) | workers %d | batch size %d| CREDITS %d "
      "| coalesce %d |\n",
      update_ratio, rmw_ratio, num_workers, max_batch_size, credits_num,
      max_coalesce);

  thread_arr = malloc(num_workers * sizeof(pthread_t));
  param_arr = malloc(num_workers * sizeof(struct thread_params));
  // Both arrays are dereferenced unconditionally in the spawn loop below
  assert(thread_arr != NULL && param_arr != NULL);

  pthread_attr_t attr;
  cpu_set_t cpus_w;

  group_membership_init();
  init_stats((void*)w_stats);
  spacetime_init(machine_id);

  pthread_attr_init(&attr);
  int w_core, init_core = SOCKET_TO_START_SPAWNING_THREADS;
  for (i = 0; i < num_workers; i++) {
    // Consecutive core ids when all sockets and hyperthreads may be used,
    // otherwise every second core id starting from init_core
    if (USE_ALL_SOCKETS && ENABLE_HYPERTHREADING)
      w_core = init_core + i;
    else
      w_core = 2 * i + init_core;

    assert(w_core < TOTAL_HW_CORES);
    assert(ENABLE_HYPERTHREADING ||
           w_core < TOTAL_NUMBER_OF_SOCKETS * TOTAL_CORES_PER_SOCKET);

    param_arr[i].id = i;

    // Pin the worker to its core
    CPU_ZERO(&cpus_w);
    CPU_SET(w_core, &cpus_w);
    pthread_attr_setaffinity_np(&attr, sizeof(cpu_set_t), &cpus_w);
    pthread_create(&thread_arr[i], &attr, run_worker, &param_arr[i]);
  }

  // sizeof yields size_t; cast explicitly so the values match the %d
  // conversions of the (unchanged) format strings
  colored_printf(YELLOW, "Sizes: {Op: %d, Object Meta %d, Value %d},\n",
                 (int)sizeof(spacetime_op_t),
                 (int)sizeof(spacetime_object_meta), (int)ST_VALUE_SIZE);
  colored_printf(YELLOW, "Coherence msg Sizes: {Inv: %d, Ack: %d, Val: %d}\n",
                 (int)sizeof(spacetime_inv_t), (int)sizeof(spacetime_ack_t),
                 (int)sizeof(spacetime_val_t));
  colored_printf(
      YELLOW, "Max Coalesce Packet Sizes: {Inv: %d, Ack: %d, Val: %d}\n",
      (int)(sizeof(wings_ud_send_pkt_t) +
            max_coalesce * sizeof(spacetime_inv_t)),
      (int)(sizeof(wings_ud_send_pkt_t) +
            max_coalesce * sizeof(spacetime_ack_t)),
      (int)(sizeof(wings_ud_send_pkt_t) +
            max_coalesce * sizeof(spacetime_val_t)));

  for (i = 0; i < num_workers; i++)
    pthread_join(thread_arr[i], NULL);

  return 0;
}

//////////////////////////////////////////////////////////////////////////////////
/// Compile-time checks that reject configurations the code does not support
//////////////////////////////////////////////////////////////////////////////////

// A MICA value must be able to hold a spacetime value
static_assert(MICA_MAX_VALUE >= ST_VALUE_SIZE, "");
static_assert(MAX_MACHINE_NUM <= 8,
              "");  // TODO: bit vectors have not been tested with more than 8
                    // nodes
static_assert(MAX_MACHINE_NUM <= GROUP_MEMBERSHIP_ARRAY_SIZE * 8,
              "");  // bit vector for acks / group membership
static_assert(MAX_MACHINE_NUM <= 255, "");

// Socket ids must refer to existing sockets
static_assert(KV_SOCKET < TOTAL_NUMBER_OF_SOCKETS &&
                  SOCKET_TO_START_SPAWNING_THREADS < TOTAL_NUMBER_OF_SOCKETS,
              "");

// Unless both hyperthreading and all sockets are used, the workers must fit
// in the cores of one socket
static_assert((ENABLE_HYPERTHREADING == 1 && USE_ALL_SOCKETS == 1) ||
                  MAX_WORKERS_PER_MACHINE <= TOTAL_CORES_PER_SOCKET,
              "");
static_assert(MAX_WORKERS_PER_MACHINE <= TOTAL_HW_CORES, "");

/// Assertions for (fake) failures
static_assert(FAKE_FAILURE == 0 || NODE_TO_FAIL < MAX_MACHINE_NUM, "");
static_assert(FAKE_FAILURE == 0 ||
                  ROUNDS_BEFORE_FAILURE < PRINT_NUM_STATS_BEFORE_EXITING,
              "");
static_assert(FAKE_FAILURE == 0 ||
                  WORKER_WITH_FAILURE_DETECTOR < MAX_WORKERS_PER_MACHINE,
              "");

// The "empty" sentinel values must not collide with valid ids / indexes
static_assert(MAX_MACHINE_NUM < TIE_BREAKER_ID_EMPTY, "");
static_assert(MAX_MACHINE_NUM < LAST_WRITER_ID_EMPTY, "");
static_assert(MAX_BATCH_KVS_OPS_SIZE < ST_OP_BUFFER_INDEX_EMPTY,
              "");  /// 1B write_buffer_index and 255 is used as "empty" value

/// Make sure that the numbers assigned to the states are strictly increasing
/// in the following order
static_assert(VALID_STATE < INVALID_STATE, "");
static_assert(INVALID_STATE < INVALID_WRITE_STATE, "");
static_assert(INVALID_WRITE_STATE < WRITE_STATE, "");
static_assert(WRITE_STATE < REPLAY_STATE, "");

// ENABLE_RMWs is used as a boolean flag
static_assert(ENABLE_RMWs == 0 || ENABLE_RMWs == 1, "");


================================================
FILE: src/hermes/spacetime.c
================================================
//
// Created by akatsarakis on 04/05/18.
//
#include <config.h>
#include <inline-util.h>
#include <spacetime.h>
#include <util.h>
#include "../../include/utils/concur_ctrl.h"

/*
 * Initialize the spacetime using a MICA instance, adding the timestamps
 * and locks to the keys of the mica-herd structure
 */

struct spacetime_kv kv;

/*
 * Resets the metadata of an object to its initial values: VALID state, no
 * last writer, no associated op-buffer slot, and freshly initialized
 * concurrency control.
 */
void
spacetime_object_meta_init(spacetime_object_meta* ol)
{
  ol->op_buffer_index = ST_OP_BUFFER_INDEX_EMPTY;
  ol->last_writer_id = LAST_WRITER_ID_EMPTY;
  ol->state = VALID_STATE;
  cctrl_init(&ol->cctrl);
}

void
spacetime_init(int instance_id)
{
  // TODO may add kvs stats
  mica_init(&kv.hash_table, instance_id, KV_SOCKET, SPACETIME_NUM_BKTS,
            SPACETIME_LOG_CAP);
  spacetime_populate_fixed_len(&kv, SPACETIME_NUM_KEYS, KVS_VALUE_SIZE);
}

/*
 * Inserts `n` generated keys into the hash table of `_kv`.
 *
 * Every inserted value starts with a freshly initialized
 * spacetime_object_meta, followed by ST_VALUE_SIZE bytes filled with a
 * letter derived from the key index ('a' + index % 20).
 *
 * Must be called on an empty table (no inserts or index evictions so far);
 * `val_len` must be in (0, KVS_VALUE_SIZE].
 */
void
spacetime_populate_fixed_len(struct spacetime_kv* _kv, int n, int val_len)
{
  assert(n > 0);
  assert(val_len > 0 && val_len <= KVS_VALUE_SIZE);

  /* This is needed for the eviction message below to make sense */
  assert(_kv->hash_table.num_insert_op == 0 &&
         _kv->hash_table.num_index_evictions == 0);

  struct mica_op op;
  struct mica_resp resp;
  unsigned long long* key_words = (unsigned long long*)&op.key;

  /* The same initial metadata is placed in front of every value */
  spacetime_object_meta fresh_meta;
  spacetime_object_meta_init(&fresh_meta);

  /* Generate the keys to insert */
  // NOTE(review): the generated key array is not released here -- confirm
  // who owns it
  uint128* keys = mica_gen_keys(n);

  op.val_len = (uint8_t)(val_len >> SHIFT_BITS);
  op.opcode = ST_OP_PUT;

  spacetime_object_meta* meta_in_value = (spacetime_object_meta*)op.value;
  memcpy((void*)meta_in_value, (void*)&fresh_meta,
         sizeof(spacetime_object_meta));

  /* Insert from the last generated key down to the first one */
  int idx = n;
  while (idx-- > 0) {
    key_words[0] = keys[idx].first;
    key_words[1] = keys[idx].second;

    /* The payload starts right after the metadata */
    uint8_t fill = (uint8_t)('a' + (idx % 20));
    memset((void*)&meta_in_value[1], fill, ST_VALUE_SIZE);

    mica_insert_one(&_kv->hash_table, &op, &resp);
  }

  assert(_kv->hash_table.num_insert_op == n);
  colored_printf(YELLOW,
                 "Spacetime: Populated instance %d with %d keys, length = %d. "
                 "Index eviction fraction = %.4f.\n",
                 _kv->hash_table.instance_id, n, val_len,
                 (double)_kv->hash_table.num_index_evictions /
                     _kv->hash_table.num_insert_op);
}


================================================
FILE: src/hermes/stats.c
================================================
#include "util.h"

/*
 * Writes into `filename` the path of the per-node throughput results file
 * for the current configuration (protocol, machines, write/RMW ratios,
 * workers, batch size, credits, workload and machine id).
 *
 * `filename` must be large enough for the result; the callers in this file
 * pass 128-byte buffers.
 */
static inline void
xput_file_name(char* filename)
{
  const char* results_dir = "./results/xput/per-node";
  const char* protocol = (is_CR == 1) ? "CR" : "Hermes";
  const char* workload = (FEED_FROM_TRACE == 1) ? "_a_0.99" : "_uni";

  sprintf(filename, "%s/%s_xPut_m_%d_wr_%.1f_rmw_%.1f_wk_%d_b_%d_c_%d%s-%d.txt",
          results_dir, protocol, machine_num, update_ratio / 10.0,
          rmw_ratio / 10.0, num_workers, max_batch_size, credits_num, workload,
          machine_id);
}

/*
 * Writes one throughput sample (`xput_in_miops`) as a line of this node's
 * xput results file (see xput_file_name()).
 *
 * The first successful call truncates/creates the file, later calls append.
 * If the file cannot be opened, the error is reported and the sample is
 * dropped (the sample counter is not advanced), instead of passing a NULL
 * stream to fprintf()/fclose().
 */
void
dump_xput_stats(double xput_in_miops)
{
  static uint8_t no_func_calls = 0;  /// WARNING this is not thread safe.

  // The counter is a single byte; stop well before it wraps around
  assert(no_func_calls < 250);

  FILE* xput_stats_fd;
  char filename[128];
  xput_file_name(filename);

  const char* open_mode = no_func_calls == 0 ? "w" : "a";
  xput_stats_fd = fopen(filename, open_mode);
  if (xput_stats_fd == NULL) {
    // e.g., the results directory does not exist
    perror(filename);
    return;
  }

  fprintf(xput_stats_fd, "node%d_miops-%d: %.2f\n", machine_id, no_func_calls,
          xput_in_miops);

  fclose(xput_stats_fd);
  no_func_calls++;

  //    printf("xPut stats saved at %s\n", filename);
}

/*
 * Dumps the read and write latency histograms of `latency_count` to a CSV
 * file under ./results/latency, named after the current configuration.
 *
 * For each request type it prints one line per bucket (bucket start =
 * index * LATENCY_PRECISION), one line for the outliers (bucket "-1") and one
 * line with the maximum observed latency.
 *
 * If the file cannot be opened, the error is reported and nothing is
 * written, instead of passing a NULL stream to fprintf()/fclose().
 */
// assuming microsecond latency
void
dump_latency_stats(void)
{
  FILE* latency_stats_fd;
  char filename[128];
  char* path = "./results/latency";

  sprintf(filename, "%s/%s_latency_m_%d_w_%d_b_%d_wr_%d_rmw_%d_c_%d%s.csv",
          path, is_CR == 1 ? "CR" : "Hermes", machine_num, num_workers,
          max_batch_size, update_ratio, rmw_ratio, credits_num,
          FEED_FROM_TRACE == 1 ? "_a_0.99" : "");

  latency_stats_fd = fopen(filename, "w");
  if (latency_stats_fd == NULL) {
    // e.g., the results directory does not exist
    perror(filename);
    return;
  }

  fprintf(latency_stats_fd, "#---------------- Read Reqs --------------\n");
  for (int i = 0; i < LATENCY_BUCKETS; ++i)
    fprintf(latency_stats_fd, "reads: %d, %d\n", i * LATENCY_PRECISION,
            latency_count.read_reqs[i]);
  fprintf(latency_stats_fd, "reads: -1, %d\n",
          latency_count.read_reqs[LATENCY_BUCKETS]);  // print outliers
  fprintf(latency_stats_fd, "reads-hl: %d\n",
          latency_count.max_read_latency);  // print max read latency

  fprintf(latency_stats_fd, "#---------------- Write Reqs ---------------\n");
  for (int i = 0; i < LATENCY_BUCKETS; ++i)
    fprintf(latency_stats_fd, "writes: %d, %d\n", i * LATENCY_PRECISION,
            latency_count.write_reqs[i]);
  fprintf(latency_stats_fd, "writes: -1, %d\n",
          latency_count.write_reqs[LATENCY_BUCKETS]);  // print outliers
  fprintf(latency_stats_fd, "writes-hl: %d\n",
          latency_count.max_write_latency);  // print max write latency

  fclose(latency_stats_fd);

  printf("Latency stats saved at %s\n", filename);
}

/*
 * Divides `a` by `b`, returning 0 instead of dividing when `b` is zero.
 */
static inline double
safe_division(double a, double b)
{
  if (b == 0)
    return 0;

  return a / b;
}

void*
print_stats_thread(void* no_arg)
{
  uint16_t i, print_count = 0;
  long long all_worker_xput = 0;
  long long all_worker_wrs = 0;
  long long all_worker_rmws = 0;
  long long all_worker_aborted_rmws = 0;
  double total_throughput = 0;
  double total_rd_throughput = 0;
  double total_rmw_aborts = 0;

  double total_wr_throughput = 0;
  double total_rmw_throughput = 0;
  //    int sleep_time = 20;
  struct worker_stats curr_w_stats[MAX_WORKERS_PER_MACHINE],
      prev_w_stats[MAX_WORKERS_PER_MACHINE];
  struct stats all_stats;
  sleep(4);
  memcpy(prev_w_stats, (void*)w_stats,
         MAX_WORKERS_PER_MACHINE * (sizeof(struct worker_stats)));
  struct timespec start, end;
  clock_gettime(CLOCK_REALTIME, &start);
  while (true) {
    usleep(PRINT_STATS_EVERY_MSECS * 1000);
    clock_gettime(CLOCK_REALTIME, &end);
    double seconds = (end.tv_sec - start.tv_sec) +
                     (double)(end.tv_nsec - start.tv_nsec) / 1000000001;
    start = end;
    memcpy(curr_w_stats, (void*)w_stats,
           MAX_WORKERS_PER_MACHINE * (sizeof(struct worker_stats)));
    all_worker_xput = 0;
    all_worker_wrs = 0;
    all_worker_rmws = 0;
    all_worker_aborted_rmws = 0;
    print_count++;
    if (FAKE_FAILURE == 1 && machine_id == NODE_TO_FAIL &&
        print_count == ROUNDS_BEFORE_FAILURE) {
      colored_printf(RED, "---------------------------------------\n");
      colored_printf(RED, "------------  NODE FAILED  ------------\n");
      colored_printf(RED, "---------------------------------------\n");
      exit(0);
    }
    if (EXIT_ON_STATS_PRINT == 1 &&
        print_count == PRINT_NUM_STATS_BEFORE_EXITING) {
      if (worker_measuring_latency != -1 && machine_id == 0)
        dump_latency_stats();
      if (DUMP_XPUT_STATS_TO_FILE) {
        char filename[128];
        xput_file_name(filename);
        printf("xPut stats (of this node) saved at %s\n", filename);
      }
      printf("---------------------------------------\n");
      printf("------------ RUN FINISHED -------------\n");
      printf("---------------------------------------\n");
      exit(0);
    }
    seconds *= MILLION;  // compute only MIOPS
    for (i = 0; i < num_workers; i++) {
      all_worker_xput += curr_w_stats[i].completed_ops_per_worker -
                         prev_w_stats[i].completed_ops_per_worker;
      all_worker_wrs += curr_w_stats[i].completed_wrs_per_worker -
                        prev_w_stats[i].completed_wrs_per_worker;
      all_worker_rmws += curr_w_stats[i].completed_rmws_per_worker -
                         prev_w_stats[i].completed_rmws_per_worker;
      all_worker_aborted_rmws += curr_w_stats[i].aborted_rmws_per_worker -
                                 prev_w_stats[i].aborted_rmws_per_worker;
      all_stats.xput_per_worker[i] =
          (curr_w_stats[i].completed_ops_per_worker -
           prev_w_stats[i].completed_ops_per_worker) /
          seconds;
      all_stats.rmw_xput_per_worker[i] =
          (curr_w_stats[i].completed_rmws_per_worker -
           prev_w_stats[i].completed_rmws_per_worker) /
          seconds;
      all_stats.rmw_abort_rate_per_worker[i] =
          safe_division((curr_w_stats[i].aborted_rmws_per_worker -
                         prev_w_stats[i].aborted_rmws_per_worker),
                        (curr_w_stats[i].completed_rmws_per_worker -
                         prev_w_stats[i].completed_rmws_per_worker));
    }

    memcpy(prev_w_stats, curr_w_stats,
           MAX_WORKERS_PER_MACHINE * (sizeof(struct worker_stats)));
    total_throughput = all_worker_xput / seconds;
    total_wr_throughput = all_worker_wrs / seconds;
    total_rmw_throughput = all_worker_rmws / seconds;
    total_rmw_aborts = safe_division(all_worker_aborted_rmws, all_worker_rmws);
    total_rd_throughput =
        total_throughput - total_wr_throughput - total_rmw_throughput;
    printf("---------------PRINT %d time elapsed %.2f---------------\n",
           print_count, seconds / MILLION);
    colored_printf(GREEN,
                   "NODE MReqs/s: %.2f \n(Rd|Wr|RMW: %.2f|%.2f|%.2f) | RMW "
                   "aborts: %.2f%%)\n",
                   total_throughput, total_rd_throughput, total_wr_throughput,
                   total_rmw_throughput, 100 * total_rmw_aborts);
    if (PRINT_WORKER_STATS) {
      for (i = 0; i < num_workers; i++) {
        //            yellow_printf("W%d: %.2f MIOPS-Batch %.2f(%.2f) -H %.2f -W
        //            %llu -E %.2f -AC %.2f \n", i,
        //            all_stats.xput_per_worker[i],
        //            all_stats.batch_size_per_worker[i],
        //                          all_stats.stalled_time_per_worker[i],
        //                          trace_ratio, curr_w_stats[i].wasted_loops,
        //                          all_stats.empty_reqs_per_worker[i],
        //                          all_stats.average_coalescing_per_worker[i]);
        all_stats.issued_invs_avg_coalesing[i] =
            w_stats[i].issued_invs_per_worker /
            (double)w_stats[i].issued_packet_invs_per_worker;
        all_stats.issued_acks_avg_coalesing[i] =
            w_stats[i].issued_acks_per_worker /
            (double)w_stats[i].issued_packet_acks_per_worker;
        all_stats.issued_vals_avg_coalesing[i] =
            w_stats[i].issued_vals_per_worker /
            (double)w_stats[i].issued_packet_vals_per_worker;
        all_stats.issued_crds_avg_coalesing[i] =
            w_stats[i].issued_crds_per_worker /
            (double)w_stats[i].issued_packet_crds_per_worker;

        all_stats.received_invs_avg_coalesing[i] =
            w_stats[i].received_invs_per_worker /
            (double)w_stats[i].received_packet_invs_per_worker;
        all_stats.received_acks_avg_coalesing[i] =
            w_stats[i].received_acks_per_worker /
            (double)w_stats[i].received_packet_acks_per_worker;
        all_stats.received_vals_avg_coalesing[i] =
            w_stats[i].received_vals_per_worker /
            (double)w_stats[i].received_packet_vals_per_worker;
        all_stats.received_crds_avg_coalesing[i] =
            w_stats[i].received_crds_per_worker /
            (double)w_stats[i].received_packet_crds_per_worker;

        all_stats.percentage_of_wasted_loops[i] =
            w_stats[i].wasted_loops / (double)w_stats[i].total_loops * 100;
        all_stats.completed_reqs_per_loop[i] =
            curr_w_stats[i].completed_ops_per_worker /
            (double)w_stats[i].total_loops;
        colored_printf(CYAN, "W%d: ", i);
        colored_printf(YELLOW,
                       "%.2f MIOPS, Coalescing{Inv: %.2f, Ack: %.2f, Val: "
                       "%.2f, Crd: %.2f}\n",
                       all_stats.xput_per_worker[i],
                       all_stats.issued_invs_avg_coalesing[i],
                       all_stats.issued_acks_avg_coalesing[i],
                       all_stats.issued_vals_avg_coalesing[i],
                       all_stats.issued_crds_avg_coalesing[i]);
        colored_printf(YELLOW,
                       "\t wasted_loops: %.2f%, reqs per loop: %.2f, total "
                       "reqs %d, reqs missed: %d\n",
                       all_stats.percentage_of_wasted_loops[i],
                       all_stats.completed_reqs_per_loop[i],
                       curr_w_stats[i].completed_ops_per_worker,
                       curr_w_stats[i].reqs_missed_in_kvs);
      }
      colored_printf(GREEN, "NODE MReqs/s: %.2f \n", total_throughput);
      printf("---------------------------------------\n");
    }

    if (DUMP_XPUT_STATS_TO_FILE) dump_xput_stats(total_throughput);
  }
}


================================================
FILE: src/hermes/util.c
================================================
//
// Created by akatsarakis on 15/03/18.
//
#define _GNU_SOURCE

#include "util.h"
#include "hrd.h"
#include "inline-util.h"
#include "spacetime.h"

// Spawns the statistics thread (print_stats_thread) pinned to a single core.
// The core is DEFAULT_THREAD_OF_STAT_THREAD unless that is -1, in which case
// it is derived from MAX_WORKERS_PER_MACHINE (core 39 for more than 17
// workers, otherwise 2 * MAX_WORKERS_PER_MACHINE + 2).
// Returns 0 on success or the error number reported by pthread_attr_init /
// pthread_create.
int
spawn_stats_thread(void)
{
  // The thread id is never used after creation, so a local is enough; the
  // previous heap allocation was neither checked for NULL nor ever freed.
  pthread_t stats_thread;
  pthread_attr_t attr;
  cpu_set_t cpus_stats;

  int ret = pthread_attr_init(&attr);
  if (ret != 0) return ret;

  CPU_ZERO(&cpus_stats);

  if (DEFAULT_THREAD_OF_STAT_THREAD != -1) {
    CPU_SET(DEFAULT_THREAD_OF_STAT_THREAD, &cpus_stats);
  } else {
    if (MAX_WORKERS_PER_MACHINE > 17)
      CPU_SET(39, &cpus_stats);
    else
      CPU_SET(2 * MAX_WORKERS_PER_MACHINE + 2, &cpus_stats);
  }

  pthread_attr_setaffinity_np(&attr, sizeof(cpu_set_t), &cpus_stats);
  ret = pthread_create(&stats_thread, &attr, print_stats_thread, NULL);

  // The attribute object is only consulted during pthread_create.
  pthread_attr_destroy(&attr);
  return ret;
}

// Returns 1 when `code` is one of the object states (VALID, WRITE, REPLAY,
// INVALID or INVALID_WRITE) and 0 for every other value.
uint8_t
is_state_code(uint8_t code)
{
  const int is_object_state =
      code == VALID_STATE || code == WRITE_STATE || code == REPLAY_STATE ||
      code == INVALID_STATE || code == INVALID_WRITE_STATE;

  return (uint8_t)(is_object_state ? 1 : 0);
}

// Returns 1 when `code` is one of the input opcodes listed below
// (GET/PUT/RMW, INV/ACK/VAL/CRD, or the two membership opcodes) and 0
// otherwise.
uint8_t
is_input_code(uint8_t code)
{
  if (code == ST_OP_GET || code == ST_OP_PUT || code == ST_OP_RMW) return 1;

  if (code == ST_OP_INV || code == ST_OP_ACK || code == ST_OP_VAL ||
      code == ST_OP_CRD)
    return 1;

  if (code == ST_OP_MEMBERSHIP_CHANGE || code == ST_OP_MEMBERSHIP_COMPLETE)
    return 1;

  return 0;
}

// Returns 1 when `code` appears in the table of response codes below and 0
// otherwise.
uint8_t
is_response_code(uint8_t code)
{
  static const int response_codes[] = {
      ST_GET_COMPLETE,
      ST_PUT_SUCCESS,
      ST_PUT_COMPLETE,
      ST_REPLAY_SUCCESS,
      ST_REPLAY_COMPLETE,
      ST_INV_SUCCESS,
      ST_ACK_SUCCESS,
      ST_VAL_SUCCESS,
      ST_LAST_ACK_SUCCESS,
      ST_LAST_ACK_NO_BCAST_SUCCESS,
      ST_MISS,
      ST_GET_STALL,
      ST_PUT_STALL,
      ST_PUT_COMPLETE_SEND_VALS,
      ST_INV_OUT_OF_GROUP,
      // RMW
      ST_RMW_ABORT,
      ST_RMW_STALL,
      ST_RMW_SUCCESS,
      ST_RMW_COMPLETE,
  };
  const size_t total = sizeof(response_codes) / sizeof(response_codes[0]);

  for (size_t idx = 0; idx < total; ++idx) {
    if (response_codes[idx] == code) return 1;
  }
  return 0;
}

// Returns 1 when `code` is one of the ops-bucket states (NEW, EMPTY, COMPLETE
// or any of the IN_PROGRESS_* states) and 0 otherwise.
uint8_t
is_bucket_state_code(uint8_t code)
{
  const int is_idle_or_done =
      code == ST_NEW || code == ST_EMPTY || code == ST_COMPLETE;
  const int is_in_progress =
      code == ST_IN_PROGRESS_GET || code == ST_IN_PROGRESS_PUT ||
      code == ST_IN_PROGRESS_RMW || code == ST_IN_PROGRESS_REPLAY;

  if (is_idle_or_done || is_in_progress) return 1;
  return 0;
}

// Maps a state / opcode / response / buffer-type code to its name.
// The returned pointer refers to a string literal and must not be modified or
// freed. ST_MISS is returned wrapped in ANSI red escape sequences.
// An unrecognised code is reported on stdout and trips assert(0); when
// asserts are compiled out (NDEBUG) "UNKNOWN_CODE" is returned instead of
// falling off the end of the function.
// NOTE(review): there is no case for ST_IN_PROGRESS_GET even though
// is_bucket_state_code() accepts it, so it lands in the default branch.
char*
code_to_str(uint8_t code)
{
  switch (code) {
    // Object States
    case VALID_STATE:
      return "VALID_STATE";
    case INVALID_STATE:
      return "INVALID_STATE";
    case INVALID_WRITE_STATE:
      return "INVALID_WRITE_STATE";
    case WRITE_STATE:
      return "WRITE_STATE";
    case REPLAY_STATE:
      return "REPLAY_STATE";
    // Input opcodes
    case ST_OP_GET:
      return "ST_OP_GET";
    case ST_OP_PUT:
      return "ST_OP_PUT";
    case ST_OP_RMW:
      return "ST_OP_RMW";
    case ST_OP_INV:
      return "ST_OP_INV";
    case ST_OP_INV_ABORT:
      return "ST_OP_INV_ABORT";
    case ST_OP_ACK:
      return "ST_OP_ACK";
    case ST_OP_VAL:
      return "ST_OP_VAL";
    case ST_OP_CRD:
      return "ST_OP_CRD";
    case ST_OP_MEMBERSHIP_CHANGE:
      return "ST_OP_MEMBERSHIP_CHANGE";
    case ST_OP_MEMBERSHIP_COMPLETE:
      return "ST_OP_MEMBERSHIP_COMPLETE";
    // Response opcodes
    case ST_GET_COMPLETE:
      return "ST_GET_COMPLETE";
    case ST_PUT_SUCCESS:
      return "ST_PUT_SUCCESS";
    case ST_PUT_COMPLETE:
      return "ST_PUT_COMPLETE";
    case ST_RMW_SUCCESS:
      return "ST_RMW_SUCCESS";
    case ST_RMW_COMPLETE:
      return "ST_RMW_COMPLETE";
    case ST_REPLAY_SUCCESS:
      return "ST_REPLAY_SUCCESS";
    case ST_REPLAY_COMPLETE:
      return "ST_REPLAY_COMPLETE";
    case ST_INV_SUCCESS:
      return "ST_INV_SUCCESS";
    case ST_ACK_SUCCESS:
      return "ST_ACK_SUCCESS";
    case ST_VAL_SUCCESS:
      return "ST_VAL_SUCCESS";
    case ST_LAST_ACK_SUCCESS:
      return "ST_LAST_ACK_SUCCESS";
    case ST_LAST_ACK_NO_BCAST_SUCCESS:
      return "ST_LAST_ACK_NO_BCAST_SUCCESS";
    case ST_MISS:
      return "\033[31mST_MISS\033[0m";
    case ST_GET_STALL:
      return "ST_GET_STALL";
    case ST_PUT_STALL:
      return "ST_PUT_STALL";
    case ST_RMW_STALL:
      return "ST_RMW_STALL";
    case ST_RMW_ABORT:
      return "ST_RMW_ABORT";
    case ST_PUT_COMPLETE_SEND_VALS:
      return "ST_PUT_COMPLETE_SEND_VALS";
    case ST_RMW_COMPLETE_SEND_VALS:
      return "ST_RMW_COMPLETE_SEND_VALS";
    case ST_REPLAY_COMPLETE_SEND_VALS:
      return "ST_REPLAY_COMPLETE_SEND_VALS";
    case ST_INV_OUT_OF_GROUP:
      return "ST_INV_OUT_OF_GROUP";
    case ST_SEND_CRD:
      return "ST_SEND_CRD";
    // Ops bucket states
    case ST_EMPTY:
      return "ST_EMPTY";
    case ST_NEW:
      return "ST_NEW";
    case ST_IN_PROGRESS_PUT:
      return "ST_IN_PROGRESS_PUT";
    case ST_IN_PROGRESS_RMW:
      return "ST_IN_PROGRESS_RMW";
    case ST_IN_PROGRESS_REPLAY:
      return "ST_IN_PROGRESS_REPLAY";
    case ST_COMPLETE:
      return "ST_COMPLETE";
    // Buffer Types
    case ST_INV_BUFF:
      return "ST_INV_BUFF";
    case ST_ACK_BUFF:
      return "ST_ACK_BUFF";
    case ST_VAL_BUFF:
      return "ST_VAL_BUFF";
    case ST_CRD_BUFF:
      return "ST_CRD_BUFF";
    case NOP:
      return "NOP";
    // Failure related
    case ST_OP_HEARTBEAT:
      return "ST_OP_HEARTBEAT";
    case ST_OP_SUSPICION:
      return "ST_OP_SUSPICION";
    default:
      break;
  }

  printf("Wrong code (%d)\n", code);
  assert(0);
  // Only reached when assert() is compiled out: give callers a defined value.
  return "UNKNOWN_CODE";
}

// Creates a trace with a uniform key distribution without a backing file.
//   cmds:       out-parameter; receives a malloc'ed array of
//               NUM_OF_REP_REQS commands followed by one NOP terminator
//   worker_gid: mixed into the rand() seed; the worker for which
//               worker_gid % num_workers == 0 prints the trace summary
// Exits the process if the array cannot be allocated.
void
create_uni_trace(struct spacetime_trace_command** cmds, int worker_gid)
{
  srand(time(NULL) + worker_gid * 7);
  *cmds =
      malloc((NUM_OF_REP_REQS + 1) * sizeof(struct spacetime_trace_command));
  if (*cmds == NULL) {
    printf("ERROR: Cannot allocate memory for the trace!\n");
    exit(EXIT_FAILURE);
  }

  uint32_t i, writes = 0, rmws = 0;
  // Generate the commands one by one.
  for (i = 0; i < NUM_OF_REP_REQS; i++) {
    // First decide whether the request is a read or an update
    // (update_ratio is expressed in tenths of a percent)
    (*cmds)[i].opcode =
        (uint8_t)(update_ratio == 1000 || ((rand() % 1000 < update_ratio))
                      ? ST_OP_PUT
                      : ST_OP_GET);

    // ... and, if RMWs are enabled, whether an update becomes an RMW
    if (ENABLE_RMWs && (*cmds)[i].opcode == ST_OP_PUT)
      (*cmds)[i].opcode =
          (uint8_t)(rmw_ratio == 1000 || ((rand() % 1000 < rmw_ratio))
                        ? ST_OP_RMW
                        : ST_OP_PUT);

    if ((*cmds)[i].opcode == ST_OP_RMW) rmws++;
    if ((*cmds)[i].opcode == ST_OP_PUT) writes++;

    //--- KEY ID----------
    uint32 key_id = KEY_NUM != 0 ? (uint32)rand() % KEY_NUM
                                 : (uint32)rand() % SPACETIME_NUM_KEYS;
    if (USE_A_SINGLE_KEY == 1) key_id = 0;
    uint128 key_hash = CityHash128((char*)&(key_id), 4);
    //        memcpy(&(*cmds)[i].key_hash, &key_hash, 16); // this is for 16B
    //        keys
    // Keep only the second 64-bit word of the 128-bit hash (8B keys)
    memcpy(&(*cmds)[i].key_hash, &((uint64_t*)&key_hash)[1], 8);
    (*cmds)[i].key_id =
        (uint8_t)(key_id < 255 ? key_id : ST_KEY_ID_255_OR_HIGHER);
  }

  // Percentages are computed in floating point: multiplying the 32-bit
  // counters by 100 first could wrap around for large traces.
  if (worker_gid % num_workers == 0)
    printf(
        "Update Ratio: %.2f%% (Writes|RMWs: %.2f%%|%.2f%%)\n"
        "Trace w_size %d \n",
        100.0 * ((double)writes + (double)rmws) / NUM_OF_REP_REQS,
        100.0 * (double)writes / NUM_OF_REP_REQS,
        100.0 * (double)rmws / NUM_OF_REP_REQS, NUM_OF_REP_REQS);
  (*cmds)[NUM_OF_REP_REQS].opcode = NOP;
  // printf("CLient %d Trace w_size: %d, debug counter %d hot keys %d, cold keys
  // %d \n",l_id, cmd_count, debug_cnt,
  //         t_stats[l_id].hot_keys_per_trace, t_stats[l_id].cold_keys_per_trace
  //         );
}

// Parses a trace file holding one key id per line into a freshly allocated
// command array terminated by a NOP entry. Use this only for skewed
// workloads, as a uniform trace can be created without a file (see
// create_uni_trace).
//   path:       trace file to read
//   cmds:       out-parameter receiving the malloc'ed command array
//   worker_gid: mixed into the rand() seed; the worker for which
//               worker_gid % num_workers == 0 prints the trace summary
// Returns the number of commands read. Exits the process if the file cannot
// be opened or read, or if the command array cannot be allocated.
int
parse_trace(char* path, struct spacetime_trace_command** cmds, int worker_gid)
{
  FILE* fp;
  size_t len = 0;
  char* ptr;
  char* word;
  char* saveptr;
  char* line = NULL;
  int rmws = 0;
  int writes = 0;
  int cmd_count = 0;
  int debug_cnt = 0;
  uint32_t hottest_key_counter = 0;
  uint32_t ten_hottest_keys_counter = 0;
  uint32_t twenty_hottest_keys_counter = 0;

  fp = fopen(path, "r");
  if (fp == NULL) {
    printf("ERROR: Cannot open file: %s\n", path);
    exit(EXIT_FAILURE);
  }

  // First pass: count the lines so the command array can be sized up front.
  while (getline(&line, &len, fp) != -1)
    cmd_count++;

  //    printf("File %s has %d lines \n", path, cmd_count);

  // Second pass reads the same stream again from the top; rewind() also
  // clears the end-of-file indicator left by the first pass.
  rewind(fp);

  (*cmds) =
      malloc(((size_t)cmd_count + 1) * sizeof(struct spacetime_trace_command));
  if (*cmds == NULL) {
    printf("ERROR: Cannot allocate memory for the trace!\n");
    exit(EXIT_FAILURE);
  }

  // Initialize random with a seed based on local time and a worker / machine id
  srand((unsigned int)(time(NULL) + worker_gid * 7));

  // parse file line by line and insert trace to cmd.
  for (int i = 0; i < cmd_count; i++) {
    if (getline(&line, &len, fp) == -1) {
      printf("ERROR: Problem while reading the trace!\n");
      exit(1);
    }
    int word_count = 0;
    word = strtok_r(line, " ", &saveptr);

    // Before reading the request decide whether it is a read or an update
    // (update_ratio is expressed in tenths of a percent)
    (*cmds)[i].opcode =
        (uint8_t)(update_ratio == 1000 || ((rand() % 1000 < update_ratio))
                      ? ST_OP_PUT
                      : ST_OP_GET);

    if (ENABLE_RMWs && (*cmds)[i].opcode == ST_OP_PUT)
      (*cmds)[i].opcode =
          (uint8_t)(rmw_ratio == 1000 || ((rand() % 1000 < rmw_ratio))
                        ? ST_OP_RMW
                        : ST_OP_PUT);

    if ((*cmds)[i].opcode == ST_OP_PUT) writes++;
    if ((*cmds)[i].opcode == ST_OP_RMW) rmws++;

    while (word != NULL) {
      // Strip the trailing newline kept by getline
      if (word[strlen(word) - 1] == '\n') word[strlen(word) - 1] = 0;

      if (word_count == 0) {
        // The first (and only expected) word of a line is the key id
        uint32_t key_id = (uint32_t)strtoul(word, &ptr, 10);
        if (key_id == 0) hottest_key_counter++;
        if (key_id < 10) ten_hottest_keys_counter++;
        if (key_id < 20) twenty_hottest_keys_counter++;
        uint128 key_hash = CityHash128((char*)&(key_id), 4);
        //              memcpy(&(*cmds)[i].key_hash, &key_hash, 16); // this is
        //              for 16B keys
        memcpy(&(*cmds)[i].key_hash, &((uint64_t*)&key_hash)[1],
               8);  // this is for 8B keys
        (*cmds)[i].key_id =
            (uint8_t)(key_id < 255 ? key_id : ST_KEY_ID_255_OR_HIGHER);
        debug_cnt++;
      }

      word_count++;
      word = strtok_r(NULL, " ", &saveptr);
      // Any line carrying more than one word is treated as malformed
      if (word == NULL && word_count != 1) {
        printf("Client %d Error: Reached word %d in line %d : %s \n",
               worker_gid, word_count, i, line);
        assert(0);
      }
    }
  }

  // Percentages are computed in floating point: multiplying the integer
  // counters by 100 first could overflow for large traces.
  if (worker_gid % num_workers == 0) {
    printf(
        "Trace size: %d | Hottest key (10 | 20 keys): %.2f%% (%.2f | %.2f "
        "%%)\n",
        cmd_count, (100.0 * hottest_key_counter / (double)cmd_count),
        (100.0 * ten_hottest_keys_counter / (double)cmd_count),
        (100.0 * twenty_hottest_keys_counter / (double)cmd_count));
    printf("Update Ratio: %.2f%% (Writes|RMWs: %.2f%%|%.2f%%)\n",
           100.0 * ((double)writes + (double)rmws) / cmd_count,
           100.0 * (double)writes / cmd_count,
           100.0 * (double)rmws / cmd_count);
  }
  (*cmds)[cmd_count].opcode = NOP;
  // printf("Thread %d Trace w_size: %d, debug counter %d hot keys %d, cold keys
  // %d \n",l_id, cmd_count, debug_cnt,
  //         t_stats[l_id].hot_keys_per_trace, t_stats[l_id].cold_keys_per_trace
  //         );
  assert(cmd_count == debug_cnt);
  fclose(fp);
  if (line) free(line);
  return cmd_count;
}

// Fills `*trace` with the command trace of worker `worker_gid`.
// When FEED_FROM_TRACE is 1 the trace is parsed from
//   <cwd>/../../traces/current-splitted-traces/t_<gid, 4 digits>_a_<zipf>.txt
// where <zipf> is ZIPF_EXPONENT_OF_TRACE / 100 printed with two decimals;
// otherwise a uniform trace is generated in memory.
// Exits the process if the current working directory cannot be determined.
void
trace_init(struct spacetime_trace_command** trace, uint16_t worker_gid)
{
  if (FEED_FROM_TRACE == 1) {
    // get / create path for the trace
    char path[2048];
    char cwd[1024];
    char* was_successful = getcwd(cwd, sizeof(cwd));

    if (!was_successful) {
      printf("ERROR: getcwd failed!\n");
      exit(EXIT_FAILURE);
    }

    double zipf_exponent = ZIPF_EXPONENT_OF_TRACE / 100.0;

    // snprintf bounds the write to sizeof(path)
    snprintf(path, sizeof(path), "%s%s%04d%s%.2f%s", cwd,
             "/../../traces/current-splitted-traces/t_", worker_gid, "_a_",
             zipf_exponent, ".txt");

    // initialize the command array from the trace file
    parse_trace(path, trace, worker_gid);
  } else
    create_uni_trace(trace, worker_gid);
}

// Allocates and initializes the KVS op buffers.
//   ops:          receives a 4096-byte aligned array of
//                 MAX_BATCH_KVS_OPS_SIZE ops
//   inv_recv_ops, ack_recv_ops, val_recv_ops:
//                 receive arrays of credits_num * MAX_REMOTE_MACHINES *
//                 max_coalesce entries for incoming INVs, ACKs and VALs
// All buffers are zeroed and every entry's opcode (and, for `ops`, state) is
// set to ST_EMPTY. With ENABLE_RMWs the ACK entries are sized and addressed
// as spacetime_inv_t.
void
setup_kvs_buffs(spacetime_op_t** ops, spacetime_inv_t** inv_recv_ops,
                spacetime_ack_t** ack_recv_ops, spacetime_val_t** val_recv_ops)
{
  *ops = memalign(4096, MAX_BATCH_KVS_OPS_SIZE * (sizeof(spacetime_op_t)));
  // Check the allocation itself (not the out-parameter) before touching it
  assert(*ops != NULL);
  memset(*ops, 0, MAX_BATCH_KVS_OPS_SIZE * (sizeof(spacetime_op_t)));

  // Dirty way to support ACKs that might be as big as INVs
  size_t ack_size =
      ENABLE_RMWs ? sizeof(spacetime_inv_t) : sizeof(spacetime_ack_t);
  spacetime_inv_t** rmw_ack_r_ops = (spacetime_inv_t**)ack_recv_ops;
  /// Network ops
  /// TODO should we memalign aswell?

  uint32_t no_ops =
      (uint32_t)(credits_num * MAX_REMOTE_MACHINES *
                 max_coalesce);  // credits * remote_machines * max_req_coalesce
  //    uint32_t no_ops = (uint32_t) (credits_num * remote_machine_num *
  //    max_coalesce); //credits * remote_machines * max_req_coalesce
  *inv_recv_ops = (spacetime_inv_t*)malloc(no_ops * sizeof(spacetime_inv_t));
  *ack_recv_ops = (spacetime_ack_t*)malloc(no_ops * ack_size);
  *val_recv_ops = (spacetime_val_t*)malloc(
      no_ops *
      sizeof(spacetime_val_t)); /* Batch of incoming broadcasts for the Cache*/
  assert(*inv_recv_ops != NULL && *ack_recv_ops != NULL &&
         *val_recv_ops != NULL);

  memset(*inv_recv_ops, 0, no_ops * sizeof(spacetime_inv_t));
  // Zero the whole ACK allocation: its entries are ack_size bytes each, which
  // differs from sizeof(spacetime_ack_t) when RMWs are enabled.
  memset(*ack_recv_ops, 0, no_ops * ack_size);
  memset(*val_recv_ops, 0, no_ops * sizeof(spacetime_val_t));

  for (uint32_t i = 0; i < no_ops; ++i) {
    (*val_recv_ops)[i].opcode = ST_EMPTY;
    (*inv_recv_ops)[i].op_meta.opcode = ST_EMPTY;
    if (ENABLE_RMWs == 0)
      (*ack_recv_ops)[i].opcode = ST_EMPTY;
    else
      (*rmw_ack_r_ops)[i].op_meta.opcode = ST_EMPTY;
  }

  for (int i = 0; i < MAX_BATCH_KVS_OPS_SIZE; ++i) {
    (*ops)[i].op_meta.opcode = ST_EMPTY;
    (*ops)[i].op_meta.state = ST_EMPTY;
  }
}


================================================
FILE: src/mica-herd/city.c
================================================
// city.c - cityhash-c
// CityHash on C
// Copyright (c) 2011-2012, Alexander Nusov
//
// - original copyright notice -
// Copyright (c) 2011 Google, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
//
// CityHash, by Geoff Pike and Jyrki Alakuijala
//
// This file provides CityHash64() and related functions.
//
// It's probably possible to create even faster hash functions by
// writing a program that systematically explores some of the space of
// possible hash functions, by using SIMD instructions, or by
// compromising on hash quality.

#include "city.h"
#include <string.h>

// Loads a uint64 from `p` through memcpy, so `p` does not have to be aligned.
static uint64
UNALIGNED_LOAD64(const char* p)
{
  uint64 loaded;
  memcpy(&loaded, p, sizeof loaded);
  return loaded;
}

// Loads a uint32 from `p` through memcpy, so `p` does not have to be aligned.
static uint32
UNALIGNED_LOAD32(const char* p)
{
  uint32 loaded;
  memcpy(&loaded, p, sizeof loaded);
  return loaded;
}

// uintNN_in_expected_order(x): applied by Fetch32/Fetch64 to freshly loaded
// words. Without WORDS_BIGENDIAN the value is used as loaded; with it the
// bytes are swapped using whichever bswap implementation the platform offers.
#if !defined(WORDS_BIGENDIAN)

#define uint32_in_expected_order(x) (x)
#define uint64_in_expected_order(x) (x)

#else

#ifdef _MSC_VER
#include <stdlib.h>
#define bswap_32(x) _byteswap_ulong(x)
#define bswap_64(x) _byteswap_uint64(x)

#elif defined(__APPLE__)
// Mac OS X / Darwin features
#include <libkern/OSByteOrder.h>
#define bswap_32(x) OSSwapInt32(x)
#define bswap_64(x) OSSwapInt64(x)

#else
// Everything else: take bswap_32 / bswap_64 from <byteswap.h>
#include <byteswap.h>
#endif

#define uint32_in_expected_order(x) (bswap_32(x))
#define uint64_in_expected_order(x) (bswap_64(x))

#endif  // WORDS_BIGENDIAN

// LIKELY(x): branch hint. Expands to __builtin_expect(!!(x), 1) when
// HAVE_BUILTIN_EXPECT evaluates to non-zero and to plain (x) otherwise.
// An already existing LIKELY definition is left untouched.
#if !defined(LIKELY)
#if HAVE_BUILTIN_EXPECT
#define LIKELY(x) (__builtin_expect(!!(x), 1))
#else
#define LIKELY(x) (x)
#endif
#endif

// Reads a uint64 from a possibly unaligned address and passes it through
// uint64_in_expected_order().
static uint64
Fetch64(const char* p)
{
  const uint64 raw = UNALIGNED_LOAD64(p);
  return uint64_in_expected_order(raw);
}

// Reads a uint32 from a possibly unaligned address and passes it through
// uint32_in_expected_order().
static uint32
Fetch32(const char* p)
{
  const uint32 raw = UNALIGNED_LOAD32(p);
  return uint32_in_expected_order(raw);
}

// Some primes between 2^63 and 2^64 for various uses.
// They serve as multipliers, seeds and mixing constants in the hash
// routines of this file.
static const uint64 k0 = 0xc3a5c85c97cb3127ULL;
static const uint64 k1 = 0xb492b66fbe98f273ULL;
static const uint64 k2 = 0x9ae16a3b2f90404fULL;
static const uint64 k3 = 0xc949d7c7509e6557ULL;

// Reduces a 128-bit value to a 64-bit hash.
// Intended to be a reasonably good hash function (Murmur-inspired mixing:
// multiply by kMul, fold the high bits down with a 47-bit shift, repeat).
static inline uint64
Hash128to64(const uint128 x)
{
  const uint64 kMul = 0x9ddfea08eb382d69ULL;
  const uint64 low = Uint128Low64(x);
  const uint64 high = Uint128High64(x);

  uint64 mixed = (low ^ high) * kMul;
  mixed ^= (mixed >> 47);

  uint64 folded = (high ^ mixed) * kMul;
  folded ^= (folded >> 47);

  return folded * kMul;
}

// Bitwise right rotation of a 64-bit value by `shift` bits. Normally this
// compiles to a single instruction, especially when the shift is a manifest
// constant.
static uint64
Rotate(uint64 val, int shift)
{
  // A zero rotation is handled separately: the general formula would shift
  // left by 64, which yields an undefined result.
  if (shift == 0) return val;

  return (val >> shift) | (val << (64 - shift));
}

// Same as Rotate(), except that `shift` must be non-zero (no zero check is
// made). On x86-64, and probably others, this can compile to a single
// instruction if both args are already in registers.
static uint64
RotateByAtLeast1(uint64 val, int shift)
{
  const uint64 low_part = val >> shift;
  const uint64 high_part = val << (64 - shift);
  return low_part | high_part;
}

// XORs `val` with itself shifted right by 47 bits.
static uint64
ShiftMix(uint64 val)
{
  const uint64 high_bits = val >> 47;
  return val ^ high_bits;
}

// Hashes two 64-bit words into one by packing them into a uint128
// (u -> first, v -> second) and reducing it with Hash128to64().
static uint64
HashLen16(uint64 u, uint64 v)
{
  uint128 pair;
  pair.second = v;
  pair.first = u;
  return Hash128to64(pair);
}

static uint64
HashLen0to16(const char* s, size_t len)
{
  if (len > 8) {
    uint64 a = Fetch64(s);
    uint64 b = Fetch64(s + len - 8);
    return HashLen16(a, RotateByAtLeast1(b + len, (int)len)) ^ b;
  }
  if (len >= 4) {
    uint64 a = Fetch32(s);
    return HashLen16(len + (a << 3), Fetch32(s + len - 4));
  }
  if (len > 0) {
    uint8 a = (uint8)s[0];
    uint8 b = (uint8)s[len >> 1];
    uint8 c = (uint8)s[len - 1];
    uint32 y = (uint32)(a) + ((uint32)(b) << 8);
    uint32 z = (uint32)len + ((uint32)(c) << 2);
    return ShiftMix(y * k2 ^ z * k3) * k2;
  }
  return k2;
}

// 64-bit hash for inputs of 17 to 32 bytes. This probably works well for
// 16-byte strings as well, but it may be overkill in that case.
static uint64
HashLen17to32(const char* s, size_t len)
{
  const uint64 word0 = Fetch64(s) * k1;
  const uint64 word1 = Fetch64(s + 8);
  const uint64 last = Fetch64(s + len - 8) * k2;
  const uint64 before_last = Fetch64(s + len - 16) * k0;

  const uint64 u = Rotate(word0 - word1, 43) + Rotate(last, 30) + before_last;
  const uint64 v = word0 + Rotate(word1 ^ k3, 20) - last + len;
  return HashLen16(u, v);
}

// Return a 16-byte hash for 48 bytes (four input words w, x, y, z plus the
// two seeds a and b).  Quick and dirty.
// Callers do best to use "random-looking" values for a and b.
uint128
WeakHashLen32WithSeeds6(uint64 w, uint64 x, uint64 y, uint64 z, uint64 a,
                        uint64 b)
{
  const uint64 seeded = a + w;
  const uint64 total = seeded + x + y;
  const uint64 rotated = Rotate(b + seeded + z, 21) + Rotate(total, 44);

  uint128 out;
  out.first = (uint64)(total + z);
  out.second = (uint64)(rotated + seeded);
  return out;
}

// Return a 16-byte hash for s[0] ... s[31], a, and b.  Quick and dirty.
// The 32 bytes are read as four 64-bit words and handed to
// WeakHashLen32WithSeeds6().
uint128
WeakHashLen32WithSeeds(const char* s, uint64 a, uint64 b)
{
  const uint64 w = Fetch64(s);
  const uint64 x = Fetch64(s + 8);
  const uint64 y = Fetch64(s + 16);
  const uint64 z = Fetch64(s + 24);
  return WeakHashLen32WithSeeds6(w, x, y, z, a, b);
}

// Return an 8-byte hash for 33 to 64 bytes.
// Two passes of the same mixing pattern are run - one anchored at the start
// of the input, one at its end - and their results are combined.
static uint64
HashLen33to64(const char* s, size_t len)
{
  // Pass 1: words at offsets 0, 8, 16 and 24 (plus the word at len - 16)
  uint64 z = Fetch64(s + 24);
  uint64 a = Fetch64(s) + (len + Fetch64(s + len - 16)) * k0;
  uint64 b = Rotate(a + z, 52);
  uint64 c = Rotate(a, 37);
  a += Fetch64(s + 8);
  c += Rotate(a, 7);
  a += Fetch64(s + 16);
  uint64 vf = a + z;
  uint64 vs = b + Rotate(a, 31) + c;
  // Pass 2: the last four words (offsets len - 32 .. len - 8), started from
  // the word at offset 16
  a = Fetch64(s + 16) + Fetch64(s + len - 32);
  z = Fetch64(s + len - 8);
  b = Rotate(a + z, 52);
  c = Rotate(a, 37);
  a += Fetch64(s + len - 24);
  c += Rotate(a, 7);
  a += Fetch64(s + len - 16);
  uint64 wf = a + z;
  uint64 ws = b + Rotate(a, 31) + c;
  // Combine the (vf, vs) and (wf, ws) results of the two passes
  uint64 r = ShiftMix((vf + ws) * k2 + (wf + vs) * k0);
  return ShiftMix(r * k0 + vs) * k2;
}

// Returns a 64-bit hash of the `len` bytes starting at `s`.
// Inputs of up to 64 bytes are delegated to the length-specialised helpers
// (HashLen0to16, HashLen17to32, HashLen33to64); longer inputs are processed
// in 64-byte chunks.
uint64
CityHash64(const char* s, size_t len)
{
  if (len <= 32) {
    if (len <= 16) {
      return HashLen0to16(s, len);
    } else {
      return HashLen17to32(s, len);
    }
  } else if (len <= 64) {
    return HashLen33to64(s, len);
  }

  // For strings over 64 bytes we hash the end first, and then as we
  // loop we keep 56 bytes of state: v, w, x, y, and z.
  uint64 x = Fetch64(s + len - 40);
  uint64 y = Fetch64(s + len - 16) + Fetch64(s + len - 56);
  uint64 z = HashLen16(Fetch64(s + len - 48) + len, Fetch64(s + len - 24));
  uint64 temp;
  uint128 v = WeakHashLen32WithSeeds(s + len - 64, len, z);
  uint128 w = WeakHashLen32WithSeeds(s + len - 32, y + k1, x);
  x = x * k1 + Fetch64(s);

  // Decrease len to the nearest multiple of 64, and operate on 64-byte chunks.
  // (len > 64 here, so the result is at least 64 and the loop below
  // terminates when it reaches exactly 0.)
  len = (len - 1) & ~(size_t)(63);
  do {
    x = Rotate(x + y + v.first + Fetch64(s + 8), 37) * k1;
    y = Rotate(y + v.second + Fetch64(s + 48), 42) * k1;
    x ^= w.second;
    y += v.first + Fetch64(s + 40);
    z = Rotate(z + w.first, 33) * k1;
    v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first);
    w = WeakHashLen32WithSeeds(s + 32, z + w.second, y + Fetch64(s + 16));
    // swap x and z
    temp = z;
    z = x;
    x = temp;
    s += 64;
    len -= 64;
  } while (len != 0);
  // Fold the 56 bytes of state down to 64 bits
  return HashLen16(HashLen16(v.first, w.first) + ShiftMix(y) * k1 + z,
                   HashLen16(v.second, w.second) + x);
}

// Seeded 64-bit hash of `len` bytes at `s`: the two-seed variant with k2 as
// the first seed and `seed` as the second.
uint64
CityHash64WithSeed(const char* s, size_t len, uint64 seed)
{
  const uint64 default_seed0 = k2;
  return CityHash64WithSeeds(s, len, default_seed0, seed);
}

// 64-bit hash of `len` bytes at `s` that also folds in two seeds: seed0 is
// subtracted from the unseeded CityHash64 and the result is combined with
// seed1 through HashLen16().
uint64
CityHash64WithSeeds(const char* s, size_t len, uint64 seed0, uint64 seed1)
{
  const uint64 unseeded = CityHash64(s, len);
  return HashLen16(unseeded - seed0, seed1);
}

// A subroutine for CityHash128().  Returns a decent 128-bit hash for strings
// of any length representable in signed long.  Based on City and Murmur.
// `seed` supplies the initial values of the a (low word) and b (high word)
// accumulators.
static uint128
CityMurmur(const char* s, size_t len, uint128 seed)
{
  uint64 a = Uint128Low64(seed);
  uint64 b = Uint128High64(seed);
  uint64 c = 0;
  uint64 d = 0;
  // Signed so that inputs shorter than 16 bytes yield a negative value
  signed long l = (signed long)(len - 16);
  if (l <= 0) {  // len <= 16
    a = ShiftMix(a * k1) * k1;
    c = b * k1 + HashLen0to16(s, len);
    // s is only dereferenced here when at least 8 bytes are available
    d = ShiftMix(a + (len >= 8 ? Fetch64(s) : c));
  } else {  // len > 16
    c = HashLen16(Fetch64(s + len - 8) + k1, a);
    d = HashLen16(b + len, c + Fetch64(s + len - 16));
    a += d;
    // Consume the input 16 bytes per iteration
    do {
      a ^= ShiftMix(Fetch64(s) * k1) * k1;
      a *= k1;
      b ^= a;
      c ^= ShiftMix(Fetch64(s + 8) * k1) * k1;
      c *= k1;
      d ^= c;
      s += 16;
      l -= 16;
    } while (l > 0);
  }
  a = HashLen16(a, c);
  b = HashLen16(d, b);

  uint128 result;
  result.first = (uint64)(a ^ b);
  result.second = (uint64)(HashLen16(b, a));
  return result;
}

// Returns a 128-bit hash of the `len` bytes at `s`, seeded with `seed`.
// Inputs shorter than 128 bytes are delegated to CityMurmur(); longer ones
// are consumed 128 bytes per loop iteration, followed by a tail pass.
uint128
CityHash128WithSeed(const char* s, size_t len, uint128 seed)
{
  if (len < 128) {
    return CityMurmur(s, len, seed);
  }

  // We expect len >= 128 to be the common case.  Keep 56 bytes of state:
  // v, w, x, y, and z.
  uint128 v, w;
  uint64 x = Uint128Low64(seed);
  uint64 y = Uint128High64(seed);
  uint64 z = len * k1;
  uint64 temp;
  v.first = Rotate(y ^ k1, 49) * k1 + Fetch64(s);
  v.second = Rotate(v.first, 42) * k1 + Fetch64(s + 8);
  w.first = Rotate(y + z, 35) * k1 + x;
  w.second = Rotate(x + Fetch64(s + 88), 53) * k1;

  // This is the same inner loop as CityHash64(), manually unrolled.
  do {
    // first 64-byte chunk
    x = Rotate(x + y + v.first + Fetch64(s + 8), 37) * k1;
    y = Rotate(y + v.second + Fetch64(s + 48), 42) * k1;
    x ^= w.second;
    y += v.first + Fetch64(s + 40);
    z = Rotate(z + w.first, 33) * k1;
    v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first);
    w = WeakHashLen32WithSeeds(s + 32, z + w.second, y + Fetch64(s + 16));
    temp = z;
    z = x;
    x = temp;
    s += 64;
    // second 64-byte chunk
    x = Rotate(x + y + v.first + Fetch64(s + 8), 37) * k1;
    y = Rotate(y + v.second + Fetch64(s + 48), 42) * k1;
    x ^= w.second;
    y += v.first + Fetch64(s + 40);
    z = Rotate(z + w.first, 33) * k1;
    v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first);
    w = WeakHashLen32WithSeeds(s + 32, z + w.second, y + Fetch64(s + 16));
    temp = z;
    z = x;
    x = temp;
    s += 64;
    len -= 128;
  } while (LIKELY(len >= 128));
  x += Rotate(v.first + z, 49) * k0;
  z += Rotate(w.first, 37) * k0;
  // If 0 < len < 128, hash up to 4 chunks of 32 bytes each from the end of s.
  // The chunks are taken back to front; when len is not a multiple of 32 the
  // last one starts before s, i.e. inside data the main loop already consumed.
  size_t tail_done;
  for (tail_done = 0; tail_done < len;) {
    tail_done += 32;
    y = Rotate(x + y, 42) * k0 + v.second;
    w.first += Fetch64(s + len - tail_done + 16);
    x = x * k0 + w.first;
    z += w.second + Fetch64(s + len - tail_done);
    w.second += v.first;
    v = WeakHashLen32WithSeeds(s + len - tail_done, v.first + z, v.second);
  }
  // At this point our 56 bytes of state should contain more than
  // enough information for a strong 128-bit hash.  We use two
  // different 56-byte-to-8-byte hashes to get a 16-byte final result.
  x = HashLen16(x, v.first);
  y = HashLen16(y + z, w.first);

  uint128 result;
  result.first = (uint64)(HashLen16(x + v.second, w.second) + y);
  result.second = (uint64)HashLen16(x + w.second, y + v.second);
  return result;
}

// Returns a 128-bit hash of the `len` bytes at `s`. A seed is derived from
// the input (or from k0/k1 for inputs shorter than 8 bytes) and the work is
// handed to CityHash128WithSeed().
uint128
CityHash128(const char* s, size_t len)
{
  uint128 seed;

  // Fewer than 8 bytes: fixed seed, hash the whole input
  if (len < 8) {
    seed.first = (uint64)k0;
    seed.second = (uint64)k1;
    return CityHash128WithSeed(s, len, seed);
  }

  // 8..15 bytes: the input is folded entirely into the seed, nothing is left
  // to hash afterwards
  if (len < 16) {
    seed.first = (uint64)(Fetch64(s) ^ (len * k0));
    seed.second = (uint64)(Fetch64(s + len - 8) ^ k1);
    return CityHash128WithSeed(NULL, 0, seed);
  }

  // 16 bytes or more: the first 16 bytes become the seed for the remainder
  seed.first = (uint64)(Fetch64(s) ^ k3);
  seed.second = (uint64)(Fetch64(s + 8));
  return CityHash128WithSeed(s + 16, len - 16, seed);
}

#ifdef __SSE4_2__
#include <nmmintrin.h>
#include "citycrc.h"

// Requires len >= 240.
// Computes a 256-bit hash of the `len` bytes at `s` and stores it in
// result[0..3] (result[0] and result[1] also hold intermediate values while
// the function runs). `seed` is mixed into the initial state.
static void
CityHashCrc256Long(const char* s, size_t len, uint32 seed, uint64* result)
{
  uint64 a = Fetch64(s + 56) + k0;
  uint64 b = Fetch64(s + 96) + k0;
  uint64 c = result[0] = HashLen16(b, len);
  uint64 d = result[1] = Fetch64(s + 120) * k0 + len;
  uint64 e = Fetch64(s + 184) + seed;
  uint64 f = seed;
  uint64 g = 0;
  uint64 h = 0;
  uint64 i = 0;
  uint64 j = 0;
  uint64 t = c + d;

  // 240 bytes of input per iter.
  size_t iters = len / 240;
  len -= iters * 240;
  do {
// CHUNK consumes 40 bytes at s: it refreshes a..e (and t) from five input
// words, folds each into one of the CRC accumulators f..j and advances s.
// The macro stays defined after this function.
#define CHUNK(multiplier, z)                              \
  {                                                       \
    uint64 old_a = a;                                     \
    a = Rotate(b, 41 ^ z) * multiplier + Fetch64(s);      \
    b = Rotate(c, 27 ^ z) * multiplier + Fetch64(s + 8);  \
    c = Rotate(d, 41 ^ z) * multiplier + Fetch64(s + 16); \
    d = Rotate(e, 33 ^ z) * multiplier + Fetch64(s + 24); \
    e = Rotate(t, 25 ^ z) * multiplier + Fetch64(s + 32); \
    t = old_a;                                            \
  }                                                       \
  f = _mm_crc32_u64(f, a);                                \
  g = _mm_crc32_u64(g, b);                                \
  h = _mm_crc32_u64(h, c);                                \
  i = _mm_crc32_u64(i, d);                                \
  j = _mm_crc32_u64(j, e);                                \
  s += 40

    CHUNK(1, 1);
    CHUNK(k0, 0);
    CHUNK(1, 1);
    CHUNK(k0, 0);
    CHUNK(1, 1);
    CHUNK(k0, 0);
  } while (--iters > 0);

  // Remaining whole 40-byte chunks
  while (len >= 40) {
    CHUNK(k0, 0);
    len -= 40;
  }
  // Final partial chunk: step back so that the last 40 bytes of the input
  // are hashed (overlapping bytes that were already consumed)
  if (len > 0) {
    s = s + len - 40;
    CHUNK(k0, 0);
  }
  j += i << 32;
  a = HashLen16(a, j);
  h += g << 32;
  b += h;
  c = HashLen16(c, f) + i;
  d = HashLen16(d, e + result[0]);
  j += e;
  i += HashLen16(h, t);
  e = HashLen16(a, d) + j;
  f = HashLen16(b, c) + a;
  g = HashLen16(j, i) + c;
  result[0] = e + f + g + h;
  a = ShiftMix((a + g) * k0) * k0 + b;
  result[1] += a + result[0];
  a = ShiftMix(a * k0) * k0 + c;
  result[2] = a + result[1];
  a = ShiftMix((a + e) * k0) * k0;
  result[3] = a + result[2];
}

// Requires len < 240.
//
// Copies the input into a zero-filled 240-byte scratch buffer and hashes that
// buffer with CityHashCrc256Long(), using the bitwise complement of the
// original length as the seed.
static void
CityHashCrc256Short(const char* s, size_t len, uint64* result)
{
  enum { kPaddedLen = 240 };

  /* Zero-initialised, so every byte past the copied input stays 0. */
  char padded[kPaddedLen] = {0};
  memcpy(padded, s, len);

  CityHashCrc256Long(padded, kPaddedLen, ~(uint32)(len), result);
}

/*
 * 256-bit hash of the @len bytes at @s, written to result[0..3].
 * Inputs of at least 240 bytes go straight to the long variant (seed 0);
 * shorter inputs take the padding path.
 */
void
CityHashCrc256(const char* s, size_t len, uint64* result)
{
  if (LIKELY(len >= 240)) {
    CityHashCrc256Long(s, len, 0, result);
    return;
  }

  CityHashCrc256Short(s, len, result);
}

/*
 * Seeded 128-bit hash. Inputs of up to 900 bytes are delegated to
 * CityHash128WithSeed(); longer inputs are hashed with CityHashCrc256() and
 * the four resulting words are combined with @seed into 128 bits.
 */
uint128
CityHashCrc128WithSeed(const char* s, size_t len, uint128 seed)
{
  if (len <= 900) return CityHash128WithSeed(s, len, seed);

  uint64 crc256[4];
  CityHashCrc256(s, len, crc256);

  const uint64 hi = Uint128High64(seed) + crc256[0];
  const uint64 lo = Uint128Low64(seed) + crc256[1];

  uint128 out;
  out.first = (uint64)(HashLen16(hi, lo + crc256[2]));
  out.second = (uint64)(HashLen16(Rotate(lo, 32), hi * k0 + crc256[3]));
  return out;
}

uint128
CityHashCrc128(const char* s, size_t len)
{
  if (len <= 900) {
    return CityHash128(s, len);
  } else {
    uint64 result[4];
    CityHashCrc256(s, len, result);
    uint128 crc;
    crc.first = (uint64)result[2];
    crc.second = (uint64)result[3];
    return crc;
  }
}

#endif


================================================
FILE: src/mica-herd/herd.c
================================================
#include "hrd.h"

/* Every thread creates a TCP connection to the registry only once. */
__thread memcached_st* memc = NULL;

/* Name of the IB device to use: devices with any other name are skipped. */
char dev_name[50];

/*
 * Finds the port with rank `port_index` (0-based) among the ENABLED ports of
 * the device(s) named `dev_name`. On success the device id and the
 * device-local port id (1-based) are stored in the supplied control block and
 * the matching device is returned. If no such port exists the process exits.
 */
struct ibv_device*
hrd_resolve_port_index(struct hrd_ud_ctrl_blk* cb, int port_index)
{
  assert(port_index >= 0);

  /* The control block stays "unresolved" until a suitable port is found */
  cb->device_id = -1;
  cb->dev_port_id = -1;

  int num_devices = 0;
  struct ibv_device** dev_list = ibv_get_device_list(&num_devices);
  CPE(!dev_list, "HRD: Failed to get IB devices list", 0);

  /* Enabled ports that still have to be skipped before the wanted one */
  int ports_left = port_index;

  for (int dev_i = 0; dev_i < num_devices; dev_i++) {
    if (strcmp(dev_list[dev_i]->name, dev_name) != 0) continue;

    struct ibv_context* ctx = ibv_open_device(dev_list[dev_i]);
    CPE(!ctx, "HRD: Couldn't open device", 0);

    struct ibv_device_attr device_attr;
    memset(&device_attr, 0, sizeof(device_attr));
    if (ibv_query_device(ctx, &device_attr)) {
      printf("HRD: Could not query device: %d\n", dev_i);
      exit(-1);
    }

    for (uint8_t port_i = 1; port_i <= device_attr.phys_port_cnt; port_i++) {
      struct ibv_port_attr port_attr;
      if (ibv_query_port(ctx, port_i, &port_attr) != 0) {
        printf("HRD: Could not query port %d of device %d\n", port_i, dev_i);
        exit(-1);
      }

      /* Only enabled ports take part in the ranking */
      const int enabled = port_attr.phys_state == IBV_PORT_ACTIVE ||
                          port_attr.phys_state == IBV_PORT_ACTIVE_DEFER;
      if (!enabled) {
#ifdef __cplusplus
        const char* state_str =
            ibv_port_state_str((ibv_port_state)port_attr.phys_state);
#else
        const char* state_str = ibv_port_state_str(port_attr.phys_state);
#endif
        printf("HRD: Ignoring port %d of device %d. State is %s\n", port_i,
               dev_i, state_str);
        continue;
      }

      if (ports_left != 0) {
        ports_left--;
        continue;
      }

      /* This is the requested port: record it and hand back its device */
      cb->device_id = dev_i;
      cb->dev_port_id = port_i;

      if (ibv_close_device(ctx)) {
        fprintf(stderr, "HRD: Couldn't release context\n");
        assert(false);
      }

      return dev_list[dev_i];
    }

    if (ibv_close_device(ctx)) {
      fprintf(stderr, "HRD: Couldn't release context\n");
      assert(false);
    }
  }

  /* If we come here, port resolution failed */
  assert(cb->device_id == -1 && cb->dev_port_id == -1);
  fprintf(stderr, "HRD: Invalid port index %d. Exiting.\n", port_index);
  exit(-1);
}

/*
 * Create a new System V shared-memory segment of @size bytes under @shm_key
 * (hugepage-backed when USE_HUGE_PAGES == 1), attach it, and bind it to the
 * NUMA node @socket_id with mbind().
 *
 * Returns the attached address. Any failure is fatal: shmget() errors are
 * reported and trip an assert, shmat()/mbind() errors exit the process.
 */
void*
hrd_malloc_socket(int shm_key, uint64_t size, int socket_id)
{
  int shmid;
  int shm_flag =
      IPC_CREAT | IPC_EXCL | 0666 | (USE_HUGE_PAGES == 1 ? SHM_HUGETLB : 0);

  shmid = shmget(shm_key, size, shm_flag);

  if (shmid == -1) {
    switch (errno) {
      case EACCES:
        colored_printf(RED,
                       "HRD: SHM malloc error: Insufficient permissions."
                       " (SHM key = %d)\n",
                       shm_key);
        break;
      case EEXIST:
        colored_printf(RED,
                       "HRD: SHM malloc error: Already exists."
                       " (SHM key = %d)\n",
                       shm_key);
        break;
      case EINVAL:
        colored_printf(RED,
                       "HRD: SHM malloc error: SHMMAX/SHMIN mismatch."
                       " (SHM key = %d, size = %lu)\n",
                       shm_key, size);
        break;
      case ENOMEM:
        colored_printf(RED,
                       "HRD: SHM malloc error: Insufficient memory."
                       " (SHM key = %d, size = %lu)\n",
                       shm_key, size);
        break;
      case ENOENT:
        colored_printf(RED,
                       "HRD: SHM malloc error: No segment exists for the given "
                       "key, and IPC_CREAT was not specified."
                       " (SHM key = %d, size = %lu)\n",
                       shm_key, size);
        break;
      case ENOSPC:
        colored_printf(
            RED,
            "HRD: SHM malloc error: All possible shared memory IDs have been "
            "taken or the limit of shared memory is exceeded."
            " (SHM key = %d, size = %lu)\n",
            shm_key, size);
        break;
      case EPERM:
        colored_printf(RED,
                       "HRD: SHM malloc error: The SHM_HUGETLB flag was "
                       "specified, but the caller was not privileged"
                       " (SHM key = %d, size = %lu)\n",
                       shm_key, size);
        break;
      case ENFILE:
        colored_printf(RED,
                       "HRD: SHM malloc error: The system-wide limit on the "
                       "total number of open files has been reached."
                       " (SHM key = %d, size = %lu)\n",
                       shm_key, size);
        break;
      default:
        colored_printf(RED, "HRD: SHM malloc error: A wild SHM error: %s.\n",
                       strerror(errno));
        break;
    }
    assert(false);
  }

  /* shmat() reports failure with (void*)-1, not with NULL */
  void* buf = shmat(shmid, NULL, 0);
  if (buf == (void*)-1) {
    printf("HRD: SHM malloc error: shmat() failed for key %d\n", shm_key);
    exit(-1);
  }

  /* Bind the buffer to this socket. The mask is built in unsigned long so
   * the shift is done in the type that mbind() reads. */
  const unsigned long nodemask = (1UL << socket_id);
  int ret = mbind(buf, size, MPOL_BIND, &nodemask, 32, 0);
  if (ret != 0) {
    printf("HRD: SHM malloc error. mbind() failed for key %d\n", shm_key);
    exit(-1);
  }

  // vasilis- try to take advantage of TLB coalescing, if it is there
  // Touches the first byte at every HUGE_PAGE_SIZE stride; note the loop
  // bound is one less than CEILING(size, HUGE_PAGE_SIZE).
  if (LEVERAGE_TLB_COALESCING) {
    uint64_t page_no = CEILING(size, HUGE_PAGE_SIZE) - 1;
    for (uint64_t i = 0; i < page_no; i++) {
      uint8_t* buf_ptr = ((uint8_t*)buf) + (i * HUGE_PAGE_SIZE);
      memset(buf_ptr, 0, 1);
    }
  }

  return buf;
}

/*
 * Remove the shared-memory segment registered under @shm_key and detach
 * @shm_buf. Returns 0 on success and -1 when the segment cannot be looked up;
 * a failing shmctl() or shmdt() terminates the process.
 */
int
hrd_free(int shm_key, void* shm_buf)
{
  const int shmid = shmget(shm_key, 0, 0);

  if (shmid == -1) {
    const int err = errno;

    if (err == EACCES) {
      printf(
          "HRD: SHM free error: Insufficient permissions."
          " (SHM key = %d)\n",
          shm_key);
    } else if (err == ENOENT) {
      printf("HRD: SHM free error: No such SHM key. (SHM key = %d)\n",
             shm_key);
    } else {
      printf("HRD: SHM free error: A wild SHM error: %s\n", strerror(err));
    }
    return -1;
  }

  /* Mark the segment for removal first, then drop our own mapping */
  if (shmctl(shmid, IPC_RMID, NULL) != 0) {
    printf("HRD: SHM free error: shmctl() failed for key %d\n", shm_key);
    exit(-1);
  }

  if (shmdt(shm_buf) != 0) {
    printf("HRD: SHM free error: shmdt() failed for key %d\n", shm_key);
    exit(-1);
  }

  return 0;
}

/* Get the LID of a port on the device specified by @ctx */
uint16_t
hrd_get_local_lid(struct ibv_context* ctx, int dev_port_id)
{
  assert(ctx != NULL && dev_port_id >= 1);

  struct ibv_port_attr attr;
  if (ibv_query_port(ctx, dev_port_id, &attr)) {
    printf("HRD: ibv_query_port on port %d of device %s failed! Exiting.\n",
           dev_port_id, ibv_get_device_name(ctx->device));
    assert(false);
  }

  return attr.lid;
}

/*
 * Look up the environment variable @name and return its value. A missing
 * variable is reported on stderr and trips an assert.
 */
char*
hrd_getenv(const char* name)
{
  char* value = getenv(name);
  if (value != NULL) return value;

  fprintf(stderr, "Environment variable %s not set\n", name);
  assert(false);

  return value;
}

/*
 * Create a memcached handle connected to the registry whose address is given
 * by the HRD_REGISTRY_IP environment variable (default memcached port).
 * Returns the new handle; exits if the handle cannot be created or the
 * server cannot be added.
 */
memcached_st*
hrd_create_memc()
{
  memcached_return rc;

  /* Create the handle exactly once; a second memcached_create() call would
   * leak the first handle. */
  memcached_st* new_memc = memcached_create(NULL);
  CPE(!new_memc, "HRD: Couldn't create memcached handle", 0);

  char* registry_ip = hrd_getenv("HRD_REGISTRY_IP");
  memcached_server_st* servers = memcached_server_list_append(
      NULL, registry_ip, MEMCACHED_DEFAULT_PORT, &rc);

  // Pushes an array of memcached_server_st into the memcached_st structure.
  // These servers will be placed at the end.
  rc = memcached_server_push(new_memc, servers);

  /* The handle keeps its own copy of the list, so ours can be released. */
  if (servers != NULL) {
    memcached_server_list_free(servers);
  }

  CPE(rc != MEMCACHED_SUCCESS, "Couldn't add memcached server.\n", -1);

  return new_memc;
}

/*
 * Store the @len bytes at @value under @key in the memcached registry
 * running at HRD_REGISTRY_IP. The calling thread's registry connection is
 * created on first use. Any failure to store the value is fatal.
 */
void
hrd_publish(const char* key, void* value, int len)
{
  assert(key != NULL && value != NULL && len > 0);

  if (memc == NULL) memc = hrd_create_memc();

  const memcached_return rc = memcached_set(
      memc, key, strlen(key), (const char*)value, len, (time_t)0, (uint32_t)0);
  if (rc == MEMCACHED_SUCCESS) return;

  char* registry_ip = hrd_getenv("HRD_REGISTRY_IP");
  fprintf(stderr,
          "\tHRD: Failed to publish key %s to memcached. Error %s. "
          "Reg IP = %s\n",
          key, memcached_strerror(memc, rc), registry_ip);
  exit(-1);
}

/*
 * Fetch the value stored under @key into *@value and return its length.
 * If the key is not found, *@value is NULL and -1 is returned. Every other
 * error terminates the process.
 *
 * This function sometimes gets called in a polling loop - ensure that there
 * are no memory leaks or unterminated memcached connections! The result of
 * getenv() does not need to be free()d since it points to a string in the
 * process environment.
 */
int
hrd_get_published(const char* key, void** value)
{
  assert(key != NULL);

  /* The calling thread's registry connection is opened on first use */
  if (memc == NULL) memc = hrd_create_memc();

  memcached_return rc;
  size_t value_length;
  uint32_t flags;

  *value = memcached_get(memc, key, strlen(key), &value_length, &flags, &rc);

  if (rc == MEMCACHED_SUCCESS) return (int)value_length;

  if (rc == MEMCACHED_NOTFOUND) {
    assert(*value == NULL);
    return -1;
  }

  /* Anything else is fatal */
  char* registry_ip = hrd_getenv("HRD_REGISTRY_IP");
  fprintf(stderr,
          "HRD: Error finding value for key \"%s\": %s. "
          "Reg IP = %s\n",
          key, memcached_strerror(memc, rc), registry_ip);
  exit(-1);
}

/*
 * Allocate and initialize a UD control block.
 *
 * Resolves @port_index to a device/port, opens the device, allocates a
 * protection domain, creates @num_dgram_qps datagram QPs (using the per-QP
 * depths in @recv_q_depth / @send_q_depth) and registers a datagram buffer
 * of at least @dgram_buf_size bytes. With @numa_node_id >= 0 the buffer is a
 * SHM segment under @dgram_buf_shm_key whose size is rounded up to a
 * multiple of M_2; with @numa_node_id == -1 it is a 4096-byte aligned heap
 * buffer.
 *
 * @local_hid is only an identifier stored in the block. Returns the new
 * control block; failures either exit the process or trip an assert.
 */
struct hrd_ud_ctrl_blk*
hrd_ud_ctrl_blk_init(int local_hid, int port_index,
                     int numa_node_id, /* -1 means don't use hugepages */
                     int num_dgram_qps, int dgram_buf_size,
                     int dgram_buf_shm_key, int* recv_q_depth,
                     int* send_q_depth)
{
  /*
   * Check arguments for sanity.
   * @local_hid can be anything: it's just control block identifier that is
   * useful in printing debug info.
   */
  assert(port_index >= 0 && port_index <= 16);
  assert(numa_node_id >= -1 && numa_node_id <= 8);
  assert(num_dgram_qps >= 0 && num_dgram_qps <= M_2);
  assert(dgram_buf_size >= 0 && dgram_buf_size <= M_1024);

  if (num_dgram_qps == 0) {
    colored_printf(RED,
                   "HRD: Control block initialization without QPs. Are you"
                   " sure you want to do this?\n");
    assert(false);
  }

  /* calloc() returns zeroed memory; the result is checked before use so an
   * allocation failure cannot turn into a NULL dereference. */
  struct hrd_ud_ctrl_blk* cb =
      (struct hrd_ud_ctrl_blk*)calloc(1, sizeof(struct hrd_ud_ctrl_blk));
  if (cb == NULL) {
    fprintf(stderr, "HRD: Couldn't allocate control block %d\n", local_hid);
    exit(-1);
  }

  /* Fill in the control block */
  cb->local_hid = local_hid;
  cb->numa_node_id = numa_node_id;

  cb->num_dgram_qps = num_dgram_qps;
  cb->dgram_buf_shm_key = dgram_buf_shm_key;

  cb->recv_q_depth = recv_q_depth;
  cb->send_q_depth = send_q_depth;

  /* Get the device to use. This fills in cb->device_id and cb->dev_port_id */
  struct ibv_device* ib_dev = hrd_resolve_port_index(cb, port_index);
  CPE(!ib_dev, "HRD: IB device not found", 0);

  /* Use a single device context and PD for all QPs */
  cb->ctx = ibv_open_device(ib_dev);
  CPE(!cb->ctx, "HRD: Couldn't get context", 0);

  cb->pd = ibv_alloc_pd(cb->ctx);
  CPE(!cb->pd, "HRD: Couldn't allocate PD", 0);

  int ib_flags = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ |
                 IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_ATOMIC;

  /*
   * Create datagram QPs and transition them RTS.
   * Create and register datagram RDMA buffer.
   */
  if (num_dgram_qps >= 1) {
    cb->dgram_qp =
        (struct ibv_qp**)malloc(num_dgram_qps * sizeof(struct ibv_qp*));
    cb->dgram_send_cq =
        (struct ibv_cq**)malloc(num_dgram_qps * sizeof(struct ibv_cq*));
    cb->dgram_recv_cq =
        (struct ibv_cq**)malloc(num_dgram_qps * sizeof(struct ibv_cq*));

    assert(cb->dgram_qp != NULL && cb->dgram_send_cq != NULL &&
           cb->dgram_recv_cq != NULL);

    hrd_create_dgram_qps(cb);

    /* Create and register dgram_buf */
    int reg_size = 0;

    if (numa_node_id >= 0) {
      /* Hugepages: round the size up to a multiple of M_2 */
      while (reg_size < dgram_buf_size) {
        reg_size += M_2;
      }

      /* SHM key 0 is hard to free later */
      assert(dgram_buf_shm_key >= 1 && dgram_buf_shm_key <= 128);
      cb->dgram_buf = (volatile uint8_t*)hrd_malloc_socket(
          dgram_buf_shm_key, reg_size, numa_node_id);
    } else {
      reg_size = dgram_buf_size;
      cb->dgram_buf = (volatile uint8_t*)memalign(4096, reg_size);
    }

    assert(cb->dgram_buf != NULL);
    memset((char*)cb->dgram_buf, 0, reg_size);

    cb->dgram_buf_mr =
        ibv_reg_mr(cb->pd, (char*)cb->dgram_buf, reg_size, ib_flags);
    assert(cb->dgram_buf_mr != NULL);
  }

  return cb;
}

/*
 * Release the verbs resources held by @cb: every datagram QP and its two
 * CQs, the datagram memory region and buffer, the protection domain and the
 * device context. Returns 0 on success and -1 as soon as a verbs call fails.
 * A failure to free the hugepage buffer is only reported. The control block
 * itself and its QP/CQ pointer arrays are not freed here.
 */
int
hrd_ud_ctrl_blk_destroy(struct hrd_ud_ctrl_blk* cb)
{
  int i;
  colored_printf(RED, "HRD: Destroying control block %d\n", cb->local_hid);

  /* Destroy QPs and CQs. QPs must be destroyed before CQs. */
  for (i = 0; i < cb->num_dgram_qps; i++) {
    assert(cb->dgram_send_cq[i] != NULL && cb->dgram_recv_cq[i] != NULL);
    assert(cb->dgram_qp[i] != NULL);

    if (ibv_destroy_qp(cb->dgram_qp[i])) {
      fprintf(stderr, "HRD: Couldn't destroy dgram QP %d\n", i);
      return -1;
    }

    /* A successful QP teardown must continue with the QP's CQs rather than
     * abort the process. */
    if (ibv_destroy_cq(cb->dgram_send_cq[i])) {
      fprintf(stderr, "HRD: Couldn't destroy dgram SEND CQ %d\n", i);
      return -1;
    }

    if (ibv_destroy_cq(cb->dgram_recv_cq[i])) {
      fprintf(stderr, "HRD: Couldn't destroy dgram RECV CQ %d\n", i);
      return -1;
    }
  }

  /* Destroy memory regions */
  if (cb->num_dgram_qps > 0) {
    assert(cb->dgram_buf_mr != NULL && cb->dgram_buf != NULL);
    if (ibv_dereg_mr(cb->dgram_buf_mr)) {
      fprintf(stderr, "HRD: Couldn't deregister dgram MR for cb %d\n",
              cb->local_hid);
      return -1;
    }

    if (cb->numa_node_id >= 0) {
      /* SHM-backed buffer; a failure here is reported but not returned */
      if (hrd_free(cb->dgram_buf_shm_key, (void*)cb->dgram_buf)) {
        fprintf(stderr, "HRD: Error freeing dgram hugepages for cb %d\n",
                cb->local_hid);
      }
    } else {
      free((void*)cb->dgram_buf);
    }
  }

  /* Destroy protection domain */
  if (ibv_dealloc_pd(cb->pd)) {
    fprintf(stderr, "HRD: Couldn't dealloc PD for cb %d\n", cb->local_hid);
    return -1;
  }

  /* Destroy device context */
  if (ibv_close_device(cb->ctx)) {
    fprintf(stderr, "Couldn't release context for cb %d\n", cb->local_hid);
    return -1;
  }

  colored_printf(RED, "HRD: Control block %d destroyed.\n", cb->local_hid);
  return 0;
}

/*
 * Create cb->num_dgram_qps UD queue pairs, each with its own send and
 * receive CQ sized from cb->send_q_depth[i] / cb->recv_q_depth[i], and move
 * every QP through INIT -> RTR -> RTS.
 *
 * The QP/CQ pointer arrays, the PD and the device context of @cb must
 * already be set up. A failed state transition terminates the process, since
 * this function has no way to report an error to its caller.
 */
void
hrd_create_dgram_qps(struct hrd_ud_ctrl_blk* cb)
{
  int i;
  assert(cb->dgram_qp != NULL && cb->dgram_send_cq != NULL &&
         cb->dgram_recv_cq != NULL && cb->pd != NULL && cb->ctx != NULL);
  assert(cb->num_dgram_qps >= 1 && cb->dev_port_id >= 1);

  for (i = 0; i < cb->num_dgram_qps; i++) {
    cb->dgram_send_cq[i] =
        ibv_create_cq(cb->ctx, cb->send_q_depth[i], NULL, NULL, 0);
    assert(cb->dgram_send_cq[i] != NULL);

    /* The receive CQ depth is per-QP as well (was a fixed HRD_Q_DEPTH) */
    cb->dgram_recv_cq[i] =
        ibv_create_cq(cb->ctx, cb->recv_q_depth[i], NULL, NULL, 0);
    assert(cb->dgram_recv_cq[i] != NULL);

    /* Initialize creation attributes */
    struct ibv_qp_init_attr create_attr;
    memset((void*)&create_attr, 0, sizeof(struct ibv_qp_init_attr));
    create_attr.send_cq = cb->dgram_send_cq[i];
    create_attr.recv_cq = cb->dgram_recv_cq[i];
    create_attr.qp_type = IBV_QPT_UD;

    create_attr.cap.max_send_wr = cb->send_q_depth[i];
    create_attr.cap.max_recv_wr = cb->recv_q_depth[i];
    create_attr.cap.max_send_sge = 1;
    create_attr.cap.max_recv_sge = 1;
    create_attr.cap.max_inline_data = HRD_MAX_INLINE;

    cb->dgram_qp[i] = ibv_create_qp(cb->pd, &create_attr);
    assert(cb->dgram_qp[i] != NULL);

    /* INIT state */
    struct ibv_qp_attr init_attr;
    memset((void*)&init_attr, 0, sizeof(struct ibv_qp_attr));
    init_attr.qp_state = IBV_QPS_INIT;
    init_attr.pkey_index = 0;
    init_attr.port_num = cb->dev_port_id;
    init_attr.qkey = HRD_DEFAULT_QKEY;

    if (ibv_modify_qp(
            cb->dgram_qp[i], &init_attr,
            IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_QKEY)) {
      fprintf(stderr, "Failed to modify dgram QP to INIT\n");
      /* Fatal, like the RTR/RTS failures below: returning here would leave
       * this QP unusable and the remaining QPs uncreated. */
      exit(-1);
    }

    /* RTR state */
    struct ibv_qp_attr rtr_attr;
    memset((void*)&rtr_attr, 0, sizeof(struct ibv_qp_attr));
    rtr_attr.qp_state = IBV_QPS_RTR;

    if (ibv_modify_qp(cb->dgram_qp[i], &rtr_attr, IBV_QP_STATE)) {
      fprintf(stderr, "Failed to modify dgram QP to RTR\n");
      exit(-1);
    }

    /* Reuse rtr_attr for RTS */
    rtr_attr.qp_state = IBV_QPS_RTS;
    rtr_attr.sq_psn = HRD_DEFAULT_PSN;

    if (ibv_modify_qp(cb->dgram_qp[i], &rtr_attr,
                      IBV_QP_STATE | IBV_QP_SQ_PSN)) {
      fprintf(stderr, "Failed to modify dgram QP to RTS\n");
      exit(-1);
    }
  }
}

/*
 * Publish the attributes (name, LID, QP number, service level and, for RoCE,
 * the GID) of the @n-th datagram QP of @cb in the registry under @qp_name.
 *
 * @qp_name must be shorter than HRD_QP_NAME_SIZE - 1, must not contain
 * HRD_RESERVED_NAME_PREFIX and must not contain spaces.
 */
void
hrd_publish_dgram_qp(struct hrd_ud_ctrl_blk* cb, int n, const char* qp_name,
                     uint8_t sl)
{
  assert(cb != NULL);
  assert(n >= 0 && n < cb->num_dgram_qps);

  assert(qp_name != NULL && strlen(qp_name) < HRD_QP_NAME_SIZE - 1);
  assert(strstr(qp_name, HRD_RESERVED_NAME_PREFIX) == NULL);

  if (strchr(qp_name, ' ') != NULL) {
    fprintf(stderr, "HRD: Space not allowed in QP name\n");
    exit(-1);
  }

  int len = strlen(qp_name);

  /* The whole struct is sent to the registry, so clear it first: fields
   * that are not set below (and any padding) must not carry stack garbage. */
  struct hrd_qp_attr qp_attr;
  memset(&qp_attr, 0, sizeof(qp_attr));

  memcpy(qp_attr.name, qp_name, len);
  qp_attr.name[len] = 0; /* Add the null terminator */
  qp_attr.lid = hrd_get_local_lid(cb->dgram_qp[n]->context, cb->dev_port_id);
  qp_attr.qpn = cb->dgram_qp[n]->qp_num;
  qp_attr.sl = sl;

  /* RoCE: peers also need the GID of the port */
  if (is_roce == 1) {
    union ibv_gid ret_gid;
    if (ibv_query_gid(cb->ctx, IB_PHYS_PORT, 0, &ret_gid)) {
      fprintf(stderr, "HRD: ibv_query_gid failed for QP %s\n", qp_name);
      exit(-1);
    }
    qp_attr.gid_global_interface_id = ret_gid.global.interface_id;
    qp_attr.gid_global_subnet_prefix = ret_gid.global.subnet_prefix;
  }

  hrd_publish(qp_attr.name, &qp_attr, sizeof(struct hrd_qp_attr));
}

/*
 * Look up the QP attributes published under @qp_name. Returns the pointer
 * produced by hrd_get_published(), which is NULL when no entry exists.
 *
 * @qp_name must be shorter than HRD_QP_NAME_SIZE - 1, must not contain
 * HRD_RESERVED_NAME_PREFIX and must not contain spaces.
 */
struct hrd_qp_attr*
hrd_get_published_qp(const char* qp_name)
{
  assert(qp_name != NULL && strlen(qp_name) < HRD_QP_NAME_SIZE - 1);
  assert(strstr(qp_name, HRD_RESERVED_NAME_PREFIX) == NULL);

  if (strchr(qp_name, ' ') != NULL) {
    fprintf(stderr, "HRD: Space not allowed in QP name\n");
    exit(-1);
  }

  struct hrd_qp_attr* published;
  const int published_len = hrd_get_published(qp_name, (void**)&published);

  /*
   * The registry lookup returns only if we get a unique QP for @qp_name, or
   * if the memcached lookup succeeds but we don't have an entry for @qp_name.
   */
  assert(published_len == sizeof(struct hrd_qp_attr) || published_len == -1);

  return published;
}

//////////////////////////
/// Fun-c-print
//////////////////////////

/*
 * printf() variant that wraps the formatted text in an ANSI color escape
 * sequence selected by @color. The text is formatted into fixed 1000-byte
 * buffers; the asserts require the last 50 bytes of each buffer to stay
 * untouched, so messages close to the limit abort.
 */
void
colored_printf(color_print_t color, const char* format, ...)
{
#define RED_LIM 1000
  char plain[RED_LIM] = {0};
  char colored[RED_LIM] = {0};
  va_list ap;
  int pos;

  /* Marshal the stuff to print in a buffer */
  va_start(ap, format);
  vsnprintf(plain, RED_LIM, format, ap);
  va_end(ap);

  /* Probably a bad check for buffer overflow */
  for (pos = RED_LIM - 50; pos < RED_LIM; pos++) {
    assert(plain[pos] == 0);
  }

  /* Pick the wrapper format; red is also what an unknown color falls back
   * to when asserts are compiled out. */
  const char* wrap_fmt = "\033[31m%s\033[0m";
  switch (color) {
    case YELLOW:
      wrap_fmt = "\033[33m%s\033[0m";
      break;
    case RED:
      wrap_fmt = "\033[31m%s\033[0m";
      break;
    case GREEN:
      wrap_fmt = "\033[32m%s\033[0m";
      break;
    case CYAN:
      wrap_fmt = "\033[36m%s\033[0m";
      break;
    default:
      printf("Wrong printf color /%d \n", color);
      assert(false);
  }
  snprintf(colored, 1000, wrap_fmt, plain);

  /* Probably another bad check for buffer overflow */
  for (pos = RED_LIM - 50; pos < RED_LIM; pos++) {
    assert(colored[pos] == 0);
  }

  printf("%s", colored);
}


================================================
FILE: src/mica-herd/mica.c
================================================
#include "mica.h"
#include "hrd.h"

/*
 * Returns 1 if @x is a power of two in the range 2^0 .. 2^30 (every positive
 * power of two a 32-bit int can hold), else 0.
 */
int
is_power_of_2(int x)
{
  /* A positive value with exactly one bit set loses that bit in x & (x - 1) */
  return x > 0 && (x & (x - 1)) == 0;
}

void
mica_init(struct mica_kv* kv, int instance_id, int node_id, int num_bkts,
          uint64_t log_cap)
{
  int i, j;

  /* Verify struct sizes */
  assert(sizeof(struct mica_slot) == 8);
  assert(sizeof(struct mica_key) == 16);
  assert(sizeof(struct mica_op) % 64 == 0);

  assert(kv != NULL);
  assert(node_id == 0 || node_id == 1);

  /* 16 million buckets = a 1 GB index */
  assert(is_power_of_2(num_bkts) == 1 && num_bkts <= M_128);
  // assert(log_cap > 0 && log_cap <= M_1024 &&
  //	log_cap % M_2 == 0 && is_power_of_2(log_cap));

  assert(MICA_LOG_BITS >= 24); /* Minimum log size = 16 MB */

  // red_printf("mica-herd-herd: Initializing MICA instance %d.\n"
  // 	"NUMA node = %d, buckets = %d (size = %u B), log capacity = %d B.\n",
  // 	instance_id,
  // 	node_id, num_bkts, num_bkts * sizeof(struct mica_bkt), log_cap);

  if (MICA_DEBUG != 0) {
    printf(
        "mica-herd-herd: Debug mode is ON! This might reduce performance.\n");
    sleep(2);
  }

  /* Initialize metadata and stats */
  kv->instance_id = instance_id;

  kv->num_bkts = num_bkts;
  kv->bkt_mask = num_bkts - 1; /* num_bkts is power of 2 */

  kv->log_cap = log_cap;
  kv->log_mask = log_cap - 1; /* log_cap is a power of 2 */
  kv->log_head = 0;

  kv->num_insert_op = 0;
  kv->num_index_evictions = 0;

  /* Alloc index and initialize all entries to invalid */
  // printf("mica-herd-herd: Allocting hash table index for instance %d\n",
  // instance_id);
  int ht_index_key = MICA_INDEX_SHM_KEY + instance_id;
  kv->ht_index = (struct mica_bkt*)hrd_malloc_socket(
      ht_index_key, num_bkts * sizeof(struct mica_bkt), node_id);

  for (i = 0; i < num_bkts; i++) {
    for (j = 0; j < 8; j++) {
      kv->ht_index[i].slots[j].in_use = 0;
    }
  }

  /* Alloc log */
  // printf("mica-herd-herd: Allocting hash table log for instance %d\n",
  // instance_id);
  int ht_log_key = MICA_LOG_SHM_KEY + instance_id;
  kv->ht_log = (uint8_t*)hrd_malloc_socket(ht_log_key, log_cap, node_id);
}

/*
 * Insert the key/value carried by @op into @kv: pick an index slot in the
 * key's bucket, point it at the current log head and append the entry to the
 * log. @resp is only checked (in debug builds); it is not written here.
 *
 * kv->log_head is a *virtual* offset that keeps growing across wrap-arounds;
 * the physical position in the log buffer is log_head & log_mask.
 */
void
mica_insert_one(struct mica_kv* kv, struct mica_op* op, struct mica_resp* resp)
{
#if MICA_DEBUG == 1
  assert(kv != NULL);
  assert(op != NULL);
  assert(op->opcode == MICA_OP_PUT);
  assert(op->val_len > 0 && op->val_len <= MICA_MAX_VALUE);
  assert(resp != NULL);
#endif

  int i;
  unsigned int bkt = op->key.bkt & kv->bkt_mask;
  struct mica_bkt* bkt_ptr = &kv->ht_index[bkt];
  unsigned int tag = op->key.tag;

#if MICA_DEBUG == 2
  mica_print_op(op);
#endif

  kv->num_insert_op++;

  /* Find a slot to use for this key: any slot that is unused or already
   * carries our tag qualifies. The scan does not stop at the first hit, so
   * the highest-indexed qualifying slot is the one that gets used. */
  int slot_to_use = -1;
  for (i = 0; i < 8; i++) {
    if (bkt_ptr->slots[i].tag == tag || bkt_ptr->slots[i].in_use == 0) {
      slot_to_use = i;
    }
  }

  /* If no slot found, choose one to evict */
  if (slot_to_use == -1) {
    slot_to_use = tag & 7; /* tag is ~ randomly distributed */
    kv->num_index_evictions++;
  }

  /* Encode the empty slot */
  bkt_ptr->slots[slot_to_use].in_use = 1;
  bkt_ptr->slots[slot_to_use].offset = kv->log_head; /* Virtual head */
  bkt_ptr->slots[slot_to_use].tag = tag;

  /* Paste the key-value into the log */
  uint8_t* log_ptr = &kv->ht_log[kv->log_head & kv->log_mask];

  /* Data copied: key, opcode, val_len, value */
  int len_to_copy = sizeof(struct mica_key) + sizeof(uint8_t) +
                    sizeof(uint8_t) + KVS_VALUE_SIZE;  /// op->val_len;

  /* Ensure that we don't wrap around in the *virtual* log space even
   * after 8-byte alignment below.*/
  assert((1ULL << MICA_LOG_BITS) - kv->log_head > len_to_copy + 8);

  memcpy(log_ptr, op, len_to_copy);
  kv->log_head += len_to_copy;

  /* Ensure that the key field of each log entry is 8-byte aligned. This
   * makes subsequent comparisons during GETs faster. */
  kv->log_head = (kv->log_head + 7) & ~7;

  /* If we're close to overflowing in the physical log, wrap around to
   * the beginning, but go forward in the virtual log.
   *
   * The remaining room must be measured from the *physical* head: the
   * virtual head exceeds log_cap after the first wrap, so subtracting it
   * directly would underflow and the check would never fire again. */
  const uint64_t phys_head = kv->log_head & kv->log_mask;
  if (unlikely(kv->log_cap - phys_head <= MICA_MAX_VALUE + 32)) {
    kv->log_head = (kv->log_head + kv->log_cap) & ~kv->log_mask;
    colored_printf(
        RED, "mica-herd-herd: Instance %d wrapping around. Wraps = %llu\n",
        kv->instance_id, (unsigned long long)(kv->log_head / kv->log_cap));
  }
}

/* A fast deterministic way to generate @n ~randomly distributed 16-byte keys */
uint128*
mica_gen_keys(int n)
{
  int i;
  assert(n > 0 && n <= M_1024 / sizeof(uint128));
  assert(sizeof(uint128) == 16);

  // printf("mica-herd-herd: Generating %d keys\n", n);

  uint128* key_arr = malloc(n * sizeof(uint128));
  assert(key_arr != NULL);

  for (i = 0; i < n; i++) {
    key_arr[i] = CityHash128((char*)&i, 4);
  }

  return key_arr;
}


================================================
FILE: src/wings/wings.c
================================================
//
// Created by akatsarakis on 22/01/19.
//

#include "../../include/wings/wings.h"
#include <config.h>
#include <infiniband/verbs.h>
#include <inline-util.h>
#include <spacetime.h>
#include <stdio.h>

// Implements a multicast / unicast channel.
// Support for:
//      Multicast / unicast channel
//      Coalescing
//      Variable size msgs?
//      Selective signaling
//      Batching to the NIC
//      Inlining or not
//      Batch post receives to the NIC
//          Mode 1: poll reqs, copy incoming msgs to local buffers and
//                  (p)re-post recvs
//          Mode 2: poll reqs, do not copy msgs and post recvs when said
//      Implicit (request - response mode) and explicit (batched) credit
//      flow control

/* Forward declarations of the `_wings_`-prefixed helper functions. */

/* Work-request / scatter-gather setup for the send, receive and credit
 * paths of a channel. */
void _wings_setup_send_wr_and_sgl(ud_channel_t* ud_c);
void _wings_setup_recv_wr_and_sgl(ud_channel_t* ud_c,
                                  struct hrd_ud_ctrl_blk* cb);
void _wings_setup_crd_wr_and_sgl(ud_channel_t* ud_c,
                                 struct hrd_ud_ctrl_blk* cb);
/* Receive-side initialization. */
void _wings_setup_incoming_buff_and_post_initial_recvs(ud_channel_t* ud_c);
void _wings_ud_channel_init_recv(ud_channel_t* ud_c, struct hrd_ud_ctrl_blk* cb,
                                 uint8_t qp_id,
                                 volatile uint8_t* incoming_reqs_ptr);

/* Initializer for credit (CRD) channels; wings_ud_channel_init() rejects
 * the CRD type. */
void _wings_ud_channel_crd_init(ud_channel_t* ud_c, char* qp_name,
                                ud_channel_t* linked_channel,
                                uint16_t crds_per_channel,
                                uint16_t num_channels, uint8_t channel_id,
                                uint8_t enable_stats, uint8_t enable_prints);

void _wings_print_on_off_toggle(uint16_t bin_flag, char* str);

void _wings_share_qp_info_via_memcached(ud_channel_t** ud_c_array,
                                        uint16_t ud_c_num,
                                        dbit_vector_t* shared_rdy_var,
                                        int worker_lid,
                                        struct hrd_ud_ctrl_blk* cb);

/// Release the heap buffers that the *_init routines attached to a channel.
///
/// Only buffers allocated by wings_ud_channel_init / the CRD initialiser are
/// released here; the descriptor itself (ud_c) is not freed, and the work
/// request / scatter-gather arrays created while setting up the QPs are not
/// touched by this function.
void
wings_ud_channel_destroy(ud_channel_t* ud_c)
{
  const int is_crd_channel = (ud_c->type == CRD);

  // Buffers present on every channel type
  free(ud_c->qp_name);
  free(ud_c->recv_wc);
  free(ud_c->remote_qps);
  free(ud_c->credits_per_channels);

  // May be NULL (e.g. header-only or CRD channels); free(NULL) is a no-op
  free(ud_c->send_pkt_buff);

  if (is_crd_channel) {
    // Per-endpoint credit counters exist only on CRD channels
    free(ud_c->no_crds_to_send_per_endpoint);
    return;
  }

  // The overflow buffer is only allocated when coalescing is enabled
  if (ud_c->max_coalescing > 1) free(ud_c->overflow_msg_buff);
}

/// Initialise a REQ / RESP unreliable-datagram channel descriptor.
///
/// Fills in every software-side field of ud_c (sizes, queue depths, credit
/// counters, buffers). The verbs objects (QP, CQs, PD, registered memory) are
/// left NULL / 0 here and are filled in later, once the control block and
/// QPs exist.
///
/// @param ud_c              descriptor to initialise (caller-owned storage)
/// @param qp_name           channel name; copied into a heap buffer
/// @param type              channel type; must not be CRD
/// @param max_coalescing    max msgs per packet; 1 disables coalescing
/// @param max_req_size      max size of one message (excluding the extra
///                          sender-id byte of header-only channels)
/// @param small_req_size    small message size; 0 means "same as max"
/// @param enable_inlining   request inlined sends (may be forced on/off below)
/// @param is_header_only    messages travel in the immediate header only
/// @param is_bcast          channel broadcasts to all remotes
/// @param disable_crd_ctrl  turn credit-based flow control off
/// @param expl_crd_ctrl     use an explicit credit channel (linked_channel is
///                          then initialised as a CRD channel by this call)
/// @param linked_channel    channel providing credits; required unless
///                          credit control is disabled
/// @param crds_per_channel  credits per remote endpoint
/// @param num_channels      number of endpoints, including the local one
/// @param channel_id        local endpoint id (< num_channels)
/// @param stats_on          enable statistics
/// @param prints_on         enable debug prints
void
wings_ud_channel_init(ud_channel_t* ud_c, char* qp_name, enum channel_type type,
                      uint8_t max_coalescing, uint16_t max_req_size,
                      uint16_t small_req_size, uint8_t enable_inlining,
                      uint8_t is_header_only,
                      // Broadcast
                      uint8_t is_bcast,
                      // Credits
                      uint8_t disable_crd_ctrl, uint8_t expl_crd_ctrl,
                      ud_channel_t* linked_channel, uint16_t crds_per_channel,
                      uint16_t num_channels, uint8_t channel_id,
                      // Toggles
                      uint8_t stats_on, uint8_t prints_on)
{
  assert(type != CRD);         // if CRD type then used the *_crd_init instead
  assert(max_coalescing > 0);  // To disable coalescing use max_coalescing == 1
  assert(channel_id < num_channels);
  // cannot disable crd_ctrl and then set an explicit credit control
  assert(!(disable_crd_ctrl == 1 && expl_crd_ctrl == 1));
  // when credit control is enabled a credit-providing channel is mandatory
  assert(disable_crd_ctrl == 1 || linked_channel != NULL);
  // a channel cannot be both broadcast and header-only
  assert(is_bcast == 0 || is_header_only == 0);
  assert(small_req_size <= max_req_size);

  _wings_assert_binary(stats_on);
  _wings_assert_binary(is_bcast);
  _wings_assert_binary(prints_on);
  _wings_assert_binary(expl_crd_ctrl);
  _wings_assert_binary(is_header_only);
  _wings_assert_binary(enable_inlining);

  ud_c->is_header_only = is_header_only;
  if (ud_c->is_header_only) {
    /// WARNING: hdr_only msgs have an additional 1st B indicating sender_id
    /// (which must not be taken into account on max_req_size)
    assert(max_req_size <= 3 * sizeof(uint8_t) && max_coalescing == 1);
  }

  ud_c->type = type;
  ud_c->channel_id = channel_id;
  ud_c->num_channels = num_channels;  // num_channels include our own channel
  ud_c->expl_crd_ctrl = expl_crd_ctrl;
  ud_c->disable_crd_ctrl = disable_crd_ctrl;
  ud_c->is_bcast_channel = is_bcast;
  ud_c->num_crds_per_channel = crds_per_channel;
  ud_c->channel_providing_crds = linked_channel;

  // Private copy of the name; released by wings_ud_channel_destroy
  ud_c->qp_name = malloc(sizeof(char) * (strlen(qp_name) + 1));
  assert(ud_c->qp_name != NULL);
  strcpy(ud_c->qp_name, qp_name);

  ud_c->enable_stats = stats_on;
  ud_c->enable_prints = prints_on;

  ud_c->max_coalescing = max_coalescing;
  // hdr_only msgs have an additional 1st B indicating sender_id
  ud_c->max_msg_size =
      (uint16_t)(max_req_size + (ud_c->is_header_only == 1 ? 1 : 0));
  ud_c->small_msg_size =
      small_req_size == 0 ? ud_c->max_msg_size : small_req_size;

  ud_c->no_crds_to_send_per_endpoint = NULL;  // unused for type != CRD

  uint16_t remote_channels = (uint16_t)(num_channels - 1);

  // Header-only channels are always inlined; otherwise honour the request
  // unless the largest packet does not fit in the supported inline size
  ud_c->is_inlining_enabled =
      (uint8_t)(ud_c->is_header_only == 1 ? 1 : enable_inlining);
  if (ud_c->is_header_only == 0 &&
      _wings_ud_send_max_pkt_size(ud_c) > WINGS_MAX_SUPPORTED_INLINING) {
    if (ud_c->is_inlining_enabled)
      printf(
          "Unfortunately, inlining for msgs sizes up to (%d) "
          "is higher than the supported (%d)\n",
          _wings_ud_send_max_pkt_size(ud_c), WINGS_MAX_SUPPORTED_INLINING);
    ud_c->is_inlining_enabled = 0;
  }

  // Only REQ channels with credit control start with a full set of credits
  ud_c->credits_per_channels = malloc(sizeof(uint16_t) * (num_channels));
  assert(ud_c->credits_per_channels != NULL);
  for (int i = 0; i < num_channels; ++i)
    ud_c->credits_per_channels[i] =
        (uint16_t)(type == REQ && !disable_crd_ctrl ? crds_per_channel : 0);

  ud_c->max_pcie_bcast_batch =
      (uint16_t)WINGS_MIN(WINGS_MIN_PCIE_BCAST_BATCH + 1, crds_per_channel);
  // Warning! use min to avoid resetting the first req prior batching to the NIC
  // WARNING: todo check why we need to have MIN_PCIE_BCAST_BATCH + 1 instead of
  // just MIN_PCIE_BCAST_BATCH
  uint16_t max_msgs_in_pcie_bcast =
      (uint16_t)(ud_c->max_pcie_bcast_batch *
                 remote_channels);  // must be smaller than the q_depth

  ud_c->max_recv_wrs = (uint16_t)(crds_per_channel * remote_channels);
  ud_c->max_send_wrs =
      (uint16_t)(ud_c->is_bcast_channel ? max_msgs_in_pcie_bcast
                                        : crds_per_channel * remote_channels);

  // Selective-signaling granularity
  ud_c->ss_granularity =
      ud_c->is_bcast_channel ? ud_c->max_pcie_bcast_batch : ud_c->max_send_wrs;

  ud_c->recv_q_depth = ud_c->max_recv_wrs;
  ud_c->send_q_depth =
      (uint16_t)(2 * ud_c->ss_granularity *
                 (ud_c->is_bcast_channel ? remote_channels : 1));

  ud_c->recv_wc = malloc(sizeof(struct ibv_wc) * ud_c->max_recv_wrs);

  ud_c->recv_pkt_buff_len = ud_c->max_recv_wrs;
  // Non-inlined sends get twice as many packet slots as send WRs
  ud_c->send_pkt_buff_len =
      (uint16_t)(ud_c->max_send_wrs * (ud_c->is_inlining_enabled ? 1 : 2));

  // Header-only channels carry their payload in the immediate: no send buffer
  ud_c->send_pkt_buff =
      ud_c->is_header_only == 1
          ? NULL
          : malloc(_wings_ud_send_max_pkt_size(ud_c) * ud_c->send_pkt_buff_len);

  // Overflow on polling: only needed when several msgs may share one packet
  ud_c->num_overflow_msgs = 0;
  if (ud_c->max_coalescing > 1) {
    ud_c->enable_overflow_msgs = 1;
    ud_c->overflow_msg_buff =
        malloc((size_t)(ud_c->max_msg_size * (ud_c->max_coalescing - 1)));
  } else {
    ud_c->enable_overflow_msgs = 0;
    ud_c->overflow_msg_buff = NULL;
  }

  ud_c->send_push_ptr = 0;
  ud_c->recv_push_ptr = 0;
  ud_c->recv_pull_ptr = -1;

  ud_c->total_pkts_send = 0;

  ud_c->stats.ss_completions = 0;
  ud_c->stats.recv_total_pkts = 0;
  ud_c->stats.recv_total_msgs = 0;
  ud_c->stats.send_total_msgs = 0;
  ud_c->stats.send_total_pkts = 0;
  ud_c->stats.send_total_pcie_batches = 0;
  ud_c->stats.no_stalls_due_to_credits = 0;

  // Initialize the crd channel as well
  if (ud_c->expl_crd_ctrl) {
    char crd_qp_name[1000];
    // Bounded formatting: qp_name is caller-supplied and of arbitrary length
    snprintf(crd_qp_name, sizeof(crd_qp_name), "\033[1m\033[36mCRD\033[0m-%s",
             qp_name);
    _wings_ud_channel_crd_init(linked_channel, crd_qp_name, ud_c,
                               crds_per_channel, num_channels, channel_id,
                               stats_on, prints_on);
  }

  ud_c->remote_qps = malloc(sizeof(qp_info_t) * ud_c->num_channels);

  // The following are set by the *_init_recv function after the creation of
  // control block and QPs
  ud_c->qp = NULL;
  ud_c->pd = NULL;
  ud_c->qp_id = 0;
  ud_c->send_cq = NULL;  // set by init_recv
  ud_c->recv_cq = NULL;  // set by init_recv
  ud_c->recv_pkt_buff = NULL;
  ud_c->send_mem_region = NULL;  // set by init_recv

  _wings_assert_binary(ud_c->is_header_only);
  assert(ud_c->max_pcie_bcast_batch <= crds_per_channel);
}

/// Number of bytes of the shared datagram buffer reserved for the receives of
/// one channel: a fixed 64B slot for CRD / header-only channels, otherwise one
/// max-sized receive packet per entry of the receive queue.
static uint32_t
_wings_recv_region_bytes(ud_channel_t* ud_c)
{
  if (ud_c->type == CRD || ud_c->is_header_only == 1) return 64;
  return (uint32_t)(_wings_ud_recv_max_pkt_size(ud_c) * ud_c->recv_q_depth);
}

/// Create the control block / QPs for a set of channels, exchange QP info with
/// the remote endpoints and post the initial receives.
///
/// @param ud_c_array      channels that share one control block; the index in
///                        this array becomes the channel's qp_id
/// @param ud_c_num        number of channels (qp ids are 8-bit, so at most 256)
/// @param shared_rdy_var  ready bit-vector handed to the QP-sharing step
///                        (may be NULL, see _wings_share_qp_info_via_memcached)
/// @param worker_lid      local worker id
/// @param base_shm_key    base key; worker_lid is added to it for this worker
void
wings_setup_channel_qps_and_recvs_w_shm_key(ud_channel_t** ud_c_array,
                                            uint16_t ud_c_num,
                                            dbit_vector_t* shared_rdy_var,
                                            uint16_t worker_lid,
                                            uint16_t base_shm_key)
{
  // qp ids are passed around as uint8_t
  assert(ud_c_num <= UINT8_MAX + 1);

  uint32_t dgram_buff_size = 0;
  int* send_q_depths = malloc(ud_c_num * sizeof(int));
  int* recv_q_depths = malloc(ud_c_num * sizeof(int));
  assert(ud_c_num == 0 || (send_q_depths != NULL && recv_q_depths != NULL));

  // Setup Q depths and buff size for incoming pkts
  for (uint16_t i = 0; i < ud_c_num; ++i) {
    send_q_depths[i] = ud_c_array[i]->send_q_depth;
    recv_q_depths[i] = ud_c_array[i]->recv_q_depth;
    dgram_buff_size += _wings_recv_region_bytes(ud_c_array[i]);
  }

  struct hrd_ud_ctrl_blk* cb =
      hrd_ud_ctrl_blk_init(worker_lid, 0,
                           -1,  // local_hid, port_index, numa_node_id,
                           ud_c_num,
                           dgram_buff_size,  // num_dgram_qps, dgram_buf_size
                           base_shm_key + worker_lid,  // key
                           recv_q_depths,
                           send_q_depths);  // Depth of the dgram RECV, SEND Q
  assert(cb != NULL);

  // NOTE: the index must be as wide as ud_c_num; an 8-bit index would wrap
  // around and never terminate for ud_c_num >= 256
  for (uint16_t i = 0; i < ud_c_num; ++i)
    ud_c_array[i]->pd = cb->pd;

  _wings_share_qp_info_via_memcached(ud_c_array, ud_c_num, shared_rdy_var,
                                     worker_lid, cb);

  // Carve the datagram buffer into consecutive per-channel receive regions,
  // using the same sizes that were summed up above
  volatile uint8_t* incoming_reqs_ptr = cb->dgram_buf;
  for (uint16_t i = 0; i < ud_c_num; ++i) {
    // Init recv and setup wrs and sgls of ud_channel
    _wings_ud_channel_init_recv(ud_c_array[i], cb, (uint8_t)i,
                                incoming_reqs_ptr);
    incoming_reqs_ptr += _wings_recv_region_bytes(ud_c_array[i]);
  }

  free(send_q_depths);
  free(recv_q_depths);

  for (uint16_t i = 0; i < ud_c_num; ++i)
    if (ud_c_array[i]->type != CRD) _wings_assertions(ud_c_array[i]);

  sleep(1);  /// Give some leeway to post receives, before start bcasting!
}

/// Convenience wrapper around wings_setup_channel_qps_and_recvs_w_shm_key that
/// uses BASE_SHM_KEY as the base shared-memory key; all other arguments are
/// forwarded unchanged.
void
wings_setup_channel_qps_and_recvs(ud_channel_t** ud_c_array, uint16_t ud_c_num,
                                  dbit_vector_t* shared_rdy_var,
                                  uint16_t worker_lid)
{
  wings_setup_channel_qps_and_recvs_w_shm_key(
      ud_c_array, ud_c_num, shared_rdy_var, worker_lid, BASE_SHM_KEY);
}

/// Print a human-readable summary of a channel's configuration to stdout:
/// kind (bcast / unicast), type, feature toggles, message / packet sizes,
/// credit setup and queue / buffer lengths.
void
wings_print_ud_c_overview(ud_channel_t* ud_c)
{
  // CRD channels get their own label instead of being reported as "RESP"
  const char* type_label =
      ud_c->type == REQ ? "REQ" : (ud_c->type == CRD ? "CRD" : "RESP");

  printf("%s Channel[%d] %s(%d) --> %s\n",
         ud_c->is_bcast_channel ? "Bcast" : "Unicast", ud_c->channel_id,
         ud_c->qp_name, ud_c->qp_id, type_label);

  // Values > 1 are printed next to the "On" label
  _wings_print_on_off_toggle(ud_c->is_inlining_enabled, "Inlining");
  _wings_print_on_off_toggle(ud_c->max_coalescing, "Coalescing");
  _wings_print_on_off_toggle(ud_c->max_pcie_bcast_batch, "Max PCIe batch");

  printf("\t\tMax msg size: %dB\n", ud_c->max_msg_size);
  if (ud_c->type != CRD && !ud_c->is_header_only)
    printf("\t\tMax pkt size: send = %dB, recv = %dB\n",
           _wings_ud_send_max_pkt_size(ud_c),
           _wings_ud_recv_max_pkt_size(ud_c));
  else
    // CRD and header-only channels only carry the 4B immediate
    printf(
        "\t\tMax pkt size: send = 4B (inlined_payload), recv = "
        "4B(inlined_payload)\n");
  printf("\t\tSS granularity: %d\n", ud_c->ss_granularity);

  printf("\t\tNum remotes: %d\n", ud_c->num_channels - 1);
  if (ud_c->disable_crd_ctrl)
    printf("\t\tCredits: OFF \n");
  else
    printf("\t\tCredits: %d (%s) --> %s (%d)\n", ud_c->num_crds_per_channel,
           ud_c->expl_crd_ctrl ? "Explicit" : "Implicit",
           ud_c->channel_providing_crds->qp_name,
           ud_c->channel_providing_crds->qp_id);

  printf("\t\tSend Q len: %d\n", ud_c->send_q_depth);
  printf("\t\tRecv Q len: %d\n", ud_c->recv_q_depth);

  printf("\t\tSend wr len: %d\n", ud_c->max_send_wrs);
  printf("\t\tRecv wr len: %d\n", ud_c->max_recv_wrs);

  printf("\t\tSend pkt len: %d\n", ud_c->send_pkt_buff_len);
  printf("\t\tRecv pkt len: %d\n", ud_c->recv_pkt_buff_len);

  _wings_print_on_off_toggle(ud_c->enable_stats, "Stats");
  _wings_print_on_off_toggle(ud_c->enable_prints, "Prints");
}

/* ---------------------------------------------------------------------------
----------------------------------- SETUPs ------------------------------------
---------------------------------------------------------------------------*/
/// Print one "<str> : On|Off" line of the channel overview.
///
/// @param bin_flag  0 prints "Off", 1 prints "On", any larger value prints
///                  "On" followed by the value in parentheses
/// @param str       label printed in front of the state
void
_wings_print_on_off_toggle(uint16_t bin_flag, char* str)
{
  if (bin_flag == 0) {
    printf("\t\t%s : %s\n", str, "\033[31mOff\033[0m");
    return;
  }

  if (bin_flag == 1) {
    printf("\t\t%s : %s\n", str, "\033[1m\033[32mOn\033[0m");
    return;
  }

  // Not a plain toggle: show the numeric setting as well
  printf("\t\t%s : %s (%d)\n", str, "\033[1m\033[32mOn\033[0m", bin_flag);
}

/// Initialise the explicit-credit (CRD) channel paired with linked_channel.
///
/// Credits are always sent inlined in the (<= 4B) immediate, so the channel
/// has no send packet buffer, no coalescing and a message size of 0. As for
/// regular channels, the verbs objects are left NULL / 0 and filled in later.
///
/// @param ud_c              CRD channel descriptor to initialise
/// @param qp_name           channel name; copied into a heap buffer
/// @param linked_channel    channel whose credits this channel carries
/// @param crds_per_channel  credits per remote endpoint
/// @param num_channels      number of endpoints, including the local one
/// @param channel_id        local endpoint id (< num_channels)
/// @param enable_stats      enable statistics (0 / 1)
/// @param enable_prints     enable debug prints (0 / 1)
void
_wings_ud_channel_crd_init(ud_channel_t* ud_c, char* qp_name,
                           ud_channel_t* linked_channel,
                           uint16_t crds_per_channel, uint16_t num_channels,
                           uint8_t channel_id, uint8_t enable_stats,
                           uint8_t enable_prints)
{
  assert(channel_id < num_channels);

  _wings_assert_binary(enable_stats);
  _wings_assert_binary(enable_prints);

  ud_c->type = CRD;

  // Private copy of the name; released by wings_ud_channel_destroy
  ud_c->qp_name = malloc(sizeof(char) * (strlen(qp_name) + 1));
  assert(ud_c->qp_name != NULL);
  strcpy(ud_c->qp_name, qp_name);

  ud_c->channel_id = channel_id;
  ud_c->num_channels = num_channels;  // num_channels include our own channel
  ud_c->expl_crd_ctrl = 1;
  ud_c->disable_crd_ctrl = 0;
  ud_c->is_bcast_channel = 0;
  ud_c->max_pcie_bcast_batch = 0;
  ud_c->num_crds_per_channel = crds_per_channel;
  ud_c->channel_providing_crds = linked_channel;

  ud_c->enable_stats = enable_stats;
  ud_c->enable_prints = enable_prints;

  static_assert(sizeof(wings_crd_t) <= 4,
                "");         // Credits are always send as inlined_payload <=4B
  ud_c->max_msg_size = 0;    // non inlined_payload size
  ud_c->small_msg_size = 0;  // non inlined_payload size
  ud_c->max_coalescing = 1;

  // No coalescing on CRD channels, hence no overflow msgs; initialise the
  // fields anyway so they never hold indeterminate values
  ud_c->num_overflow_msgs = 0;
  ud_c->enable_overflow_msgs = 0;
  ud_c->overflow_msg_buff = NULL;

  // Zero-initialised so every per-endpoint counter starts from a defined value
  ud_c->no_crds_to_send_per_endpoint = calloc(num_channels, sizeof(uint16_t));
  assert(ud_c->no_crds_to_send_per_endpoint != NULL);

  uint16_t remote_channels = (uint16_t)(num_channels - 1);
  ud_c->is_inlining_enabled = 1;

  // A CRD channel itself starts with no credits towards any endpoint
  ud_c->credits_per_channels = calloc(num_channels, sizeof(uint16_t));
  assert(ud_c->credits_per_channels != NULL);

  ud_c->max_recv_wrs = crds_per_channel * remote_channels;
  ud_c->max_send_wrs = crds_per_channel * remote_channels;  // TODO correct this

  ud_c->ss_granularity = ud_c->max_send_wrs;

  ud_c->recv_q_depth = ud_c->max_recv_wrs;
  ud_c->send_q_depth = (uint16_t)(2 * ud_c->ss_granularity);

  ud_c->recv_wc = malloc(sizeof(struct ibv_wc) * ud_c->max_recv_wrs);

  ud_c->recv_pkt_buff_len =
      ud_c->max_recv_wrs * ud_c->max_coalescing;  // TODO: is this correct?
  ud_c->send_pkt_buff_len = ud_c->max_send_wrs;

  // Credits travel in the immediate: no send buffer / memory region needed
  ud_c->send_pkt_buff = NULL;
  ud_c->send_mem_region = NULL;

  ud_c->send_push_ptr = 0;
  ud_c->recv_push_ptr = 0;
  ud_c->recv_pull_ptr = -1;

  ud_c->total_pkts_send = 0;

  ud_c->stats.ss_completions = 0;
  ud_c->stats.recv_total_pkts = 0;
  ud_c->stats.recv_total_msgs = 0;
  ud_c->stats.send_total_msgs = 0;
  ud_c->stats.send_total_pkts = 0;
  ud_c->stats.send_total_pcie_batches = 0;
  ud_c->stats.no_stalls_due_to_credits = 0;

  ud_c->remote_qps = malloc(sizeof(qp_info_t) * ud_c->num_channels);
  // The following are set by the *_init_recv function after the creation of
  // control block and QPs
  ud_c->qp = NULL;
  ud_c->pd = NULL;
  ud_c->qp_id = 0;
  ud_c->send_cq = NULL;
  ud_c->recv_cq = NULL;
  ud_c->recv_pkt_buff = NULL;
}

/// Bind a channel to QP slot qp_id of the control block, build its work
/// requests / scatter-gather lists and post the initial receives.
///
/// @param ud_c               channel to finish initialising
/// @param cb                 control block owning the QPs, CQs and PD
/// @param qp_id              index of this channel's QP inside cb
/// @param incoming_reqs_ptr  start of this channel's region in the datagram
///                           receive buffer
void
_wings_ud_channel_init_recv(ud_channel_t* ud_c, struct hrd_ud_ctrl_blk* cb,
                            uint8_t qp_id, volatile uint8_t* incoming_reqs_ptr)
{
  // Attach the verbs objects of the selected QP slot
  ud_c->qp_id = qp_id;
  ud_c->qp = cb->dgram_qp[qp_id];
  ud_c->recv_pkt_buff = incoming_reqs_ptr;
  ud_c->send_cq = cb->dgram_send_cq[ud_c->qp_id];
  ud_c->recv_cq = cb->dgram_recv_cq[ud_c->qp_id];

  if (ud_c->type == CRD) {
    _wings_setup_crd_wr_and_sgl(ud_c, cb);
  } else {
    // Inlined sends need no registered memory; otherwise register the whole
    // send packet buffer before the send WRs pick up its lkey
    if (ud_c->is_inlining_enabled) {
      ud_c->send_mem_region = NULL;
    } else {
      ud_c->send_mem_region = register_buffer(
          cb->pd, ud_c->send_pkt_buff,
          _wings_ud_send_max_pkt_size(ud_c) * ud_c->send_pkt_buff_len);
    }
    _wings_setup_send_wr_and_sgl(ud_c);
    _wings_setup_recv_wr_and_sgl(ud_c, cb);
  }

  // post initial receives
  /// WARNING try to avoid races of posting initial receives and sending msgs
  _wings_setup_incoming_buff_and_post_initial_recvs(ud_c);
}

/// Allocate and pre-fill the send / receive work requests of a CRD channel.
///
/// Every send WR is an inlined SEND_WITH_IMM without scatter-gather entries
/// whose immediate is pre-loaded with a credit header (crd_num = 0, sender =
/// local channel id). All receive WRs share one 64B scatter-gather entry that
/// points at the start of the channel's receive buffer.
void
_wings_setup_crd_wr_and_sgl(ud_channel_t* ud_c, struct hrd_ud_ctrl_blk* cb)
{
  assert(ud_c->type == CRD);

  // Credit header template copied into the immediate of every send WR
  wings_crd_t crd_template;
  crd_template.crd_num = 0;
  crd_template.sender_id = (uint8_t)ud_c->channel_id;

  /* ------------------------- Credit send side ------------------------- */
  ud_c->send_sgl = malloc(sizeof(struct ibv_sge));
  ud_c->send_sgl->length = 0;

  ud_c->send_wr = malloc(sizeof(struct ibv_send_wr) * ud_c->max_send_wrs);
  int wr_idx = 0;
  while (wr_idx < ud_c->max_send_wrs) {
    struct ibv_send_wr* swr = &ud_c->send_wr[wr_idx];

    swr->opcode = IBV_WR_SEND_WITH_IMM;
    swr->num_sge = 0;
    swr->sg_list = ud_c->send_sgl;
    swr->wr.ud.remote_qkey = HRD_DEFAULT_QKEY;
    swr->next = NULL;
    swr->send_flags = IBV_SEND_INLINE;
    // Clear the whole immediate first: the credit header may be shorter
    swr->imm_data = 0;
    memcpy(&swr->imm_data, &crd_template, sizeof(wings_crd_t));

    ++wr_idx;
  }

  /* ------------------------ Credit receive side ----------------------- */
  ud_c->recv_sgl = malloc(sizeof(struct ibv_sge));
  ud_c->recv_sgl->length = 64;  // TODO can we make this zero?
  ud_c->recv_sgl->lkey = cb->dgram_buf_mr->lkey;
  ud_c->recv_sgl->addr = (uint64_t)ud_c->recv_pkt_buff;

  ud_c->recv_wr = malloc(sizeof(struct ibv_recv_wr) * ud_c->max_recv_wrs);
  for (int rcv_idx = 0; rcv_idx < ud_c->max_recv_wrs; ++rcv_idx) {
    struct ibv_recv_wr* rwr = &ud_c->recv_wr[rcv_idx];

    rwr->num_sge = 1;
    rwr->sg_list = ud_c->recv_sgl;
  }
}

/// Allocate and pre-fill the send work requests (WRs) and scatter-gather
/// entries of a REQ / RESP channel.
///
/// Broadcast channels get max_pcie_bcast_batch * (num_channels - 1) WRs: one
/// group of WRs per message slot, one WR per remote endpoint, all WRs of a
/// group sharing the slot's scatter-gather entry and linked through `next`.
/// Unicast channels get max_send_wrs WRs, each with its own scatter-gather
/// entry (or one shared, zero-length entry for header-only channels).
void
_wings_setup_send_wr_and_sgl(ud_channel_t* ud_c)
{
  assert(ud_c->type != CRD);

  // Template immediate for header-only msgs: sender id + zeroed payload bytes
  wings_hdr_only_t hdr_only_tmp;
  hdr_only_tmp.sender_id = (uint8_t)ud_c->channel_id;
  memset(hdr_only_tmp.inlined_payload, 0, 3 * sizeof(uint8_t));

  if (ud_c->is_bcast_channel) {  // Send bcast WRs

    uint16_t remote_channels = (uint16_t)(ud_c->num_channels - 1);
    uint16_t max_msgs_in_pcie_batch =
        (uint16_t)(ud_c->max_pcie_bcast_batch * remote_channels);
    ud_c->send_wr = malloc(sizeof(struct ibv_send_wr) * max_msgs_in_pcie_batch);
    ud_c->send_sgl =
        malloc(sizeof(struct ibv_sge) *
               (ud_c->is_header_only == 1 ? 1 : ud_c->max_pcie_bcast_batch));

    if (ud_c->is_header_only)
      ud_c->send_sgl->length = 0;
    else
      for (int i = 0; i < ud_c->max_pcie_bcast_batch; ++i)
        ud_c->send_sgl[i].length = _wings_ud_send_max_pkt_size(ud_c);

    for (int i = 0; i < max_msgs_in_pcie_batch; ++i) {
      // sgl_index: message slot of this WR; i_mod_bcast: position of the WR
      // inside the slot's group of per-remote WRs
      int sgl_index = i / remote_channels;
      int i_mod_bcast = i % remote_channels;

      // Map the position to a remote endpoint id, skipping the local id
      uint16_t rm_qp_id;
      if (i_mod_bcast < ud_c->channel_id)
        rm_qp_id = (uint16_t)i_mod_bcast;
      else
        rm_qp_id = (uint16_t)((i_mod_bcast + 1) % ud_c->num_channels);

      ud_c->send_wr[i].wr.ud.remote_qkey = HRD_DEFAULT_QKEY;
      ud_c->send_wr[i].wr.ud.ah = ud_c->remote_qps[rm_qp_id].ah;
      ud_c->send_wr[i].wr.ud.remote_qpn = ud_c->remote_qps[rm_qp_id].qpn;

      if (!ud_c->is_header_only) {
        ud_c->send_wr[i].num_sge = 1;
        ud_c->send_wr[i].opcode =
            IBV_WR_SEND;  /// Attention!! there is no immediate here
        ud_c->send_wr[i].sg_list = &ud_c->send_sgl[sgl_index];

      } else {
        // Header-only: no scatter-gather data, payload lives in the immediate
        ud_c->send_wr[i].next = NULL;
        ud_c->send_wr[i].imm_data = 0;
        ud_c->send_wr[i].num_sge = 0;
        ud_c->send_wr[i].sg_list = ud_c->send_sgl;
        ud_c->send_wr[i].opcode = IBV_WR_SEND_WITH_IMM;
        memcpy(&ud_c->send_wr[i].imm_data, &hdr_only_tmp,
               sizeof(wings_hdr_only_t));
      }

      if (!ud_c->is_inlining_enabled) {
        // Non-inlined sends read from the registered send buffer
        ud_c->send_wr[i].send_flags = 0;
        ud_c->send_sgl[sgl_index].lkey = ud_c->send_mem_region->lkey;
      } else
        ud_c->send_wr[i].send_flags = IBV_SEND_INLINE;

      // Chain the WRs of one group; the last WR of each group ends the chain.
      // (This overrides the next = NULL set in the header-only branch above.)
      ud_c->send_wr[i].next =
          (i_mod_bcast == remote_channels - 1) ? NULL : &ud_c->send_wr[i + 1];
    }

  } else {  // Send unicast WRs

    // NOTE: neither the destination (ah / remote_qpn) of any unicast WR nor
    // `next` of the non-header-only WRs is set in this function
    ud_c->send_wr = malloc(sizeof(struct ibv_send_wr) * ud_c->max_send_wrs);
    ud_c->send_sgl = malloc(sizeof(struct ibv_sge) *
                            (ud_c->is_header_only ? 1 : ud_c->max_send_wrs));
    for (int i = 0; i < ud_c->max_send_wrs; ++i) {
      ud_c->send_wr[i].wr.ud.remote_qkey = HRD_DEFAULT_QKEY;

      if (!ud_c->is_header_only) {
        //				ud_c->send_sgl[i].length =
        // sizeof(wings_pkt_t) + _wings_ud_recv_max_pkt_size(ud_c);
        ud_c->send_sgl[i].length = _wings_ud_send_max_pkt_size(ud_c);
        ud_c->send_wr[i].num_sge = 1;
        ud_c->send_wr[i].opcode =
            IBV_WR_SEND;  /// Attention!! there is no immediate here
        ud_c->send_wr[i].sg_list = &ud_c->send_sgl[i];

      } else {
        // Header-only: all WRs share the single zero-length sgl entry
        ud_c->send_sgl->length = 0;
        ud_c->send_wr[i].next = NULL;
        ud_c->send_wr[i].imm_data = 0;
        ud_c->send_wr[i].num_sge = 0;
        ud_c->send_wr[i].sg_list = ud_c->send_sgl;
        ud_c->send_wr[i].opcode = IBV_WR_SEND_WITH_IMM;
        memcpy(&ud_c->send_wr[i].imm_data, &hdr_only_tmp,
               sizeof(wings_hdr_only_t));
      }

      if (!ud_c->is_inlining_enabled) {
        ud_c->send_wr[i].send_flags = 0;
        ud_c->send_sgl[i].lkey = ud_c->send_mem_region->lkey;
      } else
        ud_c->send_wr[i].send_flags = IBV_SEND_INLINE;
    }
  }
}

/// Allocate and pre-fill the receive work requests of a REQ / RESP channel.
///
/// Header-only channels use a single 64B scatter-gather entry, shared by all
/// receive WRs and pointing at the start of the receive buffer. Other channels
/// get one scatter-gather entry per WR with lkey and length filled in here
/// (the entry's address is not set by this function). The WRs are linked into
/// one NULL-terminated list.
void
_wings_setup_recv_wr_and_sgl(ud_channel_t* ud_c, struct hrd_ud_ctrl_blk* cb)
{
  assert(ud_c->type != CRD);

  size_t sgl_entries = ud_c->max_recv_wrs;
  if (ud_c->is_header_only == 1) sgl_entries = 1;
  ud_c->recv_sgl = malloc(sizeof(struct ibv_sge) * sgl_entries);

  if (ud_c->is_header_only) {
    ud_c->recv_sgl->length = 64;  // TODO can we make this zero?
    ud_c->recv_sgl->lkey = cb->dgram_buf_mr->lkey;
    ud_c->recv_sgl->addr = (uint64_t)ud_c->recv_pkt_buff;
  }

  ud_c->recv_wr = malloc(sizeof(struct ibv_recv_wr) * ud_c->max_recv_wrs);

  const int last_wr_idx = ud_c->max_recv_wrs - 1;
  for (int wr_idx = 0; wr_idx < ud_c->max_recv_wrs; wr_idx++) {
    struct ibv_recv_wr* rwr = &ud_c->recv_wr[wr_idx];

    if (!ud_c->is_header_only) {
      ud_c->recv_sgl[wr_idx].lkey = cb->dgram_buf_mr->lkey;
      ud_c->recv_sgl[wr_idx].length = _wings_ud_recv_max_pkt_size(ud_c);
    }

    rwr->num_sge = 1;

    // Link to the following WR; the last one terminates the list
    if (wr_idx == last_wr_idx)
      rwr->next = NULL;
    else
      rwr->next = &ud_c->recv_wr[wr_idx + 1];

    if (ud_c->is_header_only == 1)
      rwr->sg_list = ud_c->recv_sgl;
    else
      rwr->sg_list = &ud_c->recv_sgl[wr_idx];
  }
}

/// Mark the packet buffers of a channel as empty and post its initial
/// receives (max_recv_wrs of them).
///
/// Channels that carry a payload (neither CRD nor header-only) get the req_num
/// of every send / receive packet slot reset to 0 and use _wings_post_recvs;
/// CRD and header-only channels use _wings_post_hdr_only_recvs.
void
_wings_setup_incoming_buff_and_post_initial_recvs(ud_channel_t* ud_c)
{
  // Check the type first: the CRD initialiser in this file does not set
  // is_header_only, so it must not be read for CRD channels
  const int carries_payload = ud_c->type != CRD && ud_c->is_header_only == 0;

  if (carries_payload) {
    // init recv buffs as empty (not need for CRD since CRD msgs are
    // --inlined_payload-- header-only)
    for (uint16_t i = 0; i < ud_c->send_pkt_buff_len; ++i)
      _wings_get_nth_pkt_ptr_from_send_buff(ud_c, i)->req_num = 0;
    for (uint16_t i = 0; i < ud_c->recv_pkt_buff_len; ++i)
      _wings_get_nth_pkt_ptr_from_recv_buff(ud_c, i)->pkt.req_num = 0;
  }

  if (WINGS_ENABLE_POST_RECV_PRINTS && ud_c->enable_prints)
    colored_printf(YELLOW, "vvv Post Initial Receives: %s %d\n", ud_c->qp_name,
                   ud_c->max_recv_wrs);

  if (carries_payload)
    _wings_post_recvs(ud_c, ud_c->max_recv_wrs);
  else
    _wings_post_hdr_only_recvs(ud_c, ud_c->max_recv_wrs);
}

/* ---------------------------------------------------------------------------
   -------------------------------- QP Sharing -------------------------------
   ---------------------------------------------------------------------------
 */
/// Hash a NUL-terminated byte string: start from 5381 and, for every byte,
/// compute hash = hash * 33 + byte (unsigned long arithmetic, wrapping).
///
/// @param str  NUL-terminated string; must not be NULL
/// @return     the hash value (5381 for the empty string)
unsigned long
_wings_simple_hash(unsigned char* str)
{
  unsigned long acc = 5381;

  for (const unsigned char* cursor = str; *cursor != 0; ++cursor)
    acc = acc * 33 + *cursor;

  return acc;
}

/// Resolve the published QP of remote endpoint endpoint_id for this channel
/// and store its QP number and a freshly created address handle in
/// ud_c->remote_qps[endpoint_id].
///
/// The QP is looked up under the name "<hash(qp_name)>-<endpoint_id>"; the
/// lookup is retried every 200ms until the remote side has published it, so
/// this call blocks for as long as the remote QP is missing.
void
_wings_get_remote_qp(ud_channel_t* ud_c, uint8_t endpoint_id)
{
  const int ib_port_index = 0;
  char qp_global_name[HRD_QP_NAME_SIZE];

  // Bounded formatting so the name can never overrun its buffer
  snprintf(qp_global_name, sizeof(qp_global_name), "%lu-%d",
           _wings_simple_hash((unsigned char*)ud_c->qp_name), endpoint_id);

  // Poll until the remote endpoint has published its UD queue pair
  struct hrd_qp_attr* qp = hrd_get_published_qp(qp_global_name);
  while (qp == NULL) {
    usleep(200000);
    qp = hrd_get_published_qp(qp_global_name);
  }

  struct ibv_ah_attr ah_attr = {
      //-----INFINIBAND----------
      .is_global = 0,
      .dlid = (uint16_t)qp->lid,
      .sl = (uint8_t)qp->sl,
      .src_path_bits = 0,
      /* port_num (>= 1): device-local port for responses to this worker */
      .port_num = (uint8_t)(ib_port_index + 1),
  };

  if (is_roce == 1) {
    //-----RoCE----------
    // RoCE addresses the destination through its GID instead of a LID
    ah_attr.is_global = 1;
    ah_attr.dlid = 0;
    ah_attr.grh.dgid.global.interface_id = qp->gid_global_interface_id;
    ah_attr.grh.dgid.global.subnet_prefix = qp->gid_global_subnet_prefix;
    ah_attr.grh.sgid_index = 0;
    ah_attr.grh.hop_limit = 1;
  }

  ud_c->remote_qps[endpoint_id].qpn = (uint32_t)qp->qpn;
  ud_c->remote_qps[endpoint_id].ah = ibv_create_ah(ud_c->pd, &ah_attr);
  assert(ud_c->remote_qps[endpoint_id].ah != NULL);
}

/// Resolve the remote QPs of every channel in ud_c_array: for each channel,
/// look up every endpoint id in [0, num_channels) except the channel's own.
///
/// @param ud_c_array  channels whose remote QPs are resolved
/// @param ud_c_num    number of entries in ud_c_array
void
_wings_get_remote_qps(ud_channel_t** ud_c_array, uint16_t ud_c_num)
{
  for (int i = 0; i < ud_c_num; ++i)
    for (int j = 0; j < ud_c_array[i]->num_channels; ++j) {
      if (j == ud_c_array[i]->channel_id)
        continue;  // skip the local channel id
      _wings_get_remote_qp(ud_c_array[i], (uint8_t)j);
    }
}

/// Publish the local QP of every channel, resolve all remote QPs and then
/// (optionally) synchronise on a shared ready bit-vector.
///
/// Each channel's QP is published as "<hash(qp_name)>-<channel_id>", the same
/// naming scheme the remote lookup uses. When shared_rdy_var is NULL no
/// synchronisation is done (and worker_lid is expected to be 0); otherwise
/// this worker sets its own bit and waits until every bit is set.
///
/// @param ud_c_array      channels to publish; index i is published as QP i
/// @param ud_c_num        number of entries in ud_c_array
/// @param shared_rdy_var  ready bit-vector, or NULL to skip the barrier
/// @param worker_lid      local worker id (bit index in shared_rdy_var)
/// @param cb              control block owning the QPs
void
_wings_share_qp_info_via_memcached(ud_channel_t** ud_c_array, uint16_t ud_c_num,
                                   dbit_vector_t* shared_rdy_var,
                                   int worker_lid, struct hrd_ud_ctrl_blk* cb)
{
  for (int i = 0; i < ud_c_num; i++) {
    char qp_global_name[HRD_QP_NAME_SIZE];
    // Bounded formatting so the name can never overrun its buffer
    snprintf(qp_global_name, sizeof(qp_global_name), "%lu-%d",
             _wings_simple_hash((unsigned char*)ud_c_array[i]->qp_name),
             ud_c_array[i]->channel_id);
    hrd_publish_dgram_qp(cb, i, qp_global_name, WORKER_SL);
  }

  _wings_get_remote_qps(ud_c_array, ud_c_num);
  if (shared_rdy_var == NULL) {
    assert(worker_lid == 0);
    return;
  }

  // Announce that this worker is done; its bit must not have been set before
  assert(dbv_bit_get(*shared_rdy_var, worker_lid) == 0);
  dbv_bit_set(shared_rdy_var, (uint8_t)worker_lid);

  // WARNING (global) shared_rdy_var which is used as a g_share_qs_barrier must
  // be len of num_workers + 1
  while (!dbv_is_all_set(*shared_rdy_var))
    usleep(20000);

  assert(dbv_is_all_set(*shared_rdy_var));
}

/// Re-resolve the QP / address handle of remote endpoint endpoint_id (for this
/// channel and, when credit control is enabled, for its credit-providing
/// channel) and refresh the destination of the pre-built broadcast send WRs.
///
/// Only broadcast channels have their WRs rewritten here.
void
wings_reconfigure_wrs_ah(ud_channel_t* ud_c, uint8_t endpoint_id)
{
  _wings_get_remote_qp(ud_c, endpoint_id);
  if (!ud_c->disable_crd_ctrl)
    _wings_get_remote_qp(ud_c->channel_providing_crds, endpoint_id);

  /// TODO WARNING: this is untested and assumes that we always send to everyone
  if (!ud_c->is_bcast_channel) return;

  const uint16_t remotes = (uint16_t)(ud_c->num_channels - 1);
  const uint16_t total_bcast_wrs =
      (uint16_t)(ud_c->max_pcie_bcast_batch * remotes);

  for (int wr_idx = 0; wr_idx < total_bcast_wrs; ++wr_idx) {
    // Position inside the per-remote group, mapped to an endpoint id that
    // skips the local channel id
    const int pos_in_group = wr_idx % remotes;
    const uint16_t target_id =
        pos_in_group < ud_c->channel_id
            ? (uint16_t)pos_in_group
            : (uint16_t)((pos_in_group + 1) % ud_c->num_channels);

    struct ibv_send_wr* swr = &ud_c->send_wr[wr_idx];
    swr->wr.ud.ah = ud_c->remote_qps[target_id].ah;
    swr->wr.ud.remote_qpn = ud_c->remote_qps[target_id].qpn;
  }
}


================================================
FILE: tla/Hermes.tla
================================================
------------------------------- MODULE Hermes -------------------------------
EXTENDS     Integers,
            FiniteSets

CONSTANTS   H_NODES,
            H_MAX_VERSION
            
VARIABLES   msgs,
            nodeTS,
            nodeState, 
            nodeRcvedAcks,
            nodeLastWriter,
            nodeLastWriteTS,
            nodeWriteEpochID, 
            aliveNodes,
            epochID 
            
\* all Hermes (+ environment) variables
hvars == << msgs, nodeTS, nodeState, nodeRcvedAcks, nodeLastWriter, 
            nodeLastWriteTS, nodeWriteEpochID, aliveNodes, epochID >>

-------------------------------------------------------------------------------------
HMessage ==  \* Messages exchanged by the Protocol   
    [type: {"INV", "ACK"}, sender    : H_NODES,
                           epochID   : 0..(Cardinality(H_NODES) - 1),
                           version   : 0..H_MAX_VERSION,  
                           tieBreaker: H_NODES] 
    \* Note that we need not send Value w/ INVs, timestamp suffice to check consistency
        \union

    [type: {"VAL"},        \* optimization: epochID is not required for VALs
                           \* epochID   : 0..(Cardinality(H_NODES) - 1),
                           version   : 0..H_MAX_VERSION, 
                           tieBreaker: H_NODES] 

HTypeOK ==  \* The type correctness invariant
    /\  msgs            \subseteq HMessage
    /\ \A n \in H_NODES: nodeRcvedAcks[n] \subseteq (H_NODES \ {n})
    /\  nodeLastWriter  \in [H_NODES -> H_NODES]
    /\  nodeLastWriteTS \in [H_NODES -> [version   : 0..H_MAX_VERSION,
                                       tieBreaker: H_NODES         ]]
    /\  nodeTS          \in [H_NODES -> [version   : 0..H_MAX_VERSION,
                                       tieBreaker: H_NODES         ]]
    /\  nodeState       \in [H_NODES -> {"valid", "invalid", "invalid_write", 
                                         "write", "replay"}]
    \*  membership and epoch id related
    /\  aliveNodes      \subseteq H_NODES
    /\  epochID         \in 0..(Cardinality(H_NODES) - 1)
    /\  nodeWriteEpochID \in [H_NODES -> 0..(Cardinality(H_NODES) - 1)]
                                              

\* The consistency invariant: all alive nodes in valid state should have the same value / TS
HConsistent == 
    \A k,s \in aliveNodes:  \/ nodeState[k] /= "valid"
                            \/ nodeState[s] /= "valid" 
                            \/ nodeTS[k] = nodeTS[s]
                                              
HInit == \* The initial predicate
    /\  msgs            = {}
    \*  membership and epoch id related
    /\  epochID         = 0
    /\  aliveNodes      = H_NODES
    /\  nodeWriteEpochID = [n \in H_NODES |-> 0]
    \*  Init rest per node replica metadata
    /\  nodeRcvedAcks   = [n \in H_NODES |-> {}]
    /\  nodeState       = [n \in H_NODES |-> "valid"]
    /\  nodeLastWriter  = [n \in H_NODES |-> CHOOSE k \in H_NODES:
                                             \A m \in H_NODES: k <= m]
    /\  nodeTS          = [n \in H_NODES |-> [version |-> 0, 
                                              tieBreaker |-> 
                                              CHOOSE k \in H_NODES: 
                                               \A m \in H_NODES: k <= m]]
    /\  nodeLastWriteTS = [n \in H_NODES |-> [version |-> 0, 
                                              tieBreaker |-> 
                                              CHOOSE k \in H_NODES: 
                                               \A m \in H_NODES: k <= m]]
                                               
-------------------------------------------------------------------------------------

\* A buffer maintaining all network messages. Messages are only appended to this variable (not 
\* removed once delivered) intentionally, to check the protocol's tolerance to duplicates and reorderings 
send(m) == msgs' = msgs \union {m}

\* Check if all acknowledgments for a write have been received                                                  
receivedAllAcks(n) == (aliveNodes \ {n}) \subseteq nodeRcvedAcks[n]
        
equalTS(v1,tb1,v2,tb2) ==  \* Timestamp equality
    /\ v1 = v2
    /\ tb1 = tb2

greaterTS(v1,tb1,v2,tb2) == \* Timestamp comparison
    \* TRUE iff timestamp (v1, tb1) is strictly greater than (v2, tb2):
    \* versions are compared first and the tieBreaker only decides on equal versions
    \/ v1 > v2
    \/ /\   v1 = v2
       /\  tb1 > tb2
       
isAlive(n) == {n} \subseteq aliveNodes  \* TRUE iff n belongs to aliveNodes
                   
nodeFailure(n) == \* Emulate the failure of node n
    \* A node may only fail while at least 3 nodes are alive
    /\ Cardinality(aliveNodes) >= 3
    \* n leaves the membership and the epoch advances
    /\ aliveNodes'    = aliveNodes \ {n}
    /\ epochID'       = epochID + 1
    \* the acks gathered so far are discarded on every node
    /\ nodeRcvedAcks' = [k \in H_NODES |-> {}]
    /\ UNCHANGED <<msgs, nodeState, nodeTS, nodeLastWriter>>
    /\ UNCHANGED <<nodeLastWriteTS, nodeWriteEpochID>>

\* Frame condition: the membership-related variables stay as they are
h_upd_not_aliveNodes == UNCHANGED <<aliveNodes, epochID, nodeWriteEpochID>>
    
    
\* Frame condition: the message buffer and the per-node replica metadata stay as they are
h_upd_aliveNodes ==
    UNCHANGED <<msgs, nodeState, nodeTS, nodeLastWriter, nodeLastWriteTS, nodeRcvedAcks>>
                   
\* Frame condition combining both helpers: none of the variables they list changes
h_upd_nothing == h_upd_aliveNodes /\ h_upd_not_aliveNodes
    
-------------------------------------------------------------------------------------

\* Local bookkeeping of node n for an update with timestamp (newVersion, newTieBreaker):
\* n records itself as last writer, enters newState with ack set newAcks, stores the
\* timestamp both as its current and as its last-written one, and tags the update
\* with the current epoch.
h_upd_state(n, newVersion, newTieBreaker, newState, newAcks) ==
    /\  nodeTS'           = [nodeTS EXCEPT
                                ![n] = [@ EXCEPT !.version    = newVersion,
                                                 !.tieBreaker = newTieBreaker]]
    /\  nodeLastWriteTS'  = [nodeLastWriteTS EXCEPT
                                ![n] = [@ EXCEPT !.version    = newVersion,
                                                 !.tieBreaker = newTieBreaker]]
    /\  nodeState'        = [nodeState        EXCEPT ![n] = newState]
    /\  nodeRcvedAcks'    = [nodeRcvedAcks    EXCEPT ![n] = newAcks]
    /\  nodeLastWriter'   = [nodeLastWriter   EXCEPT ![n] = n]
    \* we always use the latest epochID
    /\  nodeWriteEpochID' = [nodeWriteEpochID EXCEPT ![n] = epochID]
                                            
\* Add to msgs a message of type msgType from node n carrying the timestamp
\* (newVersion, newTieBreaker). The message is always stamped with the latest epochID.
h_send_inv_or_ack(n, newVersion, newTieBreaker, msgType) ==
    LET outMsg == [type       |-> msgType,
                   epochID    |-> epochID,
                   sender     |-> n,
                   version    |-> newVersion,
                   tieBreaker |-> newTieBreaker]
    IN  send(outMsg)

h_actions_for_upd(n, newVersion, newTieBreaker, newState, newAcks) == \* Execute a write
    \* the membership is not affected by an update
    /\  UNCHANGED <<aliveNodes, epochID>>
    \* announce the update with an invalidation ...
    /\  h_send_inv_or_ack(n, newVersion, newTieBreaker, "INV")
    \* ... and apply it to the local metadata of n
    /\  h_upd_state(n, newVersion, newTieBreaker, newState, newAcks)
 

\* Write-replay at node n: re-issue the update n currently stores with the very same
\* timestamp (version, tie-breaker) and move n to "replay" state. The caller decides
\* through `acks` whether the already gathered acks are kept or reset.
h_actions_for_upd_replay(n, acks) ==
    LET ts == nodeTS[n]
    IN  h_actions_for_upd(n, ts.version, ts.tieBreaker, "replay", acks)

-------------------------------------------------------------------------------------

HRead(n) ==  \* Execute a read: only enabled in valid state, modifies nothing
    /\ nodeState[n] = "valid"
    /\ h_upd_not_aliveNodes
    /\ h_upd_aliveNodes
              
HWrite(n) == \* Execute a write
    \* Writes are issued from valid state only. Writes in invalid state are also
    \* supported as an optimization, i.e. with the guard
    \*     nodeState[n] \in {"valid", "invalid"}
    /\  nodeState[n] = "valid"
    \* Only to configurably terminate the model checking
    /\  nodeTS[n].version + 1 <= H_MAX_VERSION
    \* new timestamp: next version, with n itself as tie-breaker; acks start empty
    /\  h_actions_for_upd(n, nodeTS[n].version + 1, n, "write", {})


HCoordWriteReplay(n) == \* Execute a write-replay after a membership re-config
    \* n has an update in flight ...
    /\  nodeState[n] \in {"write", "replay"}
    \* ... that was tagged with an epoch older than the current one
    /\  epochID > nodeWriteEpochID[n]
    \* optimization to not replay when we have gathered acks from all alive
    /\  ~receivedAllAcks(n)
    \* replay with the same timestamp, keeping the acks gathered so far
    /\  h_actions_for_upd(n, nodeTS[n].version, nodeTS[n].tieBreaker,
                          "replay", nodeRcvedAcks[n])


HRcvAck(n) ==   \* Process a received acknowledgment
    \* acks are only collected in these states
    /\ nodeState[n] \in {"write", "invalid_write", "replay"}
    /\ \E m \in msgs:
            /\ m.type    = "ACK"
            \* only acks stamped with the current epoch are accepted
            /\ m.epochID = epochID
            \* the ack is neither from n itself nor from an already recorded sender
            /\ m.sender \notin ({n} \union nodeRcvedAcks[n])
            \* the ack must carry the timestamp of n's last write
            /\ equalTS(m.version, m.tieBreaker,
                       nodeLastWriteTS[n].version,
                       nodeLastWriteTS[n].tieBreaker)
            /\ nodeRcvedAcks' = [nodeRcvedAcks EXCEPT ![n] = @ \union {m.sender}]
    /\ UNCHANGED <<msgs, nodeLastWriter, nodeLastWriteTS, aliveNodes>>
    /\ UNCHANGED <<nodeTS, nodeState, epochID, nodeWriteEpochID>>


HSendVals(n) == \* Send validations once acknowledgments are received from all alive nodes
    /\ nodeState[n] \in {"write", "replay"}
    /\ receivedAllAcks(n)
    \* n returns to valid state
    /\ nodeState' = [nodeState EXCEPT ![n] = "valid"]
    \* the VAL message carries the timestamp n currently stores
    /\ msgs' = msgs \union {[type       |-> "VAL",
                             version    |-> nodeTS[n].version,
                             tieBreaker |-> nodeTS[n].tieBreaker]}
    /\ UNCHANGED <<nodeTS, nodeLastWriter, nodeLastWriteTS, nodeRcvedAcks>>
    /\ UNCHANGED <<aliveNodes, epochID, nodeWriteEpochID>>
 
HCoordinatorActions(n) ==   \* Actions of a read/write coordinator
    \* client operations
    \/ HRead(n)
    \/ HWrite(n)
    \* progress of an update that is in flight
    \/ HRcvAck(n)
    \/ HSendVals(n)
    \* recovery after failures
    \/ HCoordWriteReplay(n)

-------------------------------------------------------------------------------------               
    
HRcvInv(n) ==  \* Process a received invalidation
    \E m \in msgs: 
        \* only invalidations of the current epoch that were sent by another node
        /\ m.type     = "INV"
        /\ m.epochID  = epochID
        /\ m.sender  /= n
        \* always acknowledge a received invalidation (irrelevant to the timestamp);
        \* the ACK echoes the timestamp of the invalidation
        /\ h_send_inv_or_ack(n, m.version, m.tieBreaker, "ACK")
        /\ IF greaterTS(m.version, m.tieBreaker,
                        nodeTS[n].version, nodeTS[n].tieBreaker)
           THEN   \* greater timestamp: adopt it together with its writer
                  /\ nodeLastWriter' = [nodeLastWriter EXCEPT ![n] = m.sender]
                  /\ nodeTS' = [nodeTS EXCEPT ![n].version    = m.version,
                                          ![n].tieBreaker = m.tieBreaker]
                  \* valid / invalid / replay become "invalid";
                  \* any other state becomes "invalid_write"
                  /\ IF nodeState[n] \in {"valid", "invalid", "replay"}
                     THEN 
                        nodeState' = [nodeState EXCEPT ![n] = "invalid"]
                     ELSE 
                        nodeState' = [nodeState EXCEPT ![n] = "invalid_write"] 
           ELSE   \* timestamp not greater than the stored one: only the ACK is sent.
                  \* nodeWriteEpochID is covered once, by the frame condition below.
                  UNCHANGED <<nodeState, nodeTS, nodeLastWriter>>
        /\ UNCHANGED <<nodeLastWriteTS, aliveNodes, nodeRcvedAcks, epochID, nodeWriteEpochID>>
     
            
HRcvVal(n) ==   \* Process a received validation
    \* nothing to do for a node that is already valid
    /\ nodeState[n] /= "valid"
    \* some VAL message carries exactly the timestamp n currently stores
    /\ \E m \in msgs:
            /\ m.type = "VAL"
            /\ equalTS(m.version, m.tieBreaker,
                       nodeTS[n].version,
                       nodeTS[n].tieBreaker)
    /\ nodeState' = [nodeState EXCEPT ![n] = "valid"]
    /\ UNCHANGED <<msgs, nodeTS, nodeLastWriter, nodeLastWriteTS>>
    /\ UNCHANGED <<aliveNodes, nodeRcvedAcks, epochID, nodeWriteEpochID>>
   
HFollowerWriteReplay(n) == \* Execute a write-replay when coordinator failed
    /\  nodeState[n] \in {"invalid", "invalid_write"}
    \* the writer of the update n stores is no longer alive
    /\  nodeLastWriter[n] \notin aliveNodes
    \* replay that update from n, starting with an empty ack set
    /\  h_actions_for_upd_replay(n, {})

   
HFollowerActions(n) ==  \* Actions of a write follower
    \* message handling
    \/ HRcvInv(n)
    \/ HRcvVal(n)
    \* recovery after the failure of a coordinator
    \/ HFollowerWriteReplay(n)
 
------------------------------------------------------------------------------------- 

HNext == \* Hermes (read/write) protocol (Coordinator and Follower actions) + failures
    \* every step is taken by (or is the failure of) some alive node n
    \E n \in aliveNodes:
        \/ HCoordinatorActions(n)
        \/ HFollowerActions(n)
        \/ nodeFailure(n)


\* Complete Hermes specification: behaviors start in a state satisfying HInit, and
\* every step is either an HNext step or leaves hvars unchanged (stuttering).
H_Spec == HInit /\ [][HNext]_hvars


\* Properties expected of every behavior of H_Spec (stated here without a proof):
\* HTypeOK and HConsistent hold in all reachable states.
THEOREM H_Spec =>([]HTypeOK) /\ ([]HConsistent)

=============================================================================


================================================
FILE: tla/HermesRMWs.tla
================================================
------------------------------- MODULE HermesRMWs -------------------------------
EXTENDS     Hermes
            
VARIABLES   Rmsgs,
            nodeFlagRMW,
            committedRMWs,
            committedWrites
                                 
\* all Hermes (+ environment, + RMW) variables:
\* the first two rows are the variables used by the Hermes module this one extends,
\* the last row holds the variables declared by this module.
hrvars == << msgs, nodeTS, nodeState, nodeRcvedAcks, nodeLastWriter, 
             nodeLastWriteTS, nodeWriteEpochID, aliveNodes, epochID,
             Rmsgs, nodeFlagRMW, committedRMWs, committedWrites >>
-------------------------------------------------------------------------------------
HRMessage ==  \* Invalidation msgs exchanged by the Hermes Protocol w/ RMWs
    [type       : {"RINV"},
     \* RMW change: senders use 1 for an RMW invalidation and 0 for a write invalidation
     flagRMW    : 0..1,
     epochID    : 0..(Cardinality(H_NODES) - 1),
     sender     : H_NODES,
     version    : 0..H_MAX_VERSION,
     tieBreaker : H_NODES]

\* Set of all timestamp records: a bounded version plus a node id as tie-breaker
HRts == [tieBreaker: H_NODES, version: 0..H_MAX_VERSION]

HRTypeOK ==  \* The type correctness invariant
    \* type correctness of the underlying Hermes variables
    /\  HTypeOK
    \* every RMW-aware invalidation is well formed
    /\  \A m \in Rmsgs: m \in HRMessage
    /\  nodeFlagRMW \in [H_NODES -> 0..1]
    \* both commit logs contain timestamps only
    /\  (committedRMWs \union committedWrites) \subseteq HRts
    
\* RMW semantics invariant: two committed operations may only share a version
\* (i.e., have read the same value) when both of them are writes.
\*  - A committed RMW never has the version of a committed write, nor the version
\*    right below it (HRWrite bumps the version by 2 while HRRMW bumps it by 1).
\*  - Two committed RMWs with the same version also have the same tie-breaker.
HRSemanticsRMW ==
    /\ \A x \in committedRMWs, y \in committedWrites:
            x.version \notin {y.version, y.version - 1}
    /\ \A x, y \in committedRMWs:
            x.version = y.version => x.tieBreaker = y.tieBreaker
HRInit == \* The initial predicate
    \* initial state of the underlying Hermes variables
    /\  HInit
    \* no RMW-aware invalidation has been sent and nothing has been committed yet
    /\  Rmsgs           = {}
    /\  committedWrites = {}
    /\  committedRMWs   = {}
    /\  nodeFlagRMW     = [n \in H_NODES |-> 0]  \* RMW change
    
    
-------------------------------------------------------------------------------------

\* The buffer of all invalidation messages: Rmsgs holds every invalidation ever sent.
\* Messages are only added (never removed once delivered) on purpose, so that the
\* protocol's tolerance to duplicates and reorderings is checked as well.
HRsend(m) == Rmsgs' = Rmsgs \cup {m}

\* Frame condition: none of the variables declared by this module changes
hr_upd_nothing == UNCHANGED <<nodeFlagRMW, Rmsgs, committedRMWs, committedWrites>>

hr_completeWrite(ver, tieB) == \* Record the write with timestamp (ver, tieB) as committed
    LET ts == [version |-> ver, tieBreaker |-> tieB]
    IN  /\ committedWrites' = committedWrites \cup {ts}
        /\ UNCHANGED <<Rmsgs, nodeFlagRMW, committedRMWs>>

hr_completeRMW(ver, tieB) == \* Record the RMW with timestamp (ver, tieB) as committed
    LET ts == [version |-> ver, tieBreaker |-> tieB]
    IN  /\ committedRMWs' = committedRMWs \cup {ts}
        /\ UNCHANGED <<Rmsgs, nodeFlagRMW, committedWrites>>


-------------------------------------------------------------------------------------
\* Helper functions 
\* Same local bookkeeping as h_upd_state, plus remembering in nodeFlagRMW[n]
\* the RMW flag of the update (RMW change).
hr_upd_state(n, newVersion, newTieBreaker, newState, newAcks, flagRMW) ==
    /\  h_upd_state(n, newVersion, newTieBreaker, newState, newAcks)
    /\  nodeFlagRMW' = [nodeFlagRMW EXCEPT ![n] = flagRMW]

\* Add to Rmsgs an "RINV" invalidation from node n carrying the timestamp
\* (newVersion, newTieBreaker) and the RMW flag of the update.
hr_send_inv(n, newVersion, newTieBreaker, flagRMW) ==
    Rmsgs' = Rmsgs \union {[type       |-> "RINV",
                            epochID    |-> epochID,  \* we always use the latest epochID
                            flagRMW    |-> flagRMW,  \* RMW change
                            sender     |-> n,
                            version    |-> newVersion,
                            tieBreaker |-> newTieBreaker]}

hr_actions_for_upd(n, newVersion, newTieBreaker, newState, newAcks, flagRMW) == \* Execute a write
    \* membership, the plain Hermes message buffer and the commit logs are untouched
    /\  UNCHANGED <<aliveNodes, epochID, msgs>>
    /\  UNCHANGED <<committedRMWs, committedWrites>>
    \* announce the update with an RMW-aware invalidation ...
    /\  hr_send_inv(n, newVersion, newTieBreaker, flagRMW)
    \* ... and apply it to the local metadata of n
    /\  hr_upd_state(n, newVersion, newTieBreaker, newState, newAcks, flagRMW)
 

\* Replay at node n the update it currently stores: same timestamp (version,
\* tie-breaker), same RMW flag, state "replay". The caller decides through `acks`
\* whether the already gathered acks are kept or reset.
hr_actions_for_upd_replay(n, acks) ==
    LET ts == nodeTS[n]
    IN  hr_actions_for_upd(n, ts.version, ts.tieBreaker, "replay", acks, nodeFlagRMW[n])
 
 
-------------------------------------------------------------------------------------
\* Coordinator functions 

HRWrite(n) == \* Execute a write
    LET nextVersion == nodeTS[n].version + 2  \* a write advances the version by 2
    IN  \* Writes are issued from valid state only. Writes in invalid state are also
        \* supported as an optimization, i.e. with the guard
        \*     nodeState[n] \in {"valid", "invalid"}
        /\  nodeState[n] = "valid"
        \* Only to configurably terminate the model checking
        /\  nextVersion <= H_MAX_VERSION
        \* n is the tie-breaker, acks start empty, RMW flag 0 marks a write
        /\  hr_actions_for_upd(n, nextVersion, n, "write", {}, 0)
   
HRRMW(n) == \* Execute an RMW
    LET nextVersion == nodeTS[n].version + 1  \* an RMW advances the version by 1
    IN  /\  nodeState[n] = "valid"
        \* Only to configurably terminate the model checking
        /\  nextVersion <= H_MAX_VERSION
        \* n is the tie-breaker, acks start empty, RMW flag 1 marks an RMW
        /\  hr_actions_for_upd(n, nextVersion, n, "write", {}, 1)
               
HRWriteReplay(n) == \* Execute a write-replay
    \* the update in flight at n is a write (not an RMW) ...
    /\  nodeFlagRMW[n] = 0
    /\  nodeState[n] \in {"write", "replay"}
    \* ... tagged with an epoch older than the current one
    /\  epochID > nodeWriteEpochID[n]
    \* optimization to not replay when we have gathered acks from all alive
    /\  ~receivedAllAcks(n)
    \* a write-replay keeps the acks gathered so far
    /\  hr_actions_for_upd_replay(n, nodeRcvedAcks[n])

HRRMWReplay(n) == \* Execute an RMW-replay
    \* the update in flight at n is an RMW ...
    /\  nodeFlagRMW[n] = 1
    /\  nodeState[n] \in {"write", "replay"}
    \* ... tagged with an epoch older than the current one
    /\  epochID > nodeWriteEpochID[n]
    \* optimization to not replay when we have gathered acks from all alive
    /\  ~receivedAllAcks(n)
    \* unlike a write-replay, an RMW-replay restarts from an empty ack set
    /\  hr_actions_for_upd_replay(n, {})

\* Keep the HRead, HRcvAck and HSendVals the same as Hermes w/o RMWs
HRRead(n) ==  \* Hermes read that also leaves the RMW variables unchanged
    /\ HRead(n)
    /\ UNCHANGED <<nodeFlagRMW, Rmsgs, committedRMWs, committedWrites>>
    
HRRcvAck(n) ==  \* Hermes ack processing that also leaves the RMW variables unchanged
    /\ HRcvAck(n)
    /\ UNCHANGED <<nodeFlagRMW, Rmsgs, committedRMWs, committedWrites>>
    
HRSendValsRMW(n) == \* Complete an RMW: send validations and record it as committed
    LET ts == nodeTS[n]
    IN  /\ nodeFlagRMW[n] = 1
        /\ HSendVals(n)
        /\ hr_completeRMW(ts.version, ts.tieBreaker)

HRSendValsWrite(n) == \* Complete a write: send validations and record it as committed
    LET ts == nodeTS[n]
    IN  /\ nodeFlagRMW[n] = 0
        /\ HSendVals(n)
        /\ hr_completeWrite(ts.version, ts.tieBreaker)

HRCoordinatorActions(n) ==   \* Actions of a read/write/RMW coordinator
    \* client operations
    \/ HRRead(n)
    \/ HRWrite(n)
    \/ HRRMW(n)
    \* progress of an update that is in flight
    \/ HRRcvAck(n)
    \/ HRSendValsWrite(n)
    \/ HRSendValsRMW(n)
    \* replays
    \/ HRWriteReplay(n)
    \/ HRRMWReplay(n)
    
-------------------------------------------------------------------------------------               
\* Follower functions 
\* New state of node n after it accepts an invalidation with a greater timestamp:
\*  - valid / invalid / replay                          -> "invalid"
\*  - write / invalid_write with nodeFlagRMW[n] = 0     -> "invalid_write"
\*  - anything else (e.g. write with nodeFlagRMW[n] = 1) -> "invalid"
\* NOTE(review): the last case presumably means a pending RMW is given up
\* rather than tracked further -- confirm against the protocol description.
hr_upd_state_greater_inv(n) ==
        LET cur == nodeState[n]
            nxt == IF cur \in {"valid", "invalid", "replay"}
                   THEN "invalid"
                   ELSE IF cur \in {"write", "invalid_write"} /\ nodeFlagRMW[n] = 0
                        THEN "invalid_write"
                        ELSE "invalid"
        IN  nodeState' = [nodeState EXCEPT ![n] = nxt]
        

HRRcvWriteInv(n) ==  \* Process a received invalidation for a write
    \E m \in Rmsgs: 
        \* only write invalidations (flagRMW = 0) of the current epoch sent by another node
        /\ m.type = "RINV"
        /\ m.epochID  = epochID
        /\ m.sender /= n
        /\ m.flagRMW = 0 \* RMW change
        \* always acknowledge a received invalidation (irrelevant to the timestamp)
        \* the ACK goes to msgs and echoes the timestamp of the invalidation
        /\ h_send_inv_or_ack(n, m.version, m.tieBreaker, "ACK") 
        /\ IF greaterTS(m.version, m.tieBreaker,
                        nodeTS[n].version, nodeTS[n].tieBreaker)
           THEN 
                \* greater timestamp: adopt it together with its writer and RMW flag,
                \* then derive the new state from the state n is currently in
                /\ nodeLastWriter' = [nodeLastWriter EXCEPT ![n] = m.sender]
                /\ nodeFlagRMW'    = [nodeFlagRMW    EXCEPT ![n] = m.flagRMW] \* RMW change            
                /\ nodeTS' = [nodeTS EXCEPT ![n].version    = m.version,
                                          ![n].tieBreaker = m.tieBreaker]
                /\ hr_upd_state_greater_inv(n)
           ELSE
                \* timestamp not greater than the stored one: only the ACK is sent
                /\ UNCHANGED <<nodeState, nodeTS, nodeLastWriter, nodeFlagRMW>>
        /\ UNCHANGED <<nodeLastWriteTS, aliveNodes, nodeRcvedAcks, Rmsgs, 
                       epochID, nodeWriteEpochID, committedRMWs, committedWrites>>
 
HRRcvRMWInv(n) ==  \* Process a received invalidation for an RMW
    \E m \in Rmsgs: 
        \* only RMW invalidations (flagRMW = 1) of the current epoch sent by another node
        /\ m.type = "RINV"
        /\ m.epochID  = epochID
        /\ m.sender /= n
        /\ m.flagRMW = 1        
        \* unlike write invalidations, an RMW invalidation is acknowledged only
        \* when its timestamp is greater than or equal to the one n stores
        /\ IF greaterTS(m.version, m.tieBreaker,
                        nodeTS[n].version, nodeTS[n].tieBreaker)
           THEN
                \* greater timestamp: adopt it together with its writer and RMW flag
                /\ nodeLastWriter' = [nodeLastWriter EXCEPT ![n] = m.sender]
                /\ nodeFlagRMW'    = [nodeFlagRMW    EXCEPT ![n] = m.flagRMW] \* RMW change            
                /\ nodeTS' = [nodeTS EXCEPT ![n].version    = m.version,
                                          ![n].tieBreaker = m.tieBreaker]
                \* acknowledge a received invalidation (w/ greater timestamp)
                /\ h_send_inv_or_ack(n, m.version, m.tieBreaker, "ACK") 
                /\ hr_upd_state_greater_inv(n)
                /\ UNCHANGED <<Rmsgs>>
            ELSE IF equalTS(m.version, m.tieBreaker,
                            nodeTS[n].version, nodeTS[n].tieBreaker)
            THEN
                \* acknowledge a received invalidation (w/ equal timestamp)
                /\ h_send_inv_or_ack(n, m.version, m.tieBreaker, "ACK") 
                /\ UNCHANGED <<nodeState, nodeTS, nodeLastWriter, nodeFlagRMW, Rmsgs>>
            ELSE \* smaller TS
                \* no ACK: n answers with an invalidation carrying its own stored
                \* timestamp and RMW flag; nothing is added to msgs
                /\ hr_send_inv(n, nodeTS[n].version, nodeTS[n].tieBreaker, nodeFlagRMW[n])
                /\ UNCHANGED <<nodeState, nodeTS, nodeLastWriter, nodeFlagRMW, msgs>>
        /\ UNCHANGED <<nodeLastWriteTS, aliveNodes, nodeRcvedAcks, epochID, 
                       nodeWriteEpochID, committedRMWs, committedWrites>> 
 
         
\* Validations are processed exactly as in Hermes w/o RMWs;
\* the RMW variables are left unchanged
HRRcvVal(n) ==
    /\ HRcvVal(n)
    /\ UNCHANGED <<nodeFlagRMW, Rmsgs, committedRMWs, committedWrites>>
    
    
HRFollowerWriteReplay(n) == \* Execute a write-replay when coordinator failed
    /\  nodeState[n] \in {"invalid", "invalid_write"}
    \* the writer of the update n stores is no longer alive
    /\  nodeLastWriter[n] \notin aliveNodes
    \* replay that update from n, starting with an empty ack set
    /\  hr_actions_for_upd_replay(n, {})
                           

HRFollowerActions(n) ==  \* Actions of a write follower
    \* message handling
    \/ HRRcvWriteInv(n)
    \/ HRRcvRMWInv(n)
    \/ HRRcvVal(n)
    \* recovery after the failure of a coordinator
    \/ HRFollowerWriteReplay(n)
-------------------------------------------------------------------------------------                       

HRNodeFailure(n) == \* Hermes node failure that also leaves the RMW variables unchanged
    /\ nodeFailure(n)
    /\ UNCHANGED <<nodeFlagRMW, Rmsgs, committedRMWs, committedWrites>>
    
    
HRNext == \* Hermes (read,write RMWs) protocol (Coordinator and Follower actions) + failures
    \* every step is taken by (or is the failure of) some alive node n
    \E n \in aliveNodes:
        \/ HRCoordinatorActions(n)
        \/ HRFollowerActions(n)
        \/ HRNodeFailure(n)
            
            
\* Hermes w/ RMW Spec: behaviors start in HRInit, and every step is either an
\* HRNext step or leaves hrvars unchanged (stuttering).
\* The theorem (stated without a proof) lists the invariants expected to hold.
HRSpec == HRInit /\ [][HRNext]_hrvars
THEOREM HRSpec =>([]HRTypeOK) /\ ([]HConsistent) /\ ([]HRSemanticsRMW)

\* A hacky way to run Hermes w/o RMWs from the same model: only the plain Hermes
\* actions (HNext) are allowed, and each of them must leave the RMW variables
\* unchanged (hr_upd_nothing).
\* The theorem (stated without a proof) lists the invariants expected to hold.
HSpec == HRInit /\ [][HNext /\ hr_upd_nothing]_hrvars
THEOREM HSpec =>([]HRTypeOK) /\ ([]HConsistent)

=============================================================================


================================================
FILE: tla/README.md
================================================
# Hermes-Protocol
TLA spec - Hermes: fault-tolerant replication protocol with strong consistency and high performance

---
**Warning:** 
the protocol-actions PNG contains some optimizations over the Hermes protocol as presented 
in the paper, such as issuing writes while in the Invalid state.