[
  {
    "path": ".clang-format",
    "content": "﻿---\nBasedOnStyle: Chromium\nAlignAfterOpenBracket: Align\nAlignConsecutiveDeclarations: 'false'\nAlignEscapedNewlines: Left\nAlignOperands: 'true'\nAllowShortFunctionsOnASingleLine: All\nAllowShortIfStatementsOnASingleLine: WithoutElse\nAlwaysBreakAfterDefinitionReturnType: TopLevel\nAlwaysBreakTemplateDeclarations: 'Yes'\nBinPackArguments: 'true'\nBinPackParameters: 'true'\nBreakBeforeBraces: WebKit\nCompactNamespaces: 'false'\nCpp11BracedListStyle: 'true'\nIndentWrappedFunctionNames: 'false'\nLanguage: Cpp\nNamespaceIndentation: None\nSpaceAfterTemplateKeyword: 'true'\nSpaceBeforeAssignmentOperators: 'true'\nSpaceBeforeCpp11BracedList: 'true'\nSpaceBeforeParens: ControlStatements\nSpaceInEmptyParentheses: 'false'\nSpacesInAngles: 'false'\nSpacesInParentheses: 'false'\nSpacesInSquareBrackets: 'false'\nUseTab: Never\n\n...\n"
  },
  {
    "path": ".gitignore",
    "content": "# ignore temporary files\n.*.swp\n\\#*#\n*.pyc\n*.o\n*.hi\n*.dump\n*.log\n*.rej\n*.orig\n*.patch\n*.diff\n.tags*\n\n# ignore executables\n/src/mica/test\n/src/libhrd/main\n/src/herd-hybrid/main\n/src/herd-UD/main\nsrc/Armonia/main\n/src/CR/cr\n/src/hermes/hermes\n/src/hades/hades\n/src/hermes/hermes-wings\n\n# ignore debug files\n/debug/*.txt\n# ignore traces\n/traces/*.txt\n# ignore ide files\n/.idea/\n/cmake-build-debug/\n/src/cmake-build-debug/\n/src/.idea/\n/src/cache/cmake-build-debug/\n/src/cache/.idea/\n/src/Armonia/armonia-ec\n/src/Armonia/armonia-sc\n/src/Armonia/throughput.txt\n/src/herd-UD/throughput.txt\n/bin/traces\n#/results/*\n/exec/results/*.txt\n/exec/results/xput/*.txt\n/exec/results/xput/*.csv\n/exec/results/xput/per-node/*.csv\n/exec/results/xput/per-node/*.txt\n/exec/results/xput/all-nodes/*.txt\n/exec/results/latency/*.txt\n/exec/results/latency/*.csv\n/results/*.txt\n/results/xput/*.txt\n/results/xput/*.csv\n/results/xput/per-node/*.csv\n/results/xput/per-node/*.txt\n/results/xput/all-nodes/*.txt\n/results/latency/*.txt\n/results/latency/*.csv\ntraces/trace-parts/*\n/results/scattered-results/*\n/results/aggregated-system-results/*.csv\n/traces/system-traces/*.txt\n/traces/current-splited-traces/*.txt\n/traces/*.txt\ntraces/\n./exec/hermesKV\n./exec/rCRAQ\n./exec/hades\n"
  },
  {
    "path": "AUTHORS",
    "content": "Run `git shortlog -se` for an up-to-date list of contributors.\n---\n\nPrincipal authors: Antonios Katsarakis  <antonios.katsarakis AT ed.ac.uk>\n\t\t   Vasilis  Gavrielatos <vasilis.gavrielatos AT ed.ac.uk>\n"
  },
  {
    "path": "CMakeLists.txt",
    "content": "######################################################################################\n# WARNING: DO NOT MAKE through cmake use the Makefile in /exec/ to compile instead!!!!\n######################################################################################\n\ncmake_minimum_required(VERSION 2.8.12)\nproject(hermes)\n\nset(Hermes_VERSION_MAJOR 1)\nset(Hermes_VERSION_MINOR 0)\n\ninclude_directories(include/hermes\n                    include/libhrd\n                    /usr/include/\n                    include/optik\n        include/mica-herd)\n\nset(CMAKE_C_STANDARD 11)\nset(CMAKE_C_FLAGS \"${CMAKE_C_FLAGS} -Wall\")\n\nset(SOURCE_FILES_cr\n        #Source files\n        src/CR/cr_worker.c\n\n        src/wings/wings.c\n\n        src/hermes/main.c\n        src/hermes/stats.c\n        src/hermes/spacetime.c\n\n        src/mica-herd/mica.c\n        src/mica-herd/city.c\n        src/mica-herd/herd.c\n\n\n        ##### header files ####\n        include/wings/wings.h\n        include/wings/wings_api.h\n\n        include/mica-herd/city.h\n        include/mica-herd/hrd.h\n        include/mica-herd/sizes.h\n\n        include/hermes/util.h\n        include/hermes/config.h\n        include/utils/bit_vector.h\n        include/utils/concur_ctrl.h\n        src/CR/crKV.c)\n\n\nset(SOURCE_FILES_hades\n        #Source files\n        src/wings/wings.c\n        src/hades/hades.c\n\n        ##### header files ####\n        include/wings/wings_api.h\n        include/wings/wings.h\n        include/hades/hades.h\n        src/hades/test.c)\n\n\nset(SOURCE_FILES_hermes\n        #Source files\n        src/hermes/main.c\n        src/hermes/util.c\n#        src/hermes/worker.c\n        src/hermes/hermes_worker.c\n        src/hermes/stats.c\n        src/hermes/spacetime.c\n        src/mica-herd/herd.c\n        src/mica-herd/mica.c\n        src/mica-herd/city.c\n\n        src/wings/wings.c\n\n        ##### header files ####\n        include/mica-herd/hrd.h\n        
include/mica-herd/city.h\n        include/mica-herd/sizes.h\n\n        include/hermes/util.h\n        include/hermes/config.h\n        include/utils/concur_ctrl.h\n        include/utils/bit_vector.h\n        include/hades/hades.h\n        include/wings/wings.h\n        include/wings/wings_api.h src/hermes/hermesKV.c)\n\n\nadd_executable(cr ${SOURCE_FILES_cr})\nadd_executable(hades ${SOURCE_FILES_hades})\nadd_executable(hermes ${SOURCE_FILES_hermes})\ntarget_link_libraries(cr pthread ibverbs rt memcached numa rdmacm)\ntarget_link_libraries(hades pthread ibverbs rt memcached numa rdmacm)\ntarget_link_libraries(hermes pthread ibverbs rt memcached numa rdmacm)\n\n"
  },
  {
    "path": "LICENSE",
    "content": "\n                                 Apache License\n                           Version 2.0, January 2004\n                        http://www.apache.org/licenses/\n\n   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n   1. Definitions.\n\n      \"License\" shall mean the terms and conditions for use, reproduction,\n      and distribution as defined by Sections 1 through 9 of this document.\n\n      \"Licensor\" shall mean the copyright owner or entity authorized by\n      the copyright owner that is granting the License.\n\n      \"Legal Entity\" shall mean the union of the acting entity and all\n      other entities that control, are controlled by, or are under common\n      control with that entity. For the purposes of this definition,\n      \"control\" means (i) the power, direct or indirect, to cause the\n      direction or management of such entity, whether by contract or\n      otherwise, or (ii) ownership of fifty percent (50%) or more of the\n      outstanding shares, or (iii) beneficial ownership of such entity.\n\n      \"You\" (or \"Your\") shall mean an individual or Legal Entity\n      exercising permissions granted by this License.\n\n      \"Source\" form shall mean the preferred form for making modifications,\n      including but not limited to software source code, documentation\n      source, and configuration files.\n\n      \"Object\" form shall mean any form resulting from mechanical\n      transformation or translation of a Source form, including but\n      not limited to compiled object code, generated documentation,\n      and conversions to other media types.\n\n      \"Work\" shall mean the work of authorship, whether in Source or\n      Object form, made available under the License, as indicated by a\n      copyright notice that is included in or attached to the work\n      (an example is provided in the Appendix below).\n\n      \"Derivative Works\" shall mean any work, whether in Source or Object\n      
form, that is based on (or derived from) the Work and for which the\n      editorial revisions, annotations, elaborations, or other modifications\n      represent, as a whole, an original work of authorship. For the purposes\n      of this License, Derivative Works shall not include works that remain\n      separable from, or merely link (or bind by name) to the interfaces of,\n      the Work and Derivative Works thereof.\n\n      \"Contribution\" shall mean any work of authorship, including\n      the original version of the Work and any modifications or additions\n      to that Work or Derivative Works thereof, that is intentionally\n      submitted to Licensor for inclusion in the Work by the copyright owner\n      or by an individual or Legal Entity authorized to submit on behalf of\n      the copyright owner. For the purposes of this definition, \"submitted\"\n      means any form of electronic, verbal, or written communication sent\n      to the Licensor or its representatives, including but not limited to\n      communication on electronic mailing lists, source code control systems,\n      and issue tracking systems that are managed by, or on behalf of, the\n      Licensor for the purpose of discussing and improving the Work, but\n      excluding communication that is conspicuously marked or otherwise\n      designated in writing by the copyright owner as \"Not a Contribution.\"\n\n      \"Contributor\" shall mean Licensor and any individual or Legal Entity\n      on behalf of whom a Contribution has been received by Licensor and\n      subsequently incorporated within the Work.\n\n   2. Grant of Copyright License. 
Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      copyright license to reproduce, prepare Derivative Works of,\n      publicly display, publicly perform, sublicense, and distribute the\n      Work and such Derivative Works in Source or Object form.\n\n   3. Grant of Patent License. Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      (except as stated in this section) patent license to make, have made,\n      use, offer to sell, sell, import, and otherwise transfer the Work,\n      where such license applies only to those patent claims licensable\n      by such Contributor that are necessarily infringed by their\n      Contribution(s) alone or by combination of their Contribution(s)\n      with the Work to which such Contribution(s) was submitted. If You\n      institute patent litigation against any entity (including a\n      cross-claim or counterclaim in a lawsuit) alleging that the Work\n      or a Contribution incorporated within the Work constitutes direct\n      or contributory patent infringement, then any patent licenses\n      granted to You under this License for that Work shall terminate\n      as of the date such litigation is filed.\n\n   4. Redistribution. 
You may reproduce and distribute copies of the\n      Work or Derivative Works thereof in any medium, with or without\n      modifications, and in Source or Object form, provided that You\n      meet the following conditions:\n\n      (a) You must give any other recipients of the Work or\n          Derivative Works a copy of this License; and\n\n      (b) You must cause any modified files to carry prominent notices\n          stating that You changed the files; and\n\n      (c) You must retain, in the Source form of any Derivative Works\n          that You distribute, all copyright, patent, trademark, and\n          attribution notices from the Source form of the Work,\n          excluding those notices that do not pertain to any part of\n          the Derivative Works; and\n\n      (d) If the Work includes a \"NOTICE\" text file as part of its\n          distribution, then any Derivative Works that You distribute must\n          include a readable copy of the attribution notices contained\n          within such NOTICE file, excluding those notices that do not\n          pertain to any part of the Derivative Works, in at least one\n          of the following places: within a NOTICE text file distributed\n          as part of the Derivative Works; within the Source form or\n          documentation, if provided along with the Derivative Works; or,\n          within a display generated by the Derivative Works, if and\n          wherever such third-party notices normally appear. The contents\n          of the NOTICE file are for informational purposes only and\n          do not modify the License. 
You may add Your own attribution\n          notices within Derivative Works that You distribute, alongside\n          or as an addendum to the NOTICE text from the Work, provided\n          that such additional attribution notices cannot be construed\n          as modifying the License.\n\n      You may add Your own copyright statement to Your modifications and\n      may provide additional or different license terms and conditions\n      for use, reproduction, or distribution of Your modifications, or\n      for any such Derivative Works as a whole, provided Your use,\n      reproduction, and distribution of the Work otherwise complies with\n      the conditions stated in this License.\n\n   5. Submission of Contributions. Unless You explicitly state otherwise,\n      any Contribution intentionally submitted for inclusion in the Work\n      by You to the Licensor shall be under the terms and conditions of\n      this License, without any additional terms or conditions.\n      Notwithstanding the above, nothing herein shall supersede or modify\n      the terms of any separate license agreement you may have executed\n      with Licensor regarding such Contributions.\n\n   6. Trademarks. This License does not grant permission to use the trade\n      names, trademarks, service marks, or product names of the Licensor,\n      except as required for reasonable and customary use in describing the\n      origin of the Work and reproducing the content of the NOTICE file.\n\n   7. Disclaimer of Warranty. Unless required by applicable law or\n      agreed to in writing, Licensor provides the Work (and each\n      Contributor provides its Contributions) on an \"AS IS\" BASIS,\n      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n      implied, including, without limitation, any warranties or conditions\n      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n      PARTICULAR PURPOSE. 
You are solely responsible for determining the\n      appropriateness of using or redistributing the Work and assume any\n      risks associated with Your exercise of permissions under this License.\n\n   8. Limitation of Liability. In no event and under no legal theory,\n      whether in tort (including negligence), contract, or otherwise,\n      unless required by applicable law (such as deliberate and grossly\n      negligent acts) or agreed to in writing, shall any Contributor be\n      liable to You for damages, including any direct, indirect, special,\n      incidental, or consequential damages of any character arising as a\n      result of this License or out of the use or inability to use the\n      Work (including but not limited to damages for loss of goodwill,\n      work stoppage, computer failure or malfunction, or any and all\n      other commercial damages or losses), even if such Contributor\n      has been advised of the possibility of such damages.\n\n   9. Accepting Warranty or Additional Liability. While redistributing\n      the Work or Derivative Works thereof, You may choose to offer,\n      and charge a fee for, acceptance of support, warranty, indemnity,\n      or other liability obligations and/or rights consistent with this\n      License. However, in accepting such obligations, You may act only\n      on Your own behalf and on Your sole responsibility, not on behalf\n      of any other Contributor, and only if You agree to indemnify,\n      defend, and hold each Contributor harmless for any liability\n      incurred by, or claims asserted against, such Contributor by reason\n      of your accepting any such warranty or additional liability.\n\n   END OF TERMS AND CONDITIONS\n\n   APPENDIX: How to apply the Apache License to your work.\n\n      To apply the Apache License to your work, attach the following\n      boilerplate notice, with the fields enclosed by brackets \"[]\"\n      replaced with your own identifying information. 
(Don't include\n      the brackets!)  The text should be enclosed in the appropriate\n      comment syntax for the file format. We also recommend that a\n      file or class name and description of purpose be included on the\n      same \"printed page\" as the copyright notice for easier\n      identification within third-party archives.\n\n   Copyright [yyyy] [name of copyright owner]\n\n   Licensed under the Apache License, Version 2.0 (the \"License\");\n   you may not use this file except in compliance with the License.\n   You may obtain a copy of the License at\n\n       http://www.apache.org/licenses/LICENSE-2.0\n\n   Unless required by applicable law or agreed to in writing, software\n   distributed under the License is distributed on an \"AS IS\" BASIS,\n   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n   See the License for the specific language governing permissions and\n   limitations under the License.\n"
  },
  {
    "path": "README.md",
    "content": "# Hermes Reliable Replication Protocol\n\n<img align=\"left\" height=\"160\" src=\"https://github.com/akatsarakis/Hermes/blob/master/hermes-logo.png\">\n\nThis is the publicly available artifact repository supporting the ASPLOS'20 paper [_\"Hermes: A Fast, Fault-Tolerant and Linearizable Replication Protocol\"_](http://hermes-protocol.com \"Hermes Arxiv version\"). The repository contains both code to experimentally evaluate Hermes(KV) and complete Hermes TLA+ specifications which can be used to verify Hermes correctness via model-checking.\n\n[![top picks](https://badgen.net/badge/honorable%20mention/top%20picks%20'20/d99e14)](https://www.sigarch.org/call-contributions/ieee-micro-top-picks/)\n[![available](https://badgen.net/badge/acm%20badge/available/117c00)](https://www.acm.org/publications/policies/artifact-review-badging#available)\n[![functional](https://badgen.net/badge/acm%20badge/functional/FB1f44)](https://www.acm.org/publications/policies/artifact-review-badging#functional)\n[![stars](https://badgen.net/github/stars/ease-lab/Hermes)]()\n\n[![license](https://badgen.net/badge/webpage/Hermes/blue)](http://hermes-protocol.com/)\n[![license](https://badgen.net/badge/license/Apache%202.0/blue)](https://github.com/ease-lab/Hermes/blob/master/LICENSE)\n[![last commit](https://badgen.net/github/last-commit/ease-lab/Hermes)]()\n<a href=\"https://twitter.com/intent/follow?screen_name=ease_lab\" target=\"_blank\">\n<img src=\"https://img.shields.io/twitter/follow/ease_lab?style=social&logo=twitter\" alt=\"follow on Twitter\"></a>\n\n\n## Citation\n```\n@inbook{Katsarakis:20,\nauthor = {Katsarakis, Antonios and Gavrielatos, Vasilis and Katebzadeh, M.R. 
Siavash and Joshi, Arpit and Dragojevic, Aleksandar and Grot, Boris and Nagarajan, Vijay},\ntitle = {Hermes: A Fast, Fault-Tolerant and Linearizable Replication Protocol},\nyear = {2020},\npublisher = {Association for Computing Machinery},\naddress = {New York, NY, USA},\nbooktitle = {Proceedings of the Twenty-Fifth International Conference on Architectural Support for Programming Languages and Operating Systems},\npages = {201–217},\nnumpages = {17}\n}\n```\n\n----\n## High Performance Features\n- _Reads_: i) Local ii) Load-balanced (served by any replica)\n- _Updates (Writes and RMWs)_: i) Inter-key concurrent ii) Decentralized iii) Fast (1rtt commit -- any replica)\n- _Writes_: iv) Non-conflicting (i.e., never abort)\n\n## Consistency and Properties\nLinearizable reads, writes and RMWs with the following properties:\n1. _Writes_: from a live replica _always commit_ after Invalidating (and getting acknowledgments from) the rest of the live replicas. \n1. _RMWs_: at most one of possible concurrent RMWs to a key can commit, and this only once all acknowledgments from live replicas are gathered.\n1. _Reads_: return the local value if the targeted keys are found in the Valid state and the coordinator was considered live at the time of reading. 
The latter can be ensured locally if the coordinator has a lease for (and is part of) the membership.\n\n## Fault Tolerance\nCoupling Invalidations with per-key logical timestamps (i.e., Lamport clocks) and propagating the value to be updated with the invalidation message (_early value propagation_), Hermes allows any replica blocked by an update (write or RMW) to safely replay the update and unblock itself and the rest of the followers.\n\n----\n\n## Hardware dependencies\n\nA homogeneous cluster of x86_64 nodes interconnected via RDMA network cards and switches \n(tested on \"Mellanox ConnectX-4\" Infiniband infrastructure).\n\n\n## Software requirements\n\nLinux OS (tested on Ubuntu 18.04 4.15.0-55-generic) with root access.\n\nThe software is tested using the following version of Mellanox OFED RDMA drivers\n`MLNX_OFED_LINUX-4.4-2.0.7.0`.\n\nThird-party libraries that you will require to run the experiments include:\n1. _parallel_ (Cluster management scripts only)\n1. _libmemcached-dev_ (used to exchange QP information for the setup of RDMA connections)\n1. _libnuma-dev_\t(for mbind)\n\n\n## Setup\n\nOn every node:\n1. Install Mellanox OFED ibverbs drivers\n1. `./hermes/bin/setup.sh`\n\nOn manager (just pick one node in the cluster):\n1. Fill variables in `/hermes/exec/hosts.sh`\n1. Configure setup and default parameters in `/hermes/include/hermes/config.h`\n1. From `/hermes/exec/` compile _hermesKV_ through make\n1. scp  _hermesKV_ and the configured hosts.sh in the `/hermes/exec/` directory of all other nodes in the cluster. 
\n\n\n## Compilation\n\n`cd hermes/exec; make`\n\n_Warning_: Do not compile through cmake; instead use the Makefile in the exec/ directory.\n\n\n## Run\n\nRun first on manager:\n`./run-hermesKV.sh <experiment_parameters>`\n\nThen run on all other member nodes \n`./run-hermesKV.sh <experiment_parameters>`\n\n> Note that some members will eagerly terminate if the experiment \n  uses a smaller number of nodes than specified in hosts.sh\n  \nAn experiment example for three nodes, 12 worker threads and 35% write ratio would be as follows:\n`./run-hermesKV.sh -W 12 -w 350 -M 3`\nSupported command-line arguments for the experiments are detailed in the run-hermesKV.sh script.\n\n\n---\n## Acknowledgments\n Hermes is based on [HERD/MICA](https://github.com/efficient/HERD \"Apache 2.0\") design as an underlying KVS, the code of which we have adapted to implement HermesKV.\n\n## Other Implementations of Hermes\n\n- [Odyssey](https://github.com/vasigavr1/Odyssey) - Hermes is also implemented in the Odyssey framework by [Vasilis Gavrielatos](https://github.com/vasigavr1)\n- [Olympus](https://github.com/sadraskol/olympus) - in Rust by [Thomas Bracher](https://twitter.com/sadraskol)\n\n\n## Contact\n Antonios Katsarakis: <a href=\"http://antonis.io/\" title=\"Personal webpage\" target=\"_blank\">`antonis.io`</a> |  [`antoniskatsarakis@yahoo.com`](mailto:antoniskatsarakis@yahoo.com?subject=[GitHub]%20Zeus%20Specification \"Email\")\n"
  },
  {
    "path": "bin/copy-exec-files.sh",
    "content": "#!/usr/bin/env bash\n\nFILES_TO_CPY=(\n        \"hosts.sh\"\n        \"run.sh\"\n        \"run-hermesKV.sh\"\n        \"hermesKV\"\n        \"run-rCRAQ.sh\"\n        \"rCRAQ\"\n#        \"hades\"\n#        \"run-hades.sh\"\n      )\n\nEXEC_FOLDER=\"${HOME}/hermes/exec\"\n\ncd $EXEC_FOLDER\n# get Hosts\nsource ../exec/hosts.sh\nmake clean; make\ncd -\n\nfor FILE in \"${FILES_TO_CPY[@]}\"\ndo\n\tparallel scp ${EXEC_FOLDER}/${FILE} {}:${EXEC_FOLDER}/${FILE} ::: $(echo ${REMOTE_HOSTS[@]})\n\techo \"${FILE} copied to {${REMOTE_HOSTS[@]}}\"\ndone\n\n"
  },
  {
    "path": "bin/copy-n-exec-hermesKV.sh",
    "content": "#!/usr/bin/env bash\n\n### Runs to make\n#declare -a write_ratios=(0 10 50 200 500 1000)\ndeclare -a write_ratios=(1000)\ndeclare -a rmw_ratios=(0)\n#declare -a num_workers=(5 10 15 20 25 30 36)\ndeclare -a num_workers=(1)\n#declare -a batch_sizes=(25 50 75 100 125 150 200 250)\ndeclare -a batch_sizes=(50)\ndeclare -a credits=(50)\n#declare -a coalesce=(1 5 10 15)\ndeclare -a coalesce=(15)\n#declare -a num_machines=(2 3 5 7)\ndeclare -a num_machines=(5)\n\n# Set LAT_WORKER to -1 to disable latency measurement or to worker id (i.e., from 0 up to [num-worker - 1])\nLAT_WORKER=\"-1\"\n#LAT_WORKER=\"0\"\n\nEXEC_FOLDER=\"${HOME}/hermes/exec\"\n\nREMOTE_COMMAND=\"cd ${EXEC_FOLDER}; bash run-hermesKV.sh\"\n\nPASS=\"${1}\"\nif [ -z \"$PASS\" ]\nthen\n      echo \"\\$PASS is empty! --> sudo pass for remotes is expected to be the first arg\"\n      exit;\nfi\n\necho \"\\$PASS is OK!\"\ncd ${EXEC_FOLDER}\n\n# get Hosts\nsource ./hosts.sh\n\n../bin/copy-exec-files.sh\n\n      # Execute locally and remotely\nfor M in \"${num_machines[@]}\"; do\n    for RMW in \"${rmw_ratios[@]}\"; do\n      for WR in \"${write_ratios[@]}\"; do\n        for W in \"${num_workers[@]}\"; do\n          for BA in \"${batch_sizes[@]}\"; do\n            for CRD in \"${credits[@]}\"; do\n              for COAL in \"${coalesce[@]}\"; do\n                 args=\" -M ${M} -R ${RMW} -w ${WR} -W ${W} -b ${BA} -c ${CRD} -C ${COAL} -l ${LAT_WORKER}\"\n                 echo ${PASS} | ./run-hermesKV.sh ${args} &\n                 sleep 2 # give some leeway so that manager starts before executing the members\n\t             parallel \"echo ${PASS} | ssh -tt {} $'${REMOTE_COMMAND} ${args}'\" ::: $(echo ${REMOTE_HOSTS[@]}) >/dev/null\n\t          done\n\t        done\n\t      done\n\t    done\n\t  done\n\tdone\ndone\n\ncd - >/dev/null\n\n../bin/get-system-xput-files.sh\n"
  },
  {
    "path": "bin/copy-n-exec-rCRAQ.sh",
    "content": "#!/usr/bin/env bash\n\nUSE_SAME_BATCH_N_CREDITS=0\n\n### Runs to make\ndeclare -a write_ratios=(1000)\n#declare -a num_workers=(5 10 15 20 25 30 36)\ndeclare -a num_workers=(1)\n#declare -a batch_sizes=(25 50 75 100 125 150 200 250)\ndeclare -a batch_sizes=(50)\ndeclare -a credits=(15) # WARNING credits for CR must be divided by the num_machines (i.e., credits % num_machines == 0)\n#declare -a coalesce=(1 5 10 15)\ndeclare -a coalesce=(10)\n#declare -a num_machines=(2 3 5 7)\ndeclare -a num_machines=(3)\n\n# Set LAT_WORKER to -1 to disable latency measurement or to worker id (i.e., from 0 up to [num-worker - 1])\nLAT_WORKER=\"-1\"\n#LAT_WORKER=\"0\"\n\n#LOCAL_HOST=`hostname`\nEXEC_FOLDER=\"${HOME}/hermes/exec\"\nREMOTE_COMMAND=\"cd ${EXEC_FOLDER}; bash run-rCRAQ.sh\"\n\nPASS=\"${1}\"\nif [ -z \"$PASS\" ]\nthen\n      echo \"\\$PASS is empty! --> sudo pass for remotes is expected to be the first arg\"\n      exit;\nfi\n\necho \"\\$PASS is OK!\"\ncd ${EXEC_FOLDER}\n\n# get Hosts\nsource ./hosts.sh\n\n../bin/copy-exec-files.sh\n\nif [ ${USE_SAME_BATCH_N_CREDITS} -eq 0 ]\nthen\n   for M in \"${num_machines[@]}\"; do\n       # Execute locally and remotely\n       for WR in \"${write_ratios[@]}\"; do\n        for W in \"${num_workers[@]}\"; do\n          for BA in \"${batch_sizes[@]}\"; do\n            for CRD in \"${credits[@]}\"; do\n              for COAL in \"${coalesce[@]}\"; do\n                 args=\" -M ${M} -w ${WR} -W ${W} -b ${BA} -c ${CRD} -C ${COAL} -l ${LAT_WORKER}\"\n                 echo ${PASS} | ./run-rCRAQ.sh ${args} &\n                 sleep 2\n\t             parallel \"echo ${PASS} | ssh -tt {} $'${REMOTE_COMMAND} ${args}'\" ::: $(echo ${REMOTE_HOSTS[@]}) >/dev/null\n\t          done\n\t        done\n\t      done\n\t    done\n\t   done\n   done\n\nelse\n       # Execute locally and remotely\n   for M in \"${num_machines[@]}\"; do\n       for WR in \"${write_ratios[@]}\"; do\n        for W in \"${num_workers[@]}\"; do\n          for 
BA in \"${batch_sizes[@]}\"; do\n              for COAL in \"${coalesce[@]}\"; do\n                 args=\" -M ${M} -w ${WR} -W ${W} -b ${BA} -c ${BA} -C ${COAL} -l ${LAT_WORKER}\"\n                 echo ${PASS} | ./run-rCRAQ.sh ${args} &\n                 sleep 2\n\t             parallel \"echo ${PASS} | ssh -tt {} $'${REMOTE_COMMAND} ${args}'\" ::: $(echo ${REMOTE_HOSTS[@]}) >/dev/null\n\t          done\n\t        done\n\t      done\n\t    done\n   done\nfi\n\ncd - >/dev/null\n\n../bin/get-system-xput-files.sh\n"
  },
  {
    "path": "bin/copy-traces.sh",
    "content": "#!/usr/bin/env bash\n\n# Copy (per-thread splitted) trace folder\nFOLDERS_TO_CPY=( \"traces/current-splitted-traces\" )\nHOME_FOLDER=\"${HOME}/hermes\"\n\ncd ${HOME_FOLDER} >/dev/null\n# get Hosts\nsource ./exec/hosts.sh\ncd - >/dev/null\n\nfor FOLDER in \"${FOLDERS_TO_CPY[@]}\"\ndo\n\tparallel scp -r ${HOME_FOLDER}/${FOLDER} {}:${HOME_FOLDER}/${FOLDER} ::: $(echo ${REMOTE_HOSTS[@]})\n\techo \"${FOLDER} copied to {${REMOTE_HOSTS[@]}}\"\ndone\n"
  },
  {
    "path": "bin/csv_latency_parser.py",
    "content": "#!/usr/bin/python\n\nimport sys, os, ntpath, getopt\n\n\"\"\"\n========\nParser for aggregated over time results\n========\n\"\"\"\nclass LatencyParser:\n    def __init__(self):\n        self.latency_values = []\n        self.reads = []\n        self.max_read_latency = 0\n        self.max_write_latency = 0\n        self.writes = []\n        self.all_reqs = []\n        self.parseInputStats()\n        self.printAllStats()\n       # self.printStats(all_reqs)\n\n    def printStats(self, array, max_latency):\n        self.avgLatency(array)\n        #self.percentileLatency(array, 20)\n        self.percentileLatency(array, 50)\n        self.percentileLatency(array, 90)\n        self.percentileLatency(array, 95)\n        self.percentileLatency(array, 99)\n        #self.percentileLatency(array, 99.9)\n        #self.percentileLatency(array, 99.99)\n        #self.percentileLatency(array, 99.999)\n        #self.percentileLatency(array, 99.9999)\n        #self.percentileLatency(array, 100)\n        print \"Max Latency: \", max_latency, \"us\"\n\n    def printAllStats(self):\n        print \"~~~~~~ Write Stats ~~~~~~~\"\n        self.printStats(self.writes, self.max_write_latency)\n        print \"\\n~~~~~~ Read Stats ~~~~~~~~\"\n        self.printStats(self.reads, self.max_read_latency)\n        print \"\\n~~~~~~ Overall Stats ~~~~~~~~~\"\n        self.printStats(self.all_reqs, max(self.max_read_latency, self.max_write_latency))\n\n\n    def avgLatency(self, array):\n        cummulative = 0 \n        total_reqs = 0 \n        for x in xrange(len(self.latency_values)):\n            cummulative = self.latency_values[x] * array[x] + cummulative \n            total_reqs += array[x]\n        if total_reqs > 0:\n            print \"Reqs measured: \", total_reqs, \"| Avg Latency: \", cummulative / total_reqs\n        else:\n            print \"No reqs measured\"\n\n    def percentileLatency(self, array, percentage):\n        total_reqs = 0\n        sum_reqs = 0\n        
for x in xrange(len(self.latency_values)):\n            #cummulative = self.latency_values[x] * array[x] + cummulative \n            total_reqs += array[x]\n        if total_reqs > 0:\n            if percentage == 100:\n                for x in reversed(xrange(len(self.latency_values))):\n                    if array[x] > 0:\n                        if self.latency_values[x] == -1:\n                            print percentage, \"%: >\", self.latency_values[x-1], \"us\"\n                        else:\n                            print percentage, \"%: \", self.latency_values[x], \"us\"\n                    return\n            else:\n                for x in xrange(len(self.latency_values)):\n                    sum_reqs += array[x]\n                    if ((100.0 * sum_reqs) / total_reqs) >= percentage:\n                        if self.latency_values[x] == -1:\n                            print percentage, \"%: >\", self.latency_values[x-1], \"us\"\n                        else:\n                            print percentage, \"% : \", self.latency_values[x], \"us\"\n                        return\n        else:\n            print \"No reqs measured\"\n\n    def parseInputStats(self):\n        lr_lines = 0\n        for line in sys.stdin:                  # input from standard input\n            if line[0] == '#':\n                continue\n            (command, words) = line.strip().split(\":\",1)\n            command = command.strip()\n            if command == 'reads':\n                words = words.strip().split(\",\")\n                #if int(words[0].strip()) != -1:\n                self.latency_values.append(int(words[0].strip()))\n                self.reads.append(int(words[1].strip()))\n                self.all_reqs.append(int(words[1].strip()))\n            elif command == 'writes':\n                words = words.strip().split(\",\")\n                self.writes.append(int(words[1].strip()))\n                self.all_reqs[lr_lines] = self.all_reqs[lr_lines] 
+ self.writes[-1]\n                lr_lines = lr_lines + 1\n            elif command == 'reads-hl':\n                words = words.strip().split(\",\")\n                self.max_read_latency = int(words[0].strip())\n            elif command == 'writes-hl':\n                words = words.strip().split(\",\")\n                self.max_write_latency = int(words[0].strip())\n\nif __name__ == '__main__':\n    LatencyParser()\n"
  },
  {
    "path": "bin/exec-derecho.sh",
    "content": "#!/usr/bin/env bash\nHOSTS=( ##### network  cluster #####\n         \"houston\"\n         \"sanantonio\"\n         \"austin\"\n         \"indianapolis\"\n         \"philly\"\n#         \"atlanta\"\n         ##### compute cluster #####\n#         \"baltimore\"\n#         \"chicago\"\n#         \"detroit\"\n        )\n\nNUM_NODES=5\nNUM_SENDERS=0 #0 - all senders, 1 - half senders, 2 - one sender\nREQS_PER_SENDER=10000000\n\n### Runs to make\n#declare -a delivery_mode=(0 1) #0 - ordered mode, 1 - unordered mode\n#declare -a object_size=(40 1024)\n#declare -a window_size=(128 256)\ndeclare -a delivery_mode=(0) #0 - ordered mode, 1 - unordered mode\ndeclare -a object_size=(256 1024)\ndeclare -a window_size=(128 256)\ndeclare -a iterations=(1 2 3 4) #(1 2 3) for 3 iterations\n\nif [[ $NUM_NODES -ne ${#HOSTS[@]} ]] ; then\n    echo \"Num_nodes($NUM_NODES) !=  #Hosts(${#HOSTS[@]})\"\n    exit 1\nfi\n\nLOCAL_HOST=`hostname`\nHOME_FOLDER=\"${HOME}/derecho-unified/Release/applications/tests/performance_tests/\"\n#pin derecho threads to cores (w/o using hyperthreads) of numa node 0\nCOMMAND_NO_ARGS=\"taskset -c 0,2,4,6,8,10,12,14,16,18 ./bandwidth_test \"\n\ntotal_iters=0\ncd ${HOME_FOLDER} >/dev/null\n# Execute locally and remotely\nfor del_mode in \"${delivery_mode[@]}\"; do\n  for obj_size in \"${object_size[@]}\"; do\n    for win_size in \"${window_size[@]}\"; do\n        for iter in \"${iterations[@]}\"; do\n\t        total_iters=$((total_iters + 1))\n\n            args=\"--DERECHO/max_payload_size=${obj_size} --DERECHO/window_size=${win_size} -- ${NUM_NODES} ${NUM_SENDERS} ${REQS_PER_SENDER} ${del_mode}\"\n            COMMAND=\" ${COMMAND_NO_ARGS} ${args}\"\n\n            echo \"Running Derecho with: delivery_mode:${del_mode} obj size: $obj_size, window_size: $win_size nodes: $NUM_NODES \"\n            ${COMMAND} >/dev/null &\n            sleep 1\n\n\t        parallel \"ssh -tt {} $'cd ${HOME_FOLDER}; ${COMMAND}'\" ::: $(echo ${HOSTS[@]/$LOCAL_HOST}) 
>/dev/null\n\t        sleep 9 # give local node some leeway to log the results into a file\n        done\n    done\n  done\ndone\ntail -${total_iters} data_derecho_bw\ncd - >/dev/null\n"
  },
  {
    "path": "bin/format.sh",
    "content": "#!/bin/bash\n\nSCRIPT_DIR=\"$(dirname \"$0\")\"\ncd \"${SCRIPT_DIR}\"\n\nFORMAT_FILES_IN_DIRECTORIES=\"../src/ ../include/\"\n\nclang-format --version > /dev/null || exit 1\n\nif [ \"$1\" = \"check\" ]; then # Check clang-format has been applied!\n  find ${FORMAT_FILES_IN_DIRECTORIES} \\\n    -regex '.*\\.\\(cpp\\|hpp\\|cc\\|cxx\\)' \\\n    -exec clang-format -style=file -output-replacements-xml -i {} \\; |\n    grep -c \"<replacement \" >/dev/null\n\n  if [ $? -ne 1 ]; then\n    echo \"Format check: Failed!\"\n    echo \" -- Files do not match clang-format. Run bin/format.sh before adding files to git!\"\n    exit 1\n  else\n    echo \"Format check: Passed!\"\n  fi\n\nelse # Apply clang-format to all files\n\n  find ${FORMAT_FILES_IN_DIRECTORIES} \\\n    -regex '.*\\.\\(c\\|h\\|cpp\\|hpp\\|cc\\|cxx\\)' \\\n    -exec clang-format -style=file -i {} \\;\nfi\n"
  },
  {
    "path": "bin/get-system-xput-files.sh",
    "content": "#!/usr/bin/env bash\n\nEXEC_FOLDER=\"${HOME}/hermes/exec\"\nRESULTS_FOLDER=\"${HOME}/hermes/exec/results\"\n\nRESULT_FOLDER=\"${RESULTS_FOLDER}/xput/per-node/\"\nRESULT_OUT_FOLDER=\"${RESULTS_FOLDER}/xput/per-node/\"\nRESULT_OUT_FOLDER_MERGE=\"${RESULTS_FOLDER}/xput/all-nodes/\"\n\ncd ${EXEC_FOLDER} >/dev/null\n# get Hosts\nsource ./hosts.sh\ncd - >/dev/null\n\n# Gather remote files\nparallel \"scp {}:${RESULT_FOLDER}* ${RESULT_OUT_FOLDER} \" ::: $(echo ${REMOTE_HOSTS[@]})\necho \"xPut result files copied from: {${REMOTE_HOSTS}}\"\n\n# group all files\nls ${RESULT_OUT_FOLDER} | awk -F '-' '!x[$2]++{print $1}' | while read -r line; do\n    # Create an intermediate file: print the 3rd line of every file with the same prefix into the same file\n    awk 'FNR==3 {print $0}' ${RESULT_OUT_FOLDER}/$line* > ${RESULT_OUT_FOLDER_MERGE}/$line-inter.txt\n          #   Sum up the xPut of the 3rd iteration from every node to create the final file\n    awk -F ':' '{sum += $2} END {print sum}' ${RESULT_OUT_FOLDER_MERGE}/$line-inter.txt > ${RESULT_OUT_FOLDER_MERGE}/$line.txt\n    rm -rf  ${RESULT_OUT_FOLDER_MERGE}/$line-inter.txt\ndone\n\necho \"System-wide xPut results produced in ${RESULT_OUT_FOLDER_MERGE} directory!\"\n"
  },
  {
    "path": "bin/setup.sh",
    "content": "#!/usr/bin/env bash\n# Exec this script in every cluster node after you have\n# installed the (Infiniband) Verbs drivers through Mellanox OFED:\n# 1. Download the MLNX_OFED (tested on --> MLNX_OFED_LINUX-4.4-2.0.7.0-ubuntu18.04-x86_64)\n#    https://www.mellanox.com/page/products_dyn?product_family=26\n# 2. tar -xvf the tar file\n# 3. install through --> sudo ./mlnxofedinstall\n\nif ! [ -x \"$(command -v ofed_info)\" ]; then\n    echo \"Error: mellanox ofed is not installed.\" >&2\n    echo \" Please install the (Infiniband) Verbs drivers through Mellanox OFED by:\"\n    echo \"  1. Download the MLNX_OFED (tested on --> MLNX_OFED_LINUX-4.4-2.0.7.0-ubuntu18.04-x86_64)\"\n    echo \"     https://www.mellanox.com/page/products_dyn?product_family=26\"\n    echo \"  2. tar -xvf the tar file\"\n    echo \"  3. install through --> sudo ./mlnxofedinstall\"\n    exit 1\nelse\n    MLNX_OFED_VERSION=`ofed_info | head -1`\n    echo \"Running OFED driver version: ${MLNX_OFED_VERSION}\" >&2\nfi\n\n# Install required Libraries (memcached is used to setup RDMA connection and numa for mbind)\nsudo apt --yes install libmemcached-dev libnuma-dev memcached\n\n# start a subnet manager\nsudo /etc/init.d/opensmd start # there must be at least one subnet-manager in an infiniband subnet cluster\n# start the driver\nsudo /etc/init.d/openibd start\n\n# Configure (2MB) huge-pages for the KVS\n# Note that such a huge page allocation is not permanent and must be re-applied after a node reboot.\n#echo 8192 | sudo tee /sys/devices/system/node/node*/hugepages/hugepages-2048kB/nr_hugepages\necho 4096 | sudo tee /sys/devices/system/node/node*/hugepages/hugepages-2048kB/nr_hugepages\necho 10000000001 | sudo tee /proc/sys/kernel/shmmax\necho 10000000001 | sudo tee /proc/sys/kernel/shmall\n"
  },
  {
    "path": "bin/trace-spliter.sh",
    "content": "#!/usr/bin/env bash\n\nINPUT_DIR=\"${HOME}/hermes/traces/system-traces/\"\nINPUT_FILENAME=\"simple_trace_w_100000000_k_1000000_a_0.99.txt\"\nOUTPUT_DIR=\"${HOME}/hermes/traces/current-splited-traces/\"\nOUTPUT_PREFIX=\"t_\"\nOUTPUT_SUFFIX=\"_a_0.99.txt\"\n\nMAX_NUM_NODES=10\nMAX_THREADS_PER_NODE=40\n\n\nCHUNKS=$(expr ${MAX_NUM_NODES} \\* ${MAX_THREADS_PER_NODE})\nLINES=$(wc -l ${INPUT_DIR}/${INPUT_FILENAME} | cut -d ' ' -f1)\n\necho \"Splitting trace with $LINES lines into $CHUNKS (per-thread) chunks ...\"\n\nsplit -l  $(expr ${LINES} / ${CHUNKS}) \\\n      -a 4 -d \\\n      --additional-suffix=${OUTPUT_SUFFIX} \\\n      ${INPUT_DIR}/${INPUT_FILENAME} \\\n      ${OUTPUT_DIR}/${OUTPUT_PREFIX}\n"
  },
  {
    "path": "exec/Makefile",
    "content": "CPPFLAGS  := -O3 #-Wno-unused-result -Wall -Werror\nLD      := gcc -O3 -flto\nLDFLAGS := ${LDFLAGS} -libverbs -lrt -lpthread -lmemcached -lnuma # -lrdmacm --> TODO we do not use hw multicast because it helps only on master-based patterns\nCFLAGS   =  -I../include/mica-herd -I../include/hermes -I../include/wings -I../include/hades\nAPPS    := hermesKV rCRAQ\nPROF    := -g -fno-omit-frame-pointer\n\nall: ${APPS} clean-o\n\nhermesKV: ../src/wings/wings.o ../src/hades/hades.o \\\n          ../src/mica-herd/herd.o ../src/mica-herd/mica.o ../src/mica-herd/city.o \\\n          ../src/hermes/main.o ../src/hermes/hermes_worker.o ../src/hermes/util.o \\\n          ../src/hermes/stats.o ../src/hermes/spacetime.o ../src/hermes/hermesKV.o\n\t${LD} -o $@ $^ ${LDFLAGS}\n\n\nrCRAQ: ../src/mica-herd/herd.o ../src/mica-herd/mica.o \\\n       ../src/mica-herd/city.o ../src/hermes/main.o ../src/CR/cr_worker.o ../src/CR/crKV.o \\\n       ../src/hermes/spacetime.o ../src/hermes/util.o ../src/hermes/stats.o  ../src/wings/wings.o\n\t${LD} -o $@ $^ ${LDFLAGS}\n\n\nhades-exec: ../src/hades/hades.o ../src/hades/test.o ../src/wings/wings.o ../src/mica-herd/herd.o\n\t${LD} -o hades $^ ${LDFLAGS}\n\nhades: hades-exec clean-o\n\nPHONY: clean\nclean:\n\t@rm -f ../src/hermes/*.o ../src/mica-herd/*.o ../src/wings/*.o \\\n\t      ../src/CR/*.o ../src/hades/*.o ${APPS} hades\n\nclean-o:\n\t@rm -f ../src/hermes/*.o ../src/mica-herd/*.o ../src/wings/*.o \\\n\t      ../src/CR/*.o ../src/hades/*.o"
  },
  {
    "path": "exec/hosts.sh",
    "content": "#!/usr/bin/env bash\n\n\nALL_IPS=(\n### TO BE FILLED: Please provide all cluster IPs\n    # Node w/ first IP (i.e., \"manager\") must run script before the rest of the nodes\n    # (instantiates a memcached to setup RDMA connections)\n    #\n        10.0.3.1\n        10.0.3.2\n        10.0.3.3\n        10.0.3.4\n        10.0.3.5\n        )\n\n### TO BE FILLED: Modify to get the local IP of the node running the script (must be one of the cluster nodes)\nLOCAL_IP=$(ip addr | grep 'state UP' -A2 | grep 'inet 10.0.3'| awk '{print $2}' | cut -f1  -d'/')\n#LOCAL_IP=\"129.215.164.2\"\n\n### Fill the RDMA device name (the \"hca_id\" of the device when executing ibv_devinfo)\n#NET_DEVICE_NAME=\"mlx5_0\"\nNET_DEVICE_NAME=\"mlx4_0\"\n\n##########################################\n### NO NEED TO CHANGE BELOW THIS POINT ###\n##########################################\n\nREMOTE_IPS=${ALL_IPS[@]/$LOCAL_IP}\nREMOTE_HOSTS=${ALL_IPS[@]/$LOCAL_IP}\n\nNODE_ID=-1\n\nfor i in \"${!ALL_IPS[@]}\"; do\n\tif [  \"${ALL_IPS[i]}\" ==  \"$LOCAL_IP\" ]; then\n\t\tNODE_ID=$i\n\tfi\ndone\n\n\nif [[ ${NODE_ID} == -1 ]]; then\n    echo \"Error Local IP: ${LOCAL_IP} n is not in ALL_IPS:\"\n    echo \"    {${ALL_IPS[@]}}\"\n    exit\nfi\n\necho \"Local node id:\" ${NODE_ID}\n"
  },
  {
    "path": "exec/results/latency/.gitinclude",
    "content": ""
  },
  {
    "path": "exec/results/xput/all-nodes/.gitkeep",
    "content": ""
  },
  {
    "path": "exec/results/xput/per-node/.gitkeep",
    "content": ""
  },
  {
    "path": "exec/run-hades.sh",
    "content": "#!/usr/bin/env bash\n\nsource run.sh\n\nblue \"Running hades\"\n\nsudo LD_LIBRARY_PATH=/usr/local/lib/ -E \\\n\t./hades                             \\\n\t--machine-id ${NODE_ID}             \\\n\t--dev-name ${NET_DEVICE_NAME}       \\\n\t2>&1\n"
  },
  {
    "path": "exec/run-hermesKV.sh",
    "content": "#!/usr/bin/env bash\n\nsource run.sh\n\n#### Get CLI arguments\n# Use -1 for the default (#define in config.h) values if not argument is passed\nCREDITS=\"-1\"\nNUM_WORKERS=\"-1\"\nWRITE_RATIO=\"-1\"\nMAX_COALESCE=\"-1\"\nMAX_BATCH_SIZE=\"-1\"\nRMW_RATIO=\"-1\"\nNUM_MACHINES=\"-1\"\nLAT_WORKER=\"-1\"\n\n# Each letter is an option argument, if it's followed by a collum\n# it requires an argument. The first colum indicates the '\\?'\n# help/error command when no arguments are given\nwhile getopts \":W:w:l:R:C:c:b:M:h\" opt; do\n  case $opt in\n     W)\n       NUM_WORKERS=$OPTARG # Number of threads: this must be smaller than MAX_WORKERS_PER_MACHINE of config.h\n       ;;\n     w)\n       WRITE_RATIO=$OPTARG # given number is divided by 10 to give write rate % (i.e., 55 means 5.5 % writes)\n       ;;\n     R)\n       RMW_RATIO=$OPTARG # percentage of writes to be rmws (i.e., -w 500 -R 500 means 25 % of RMWs and 25% of writes)\n                         # RMW is disabled by default (no usage through the artifact) can be enabled through config.h)\n       ;;\n     C)\n       MAX_COALESCE=$OPTARG # maximum number of readily-available messages to be \"batched\" in a network packet\n                            # must be smaller than MTU and it is capped by MAX_REQ_COALESCE in config.h\n       ;;\n     c)\n       CREDITS=$OPTARG      # maximum number of credits per node per thread; credits correspond to messages and not packets\n                            # it is capped by MAX_CREDITS_PER_REMOTE_WORKER in config.h\n       ;;\n     b)\n       MAX_BATCH_SIZE=$OPTARG   # amount of requests and protocol messages that can be batched to the KVS\n                                # it is capped by MAX_BATCH_KVS_OPS_SIZE in config.h\n       ;;\n     M)\n       NUM_MACHINES=$OPTARG # it is capped by MAX_MACHINE_NUM in config.h and the number of IPS as indicated in hosts.sh\n       ;;\n     l)\n       LAT_WORKER=$OPTARG # An id of the worker who is measuring the 
latency\n                          # if -1 Latency is disabled\n                          # otherwise it is capped by running worker threads (NUM_WORKERS-1)\n       ;;\n     h)\n      echo \"Usage: -W <# workers> -w <write ratio>  (x1000 --> 10 for 1%)\"\n      echo \"       -c <# credits> -b <max batch size> -C <max coalescing>\"\n      echo \"       -M <# nodes>   -l <latency worker> -R <rmw ratio>\"\n      exit 1\n      ;;\n    \\?)\n      echo \"Invalid option: -$OPTARG use -h to get info for arguments\" >&2\n      exit 1\n      ;;\n    :)\n      echo \"Option -$OPTARG requires an argument.\" >&2\n      exit 1\n      ;;\n  esac\ndone\n\n\nblue \"Running hermes threads\"\nsudo LD_LIBRARY_PATH=/usr/local/lib/ -E \\\n    ./hermesKV                          \\\n\t--machine-id ${NODE_ID}             \\\n\t--is-roce 0                         \\\n\t--dev-name ${NET_DEVICE_NAME}       \\\n\t--num-machines ${NUM_MACHINES}      \\\n\t--num-workers  ${NUM_WORKERS}       \\\n\t--lat-worker   ${LAT_WORKER}        \\\n\t--rmw-ratio    ${RMW_RATIO}         \\\n\t--write-ratio  ${WRITE_RATIO}       \\\n\t--credits      ${CREDITS}           \\\n\t--max-coalesce ${MAX_COALESCE}      \\\n\t--max-batch-size ${MAX_BATCH_SIZE}  \\\n\t--hermes                            \\\n\t2>&1\n"
  },
  {
    "path": "exec/run-rCRAQ.sh",
    "content": "#!/usr/bin/env bash\n\nsource run.sh\n\n\n#### Get CLI arguments\n# Use -1 for the default (#define in config.h) values if no argument is passed\nCREDITS=\"-1\"\nNUM_WORKERS=\"-1\"\nWRITE_RATIO=\"-1\"\nMAX_COALESCE=\"-1\"\nMAX_BATCH_SIZE=\"-1\"\nRMW_RATIO=\"-1\"\nNUM_MACHINES=\"-1\"\nLAT_WORKER=\"-1\"\n\n# Each letter is an option argument, if it's followed by a colon\n# it requires an argument. The first colon indicates the '\\?'\n# help/error command when no arguments are given\nwhile getopts \":W:w:C:c:b:M:l:h\" opt; do\n  case $opt in\n     W)\n       NUM_WORKERS=$OPTARG\n       ;;\n     w)\n       WRITE_RATIO=$OPTARG\n       ;;\n     C)\n       MAX_COALESCE=$OPTARG\n       ;;\n     c)\n       CREDITS=$OPTARG\n       ;;\n     b)\n       MAX_BATCH_SIZE=$OPTARG\n       ;;\n     M)\n       NUM_MACHINES=$OPTARG\n       ;;\n     l)\n       LAT_WORKER=$OPTARG\n       ;;\n     h)\n      echo \"Usage: -W <# workers> -w <write ratio>  (x1000 --> 10 for 1%)\"\n      echo \"       -c <# credits> -b <max batch size> -C <max coalescing>\"\n      echo \"       -M <# nodes>   -l <latency worker> \"\n      exit 1\n      ;;\n    \\?)\n      echo \"Invalid option: -$OPTARG use -h to get info for arguments\" >&2\n      exit 1\n      ;;\n    :)\n      echo \"Option -$OPTARG requires an argument.\" >&2\n      exit 1\n      ;;\n  esac\ndone\n\nblue \"Running hermes threads\"\n\nsudo LD_LIBRARY_PATH=/usr/local/lib/ -E \\\n\t./rCRAQ                             \\\n\t--machine-id ${NODE_ID}             \\\n\t--is-roce 0                         \\\n\t--dev-name ${NET_DEVICE_NAME}       \\\n\t--num-machines ${NUM_MACHINES}      \\\n\t--num-workers  ${NUM_WORKERS}       \\\n\t--lat-worker   ${LAT_WORKER}        \\\n\t--rmw-ratio    ${RMW_RATIO}         \\\n\t--write-ratio  ${WRITE_RATIO}       \\\n\t--credits      ${CREDITS}           \\\n\t--max-coalesce ${MAX_COALESCE}      \\\n\t--max-batch-size ${MAX_BATCH_SIZE}  \\\n\t2>&1\n"
  },
  {
    "path": "exec/run.sh",
    "content": "#!/usr/bin/env bash\n\nsource ./hosts.sh\n\nexport HRD_REGISTRY_IP=\"${ALL_IPS[0]}\" # I.E. first IP node (HOUSTON) has a memcached server (used to initialize RDMA QPs)\nexport MLX5_SINGLE_THREADED=1\nexport MLX5_SCATTER_TO_CQE=1\n\nsudo killall memcached\nsudo killall hades\nsudo killall rCRAQ\nsudo killall hermesKV\n\n# A function to echo in blue color\nfunction blue() {\n\tes=`tput setaf 4`\n\tee=`tput sgr0`\n\techo \"${es}$1${ee}\"\n}\n\n\n#### free the pages workers use\nblue \"Removing SHM keys used by HermesKV/rCRAQ\"\nfor i in `seq 0 28`; do\n\tkey=`expr 3185 + $i`\n\tsudo ipcrm -M $key 2>/dev/null\n\tkey=`expr 4185 + $i`\n\tsudo ipcrm -M $key 2>/dev/null\ndone\n: ${HRD_REGISTRY_IP:?\"Need to set HRD_REGISTRY_IP non-empty\"}\n\n\nblue \"Reset server QP registry\"\nmemcached -l ${HRD_REGISTRY_IP} 1>/dev/null 2>/dev/null &\nsleep 1\n"
  },
  {
    "path": "include/hades/hades.h",
    "content": "//\n// Created by akatsarakis on 17/01/19.\n//\n\n#ifndef HADES_H\n#define HADES_H\n\n#include \"../../include/wings/wings.h\"\n#include \"../utils/bit_vector.h\"\n#include \"../utils/time_rdtsc.h\"\n// Send heartbeats\n// Recv heartbeats\n// Change View\n// Update local membership\n\n// (Ostracism)\n// arbitration --> a node provides an obolus\n\n// all nodes are able to communicate w/ each other\n\n// fd provides a view as a membership change\n// only as long as it differs with the current view\n// and agrees with a majority of other node views.\n\n// The update granularity of local view works as a lease\n// to membership changes which prevents sequentially\n// consistent reads in the presence of network partitions\n//       I.E. a node in a minority partition is able to detect\n//       that cannot reach the majority of nodes and stops serving\n//       local reads, maintaining linearizability (instead of sequential\n//       consistency) For this\n\n// Epochs\n\n// Guarantees Nodes in the same EPOCH id have the same group view\n\n#define ENABLE_ARBITRATION 1\n\n// Hades debug Tests\n#define FAKE_LINK_FAILURE 0\n#define FAKE_LINK_FAILURE_AFTER_SEC 15\n#define STOP_FAKE_LINK_FAILURE_AFTER_SEC 20\n#define FAKE_ONE_WAY_LINK_FAILURE 0\n#define FAKE_LINK_FAILURE_NODE_A 2\n#define FAKE_LINK_FAILURE_NODE_B 1\nstatic_assert(FAKE_LINK_FAILURE_NODE_A != FAKE_LINK_FAILURE_NODE_B, \"\");\n\ntypedef struct {\n  uint8_t node_id : 8;\n  uint8_t epoch_id : 8;\n  uint8_t same_w_local_membership : 1;\n  uint8_t have_ostracised_for_dst_node : 7;\n  bit_vector_t view;\n} __attribute__((packed)) hades_view_t;\nstatic_assert(sizeof(hades_view_t) <= 4,\n              \"Currently send using a 4B header only field (RDMA immediate)\");\n\ntypedef struct {\n  hades_view_t last_local_view;\n  hades_view_t intermediate_local_view;\n\n  bit_vector_t curr_g_membership;\n  uint8_t nodes_in_membership;\n\n  uint8_t max_num_nodes;\n  uint8_t* recved_views_flag;\n  hades_view_t* 
remote_recved_views;\n\n  // Polling\n  uint16_t max_views_to_poll;\n  hades_view_t* poll_buff;  // used for polling remote views\n\n  // Timing\n  uint32_t send_view_every_us;\n  uint32_t update_local_view_every_ms;\n  struct timespec* ts_last_send;  // issues views to remotes iff have not send a\n                                  // view within the predefined timeout\n  struct timespec\n      ts_last_view_change;  // update views and possible changes membership iff\n                            // pre-defined timeout is exceed\n\n  // Ostracism\n  uint8_t*\n      have_ostracized_for;  // an array storing info whether or not in a view\n                            // the sender ostracized someone for this node\n} hades_ctx_t;\n\ntypedef struct {\n  hades_ctx_t ctx;\n  ud_channel_t* hviews_c;\n  ud_channel_t* hviews_crd_c;\n} hades_wings_ctx_t;\n\nvoid* hades_full_thread(void* node_id);\nuint16_t poll_for_remote_views(hades_wings_ctx_t* hw_ctx);\nvoid update_view_and_issue_hbs(hades_wings_ctx_t* hw_ctx);\n\ninline static void\nhades_ctx_init(hades_ctx_t* ctx, uint8_t node_id, uint8_t max_nodes,\n               uint16_t max_views_to_poll, uint32_t send_view_us,\n               uint32_t update_local_view_ms)\n{\n  assert(max_views_to_poll > 0);\n\n  ctx->intermediate_local_view.epoch_id = 0;\n  ctx->intermediate_local_view.node_id = node_id;\n  ctx->nodes_in_membership = 1;\n  bv_init(&ctx->curr_g_membership);\n  bv_bit_set(&ctx->curr_g_membership, node_id);\n  bv_init(&ctx->intermediate_local_view.view);\n  bv_bit_set(&ctx->intermediate_local_view.view, node_id);\n  ctx->last_local_view = ctx->intermediate_local_view;\n\n  ctx->max_num_nodes = max_nodes;\n  ctx->recved_views_flag = malloc(sizeof(uint8_t) * max_nodes);\n  ctx->remote_recved_views = malloc(sizeof(hades_view_t) * max_nodes);\n  for (int i = 0; i < max_nodes; ++i) {\n    ctx->recved_views_flag[i] = 0;\n    bv_init(&ctx->remote_recved_views[i].view);\n  }\n\n  ctx->max_views_to_poll = max_views_to_poll;\n  
ctx->poll_buff = malloc(sizeof(hades_view_t) * max_views_to_poll);\n\n  // Setup timers\n  init_rdtsc(1, 0);  /// WARNING: this is not thread safe!!\n  get_rdtsc_timespec(&ctx->ts_last_view_change);\n  ctx->ts_last_send = malloc(sizeof(struct timespec) * max_nodes);\n  for (int i = 0; i < max_nodes; ++i)\n    get_rdtsc_timespec(&ctx->ts_last_send[i]);\n\n  ctx->send_view_every_us = send_view_us;\n  ctx->update_local_view_every_ms = update_local_view_ms;\n  assert(2 * 1000 * update_local_view_ms > send_view_us);\n\n  // Ostracism\n  ctx->have_ostracized_for = malloc(sizeof(uint8_t) * max_nodes);\n  for (int i = 0; i < max_nodes; ++i)\n    ctx->have_ostracized_for[i] = 0;\n}\n\n// WARNING: hades wings_ctx_init initializes only the first part of the\n// required channels wings_setup_channel_qps_and_recvs must be called by\n// the application afterwards to finish the initialization of wings.\ninline static void\nhades_wings_ctx_init(hades_wings_ctx_t* wctx, uint8_t node_id,\n                     uint8_t max_nodes, uint16_t max_views_to_poll,\n                     uint32_t send_view_us, uint32_t update_local_view_ms,\n                     ud_channel_t* hviews_c, ud_channel_t* hviews_crd_c,\n                     uint16_t worker_lid)\n{\n  hades_ctx_init(&wctx->ctx, node_id, max_nodes, max_views_to_poll,\n                 send_view_us, update_local_view_ms);\n\n  wctx->hviews_c = hviews_c;\n  wctx->hviews_crd_c = hviews_crd_c;\n\n  const uint8_t is_bcast = 0;\n  const uint8_t stats_on = 1;\n  const uint8_t prints_on = 1;\n  const uint8_t is_hdr_only = 1;\n  const uint8_t expl_crd_ctrl = 1;\n  const uint8_t enable_inlining = 1;\n  const uint8_t disable_crd_ctrl = 0;\n  const uint8_t credits =\n      (const uint8_t)(2 * update_local_view_ms * 1000 / send_view_us);\n\n  char qp_name[200];\n  sprintf(qp_name, \"%s%d\", \"\\033[1m\\033[32mHades\\033[0m\", worker_lid);\n\n  wings_ud_channel_init(\n      wctx->hviews_c, qp_name, REQ, 1, sizeof(hades_view_t) - sizeof(uint8_t),\n  
    0, enable_inlining, is_hdr_only, is_bcast, disable_crd_ctrl,\n      expl_crd_ctrl, wctx->hviews_crd_c, credits, max_nodes,\n      (uint8_t)machine_id, stats_on, prints_on);\n}\n\n// How does somebody joins?\n// epoch id 0\n// must see at least a majority of views with same epoch id > 0\n// || majority of views with epoch id 0\n#endif  // HADES_H\n"
  },
  {
    "path": "include/hermes/config.h",
    "content": "//\n// Created by akatsarakis on 15/03/18.\n//\n\n#ifndef SPACETIME_CONFIG_H\n#define SPACETIME_CONFIG_H\n#include <assert.h>\n#include <stdint.h>\n#include \"sizes.h\"\n\n// MAX_ defines are treated as DEFAULT_ as well (i.e., if not altered by CLI\n// args)\n\n/*-------------------------------------------------\n------------ SETUP & DEFAULT SETTINGS -------------\n--------------------------------------------------*/\n#define MAX_MACHINE_NUM 5           // maximum nodes\n#define MAX_WORKERS_PER_MACHINE 15  // maximum number of threads per node\n#define DEFAULT_WORKERS_PER_MACHINE 2\n#define DEFAULT_THREAD_OF_STAT_THREAD \\\n  (15)  // WARNING make sure this is not co-located with a worker thread\n\n// Number of sockets (numa nodes), cores and h/w threads per core on each node\n#define TOTAL_THREADS_PER_CORE 2\n#define TOTAL_CORES_PER_SOCKET 10\n#define TOTAL_NUMBER_OF_SOCKETS 2\n\n/*-------------------------------------------------\n-------------------------------------------------\n-------------------------------------------------\n-------- No need to change beyond this point ----\n-------------------------------------------------\n-------------------------------------------------\n--------------------------------------------------*/\n\n// Default workload writes / updates accesses (the rest are reads)\n#define DEFAULT_UPDATE_RATIO 1000  // is divided by 10 (i.e., 25 --> 2.5 %)\n// both writes and RMWs (RMW_RATIO inderectly provides WRITE_RATIO)\n\n#define ENABLE_RMWs \\\n  0  // if RMWs is not enabled then all UPDATE_RATIO == WRITE_RATIO\n#define DEFAULT_RMW_RATIO 0  // is divided by 10 (i.e., 25 --> 2.5 %)\n// percentage of UPDATE_RATIO to be RMWs\n\n// Max operations per-thread to batches to the KVS (either received packets or\n// read/write/RMW requests)\n#define MAX_BATCH_KVS_OPS_SIZE 250\nstatic_assert(MAX_WORKERS_PER_MACHINE <= 254, \"\");\nstatic_assert(MAX_WORKERS_PER_MACHINE <= TOTAL_NUMBER_OF_SOCKETS *\n                                   
          TOTAL_THREADS_PER_CORE *\n                                             TOTAL_CORES_PER_SOCKET,\n              \"\");\nstatic_assert(DEFAULT_UPDATE_RATIO <= 1000 && DEFAULT_RMW_RATIO >= 0, \"\");\n\n/*-------------------------------------------------\n----------------- RDMA SETTINGS -------------------\n--------------------------------------------------*/\n// Request coalescing (max --readily available-- messages to batch in a single\n// RDMA packet)\n#define MAX_REQ_COALESCE 15\n\n// Flow control\n#define MAX_CREDITS_PER_REMOTE_WORKER (MAX_REQ_COALESCE)\n\n// Request inlining\n#define DISABLE_INLINING 0\n\n/*-------------------------------------------------\n----------------- SECONDARY SETTINGS --------------\n--------------------------------------------------*/\n// LATENCY\n#define DEFAULT_MEASURE_LATENCY 0\n#define DEFAULT_WORKER_MEASURING_LATENCY 0\n#define MAX_LATENCY 1000  // in us\n#define LATENCY_BUCKETS 1000\n#define LATENCY_PRECISION \\\n  (MAX_LATENCY / LATENCY_BUCKETS)  // latency granularity in us\n\n// FAIRNESS\n#define ENABLE_VIRTUAL_NODE_IDS 0  // 0\n#define VIRTUAL_NODE_IDS_PER_NODE 20\n\n// SKEW\n#define ENABLE_COALESCE_OF_HOT_REQS \\\n  0  // 0 //WARNING!!! this must be disabled for cr\n#define COALESCE_N_HOTTEST_KEYS 100\n#define ENABLE_READ_COMPLETE_AFTER_VAL_RECV_OF_HOT_REQS 0  // 1\n#define ENABLE_WRITE_COALESCE_TO_THE_SAME_KEY_IN_SAME_NODE 0\n\n// DEBUG\n#define ENABLE_ASSERTIONS 0\n#define DISABLE_VALS_FOR_DEBUGGING 0\n#define KEY_NUM 0  // use 0 to disable\n\n// REQUESTS\n#define FEED_FROM_TRACE 0\n#define ZIPF_EXPONENT_OF_TRACE \\\n  99  // if FEED_FROM_TRACE == 1 | this is divided by 100 (e.g. 
use 99 for a =\n      // 0.99)\n#define NUM_OF_REP_REQS K_256  // if FEED_FROM_TRACE == 0\n#define USE_A_SINGLE_KEY 0     // if FEED_FROM_TRACE == 0\n#define ST_KEY_ID_255_OR_HIGHER 255\n\n/*-------------------------------------------------\n---------------- Debug and others -----------------\n--------------------------------------------------*/\n// DBG Prints\n/// Warning some prints assume that there are no faults (multiplications with\n/// REMOTE_MACHINES)\n#define MAX_THREADS_TO_PRINT 1\n#define ENABLE_REQ_PRINTS 0\n#define ENABLE_BATCH_OP_PRINTS 0\n#define ENABLE_INV_PRINTS 0\n#define ENABLE_ACK_PRINTS 0\n#define ENABLE_VAL_PRINTS 0\n\n// Stats prints\n#define PRINT_STATS_EVERY_MSECS 4000  // 5000 //10000 //10\n#define PRINT_WORKER_STATS 0\n\n// Stats\n#define EXIT_ON_STATS_PRINT 1\n#define PRINT_NUM_STATS_BEFORE_EXITING 5\n#define DUMP_XPUT_STATS_TO_FILE 1\n\n// FAILURE DETECTION (RM)\n#define ENABLE_HADES_FAILURE_DETECTION 0\n#define WORKER_WITH_FAILURE_DETECTOR 0\nstatic_assert(ENABLE_HADES_FAILURE_DETECTION == 0,\n              \"WARNING HADES is currently not working\");\n\n// FAKE NODE FAILURE\n#define FAKE_FAILURE 0\n#define NODE_TO_FAIL 2\n#define ROUNDS_BEFORE_FAILURE 2\n\n// Rarely (or never) change\n#define BASE_SHM_KEY 24\n#define WORKER_SL 0  // service level for the workers\n#define MAX_REMOTE_MACHINES (MAX_MACHINE_NUM - 1)\n#define HERMES_CEILING(x, y) (((x) + (y)-1) / (y))\n#define GROUP_MEMBERSHIP_ARRAY_SIZE \\\n  HERMES_CEILING(MAX_MACHINE_NUM, 8)  // assuming uint8_t\n#define TOTAL_HW_CORES \\\n  (TOTAL_THREADS_PER_CORE * TOTAL_CORES_PER_SOCKET * TOTAL_NUMBER_OF_SOCKETS)\nstatic_assert(MAX_WORKERS_PER_MACHINE < TOTAL_HW_CORES - 1,\n              \"Leave at least a hw thread free for OS etc..\");\n\n#define KV_SOCKET 0  // socket to allocate KVS (huge-)pages\n#define USE_ALL_SOCKETS 1\n#define ENABLE_HYPERTHREADING 1\n#define SOCKET_TO_START_SPAWNING_THREADS 0\n\n// Debug\n//#define SPACETIME_DEBUG 2\n#ifndef SPACETIME_DEBUG\n#define 
SPACETIME_DEBUG 0\n#endif\n\n////////////////////////////////\n/// Hermes NOT TUNABLE\n////////////////////////////////\n/*-------------------------------------------------\n----------------- MAX HERMES OPS SIZE -------------\n--------------------------------------------------*/\n#define MAX_MSG_RECV_OPS_SIZE \\\n  (MAX_CREDITS_PER_REMOTE_WORKER * MAX_REMOTE_MACHINES * MAX_REQ_COALESCE)\n#define HERMES_MAX_BATCH_SIZE MAX(MAX_MSG_RECV_OPS_SIZE, MAX_BATCH_KVS_OPS_SIZE)\n\n/*-------------------------------------------------\n---------------- QPs Numbers ----------------------\n--------------------------------------------------*/\ntypedef enum {\n  INV_UD_QP_ID = 0,\n  ACK_UD_QP_ID,\n  VAL_UD_QP_ID,\n  CRD_UD_QP_ID,\n  END_HERMES_QPS_ENUM\n} hermes_qps_enum;\n// QPs\n#define TOTAL_WORKER_UD_QPs END_HERMES_QPS_ENUM\n#define TOTAL_WORKER_N_FAILURE_DETECTION_UD_QPs \\\n  (TOTAL_WORKER_UD_QPs + (ENABLE_HADES_FAILURE_DETECTION ? 2 : 0))\n\n/*-------------------------------------------------\n----------------- CR CONFIGURATION ----------------\n--------------------------------------------------*/\n#define CR_ENABLE_REMOTE_READS 0\n#define CR_REMOTE_READS_CREDITS 20\n\n#define MAX_CREDITS_PER_REMOTE_WORKER_CR 250  //(MAX_BATCH_KVS_OPS_SIZE) // CR\n\n#define CR_ACK_CREDITS (255)  // //(MAX_MACHINE_NUM * 255)\n\n#define CR_ENABLE_EARLY_INV_CRDS \\\n  1  // optimization to increase request pipelining\n\ntypedef enum {\n  CR_INV_UD_QP_ID = 0,\n#ifdef CR_ENABLE_EARLY_INV_CRDS\n  CR_INV_CRD_UD_QP_ID,\n#endif\n  CR_ACK_UD_QP_ID,\n  CR_REMOTE_WRITES_UD_QP_ID,\n  CR_REMOTE_WRITE_CRD_UD_QP_ID,\n  CR_REMOTE_READS_UD_QP_ID,\n  CR_REMOTE_READS_RESP_UD_QP_ID\n} cr_qps_enum;\n\n#define CR_TOTAL_WORKER_UD_QPs                              \\\n  (TOTAL_WORKER_UD_QPs + (CR_ENABLE_REMOTE_READS ? 2 : 0) + \\\n   (CR_ENABLE_EARLY_INV_CRDS ? 
1 : 0))\n\n// Max CR batch op size\n#define MAX_MSG_RECV_OPS_SIZE_CR \\\n  (MAX_REQ_COALESCE * MAX_CREDITS_PER_REMOTE_WORKER_CR * MAX_REMOTE_MACHINES)\n#define CR_MAX_BATCH_SIZE MAX(MAX_MSG_RECV_OPS_SIZE_CR, MAX_BATCH_KVS_OPS_SIZE)\n\n// CR DEBUG\n#define CR_ENABLE_ONLY_HEAD_REQS 0\n#define CR_ENABLE_ALL_NODES_GETS_EXCEPT_HEAD 0\n#define CR_ENABLE_BLOCKING_INVALID_WRITES_ON_HEAD 0\n\n/*-------------------------------------------------\n----------------- Global Vars ---------------------\n--------------------------------------------------*/\n\nstruct thread_params {\n  int id;\n};\n\nstruct latency_counters {\n  uint32_t read_reqs[LATENCY_BUCKETS + 1];\n  uint32_t write_reqs[LATENCY_BUCKETS + 1];\n  int max_read_latency;\n  int max_write_latency;\n  long long total_measurements;\n};\n\nextern struct latency_counters latency_count;\n\n// global config (CLI) configurable vars\nextern uint8_t is_CR;\nextern int update_ratio;\nextern int rmw_ratio;\nextern int num_workers;\nextern int credits_num;\nextern int max_coalesce;\nextern int max_batch_size;  // for batches to KVS\n\nextern int machine_num;         // must be smaller or equal to MAX_MACHINE_NUM\nextern int remote_machine_num;  // must be smaller or equal to MAX_MACHINE_NUM\nextern int worker_measuring_latency;\n\n// extern int value_size; // must be smaller or equal to MAX_MACHINE_NUM\n\n#endif  // SPACETIME_CONFIG_H\n"
  },
  {
    "path": "include/hermes/inline-util.h",
    "content": "//\n// Created by akatsarakis on 23/05/18.\n//\n\n#ifndef HERMES_INLINE_UTIL_H\n#define HERMES_INLINE_UTIL_H\n\n#include <infiniband/verbs.h>\n#include \"../hades/hades.h\"\n#include \"../utils/concur_ctrl.h\"\n#include \"config.h\"\n#include \"spacetime.h\"\n#include \"util.h\"\n\n/* ---------------------------------------------------------------------------\n----------------------------------- MEMBERSHIP -------------------------------\n---------------------------------------------------------------------------*/\n\nstatic inline uint8_t\nnode_is_in_membership(spacetime_group_membership last_group_membership,\n                      int node_id)\n{\n  return (uint8_t)(bv_bit_get(last_group_membership.g_membership,\n                              (uint8_t)node_id) == 1\n                       ? 1\n                       : 0);\n}\n\nstatic inline void\ngroup_membership_update(hades_ctx_t hades_ctx)\n{\n  seqlock_lock(&group_membership.lock);\n\n  bv_copy((bit_vector_t*)&group_membership.g_membership,\n          hades_ctx.curr_g_membership);\n  bv_copy((bit_vector_t*)&group_membership.w_ack_init,\n          group_membership.g_membership);\n  bv_reverse((bit_vector_t*)&group_membership.w_ack_init);\n  bv_bit_set((bit_vector_t*)&group_membership.w_ack_init, (uint8_t)machine_id);\n\n  group_membership.num_of_alive_remotes =\n      bv_no_setted_bits(group_membership.g_membership);\n  seqlock_unlock(&group_membership.lock);\n\n  if (group_membership.num_of_alive_remotes < (machine_num / 2)) {\n    colored_printf(RED, \"Majority is down!\\n\");\n    exit(-1);\n  }\n}\n\nstatic inline uint8_t\ngroup_membership_has_changed(spacetime_group_membership* last_group_membership,\n                             uint16_t worker_lid)\n{\n  uint32_t debug_lock_free_membership_read_cntr = 0;\n  spacetime_group_membership lock_free_read_group_membership;\n\n  do {  // Lock free read of group membership\n    if (ENABLE_ASSERTIONS) {\n      
debug_lock_free_membership_read_cntr++;\n      if (debug_lock_free_membership_read_cntr == M_4) {\n        printf(\"Worker %u stuck on a lock-free read (for group membership)\\n\",\n               worker_lid);\n        debug_lock_free_membership_read_cntr = 0;\n      }\n    }\n    lock_free_read_group_membership =\n        *((spacetime_group_membership*)&group_membership);\n  } while (!(seqlock_version_is_same_and_valid(\n      &group_membership.lock, &lock_free_read_group_membership.lock)));\n  for (int i = 0; i < GROUP_MEMBERSHIP_ARRAY_SIZE; i++)\n    if (!bv_are_equal(lock_free_read_group_membership.g_membership,\n                      last_group_membership->g_membership)) {\n      *last_group_membership = lock_free_read_group_membership;\n      return 1;\n    }\n  return 0;\n}\n\n/* ---------------------------------------------------------------------------\n----------------------------------- LATENCY -------------------------------\n---------------------------------------------------------------------------*/\n// Add latency to histogram (in microseconds)\nstatic inline void\nbookkeep_latency(int useconds, uint8_t op)\n{\n  uint32_t* latency_array;\n  int* max_latency_ptr;\n  switch (op) {\n    case ST_OP_PUT:\n      latency_array = latency_count.write_reqs;\n      max_latency_ptr = &latency_count.max_write_latency;\n      break;\n    case ST_OP_GET:\n      latency_array = latency_count.read_reqs;\n      max_latency_ptr = &latency_count.max_read_latency;\n      break;\n    default:\n      assert(0);\n  }\n  latency_count.total_measurements++;\n  if (useconds > MAX_LATENCY)\n    latency_array[LATENCY_BUCKETS]++;\n  else\n    latency_array[useconds / LATENCY_PRECISION]++;\n\n  if (*max_latency_ptr < useconds) *max_latency_ptr = useconds;\n}\n\n// Necessary bookkeeping to initiate the latency measurement\nstatic inline void\nstart_latency_measurement(struct timespec* start)\n{\n  clock_gettime(CLOCK_MONOTONIC, start);\n}\n\nstatic inline 
void\nstop_latency_measurment(uint8_t req_opcode, struct timespec* start)\n{\n  struct timespec end;\n  clock_gettime(CLOCK_MONOTONIC, &end);\n  int useconds = (int)(((end.tv_sec - start->tv_sec) * 1000000) +\n                       ((end.tv_nsec - start->tv_nsec) / 1000));\n  if (ENABLE_ASSERTIONS) assert(useconds >= 0);\n  //\tprintf(\"Latency of %s %u us\\n\", code_to_str(req_opcode), useconds);\n  bookkeep_latency(useconds, req_opcode);\n}\n\nstatic inline void\nstop_latency_of_completed_writes(spacetime_op_t* ops, uint16_t worker_lid,\n                                 struct timespec* stopwatch)\n{\n  if (machine_id == 0 && worker_lid == worker_measuring_latency)\n    if (ops[0].op_meta.opcode == ST_OP_PUT &&\n        (ops[0].op_meta.state == ST_MISS ||\n         ops[0].op_meta.state == ST_PUT_COMPLETE))\n      stop_latency_measurment(ops[0].op_meta.opcode, stopwatch);\n}\n\nstatic inline void\nstop_latency_of_completed_reads(spacetime_op_t* ops, uint16_t worker_lid,\n                                struct timespec* stopwatch)\n{\n  if (machine_id == 0 && worker_lid == worker_measuring_latency)\n    if (ops[0].op_meta.opcode == ST_OP_GET &&\n        (ops[0].op_meta.state == ST_MISS ||\n         ops[0].op_meta.state == ST_GET_COMPLETE))\n      stop_latency_measurment(ops[0].op_meta.opcode, stopwatch);\n}\n\n/* ---------------------------------------------------------------------------\n---------------------------------- Refill Requests ---------------------------\n---------------------------------------------------------------------------*/\nstatic inline int\nrefill_ops(uint32_t* trace_iter, uint16_t worker_lid,\n           struct spacetime_trace_command* trace, spacetime_op_t* ops,\n           uint32_t* refilled_per_ops_debug_cnt, struct timespec* start,\n           spacetime_op_t** n_hottest_keys_in_ops_get,\n           spacetime_op_t** n_hottest_keys_in_ops_put)\n{\n  static uint8_t first_iter_has_passed[MAX_WORKERS_PER_MACHINE] = {0};\n\n  int refilled_ops 
= 0, node_suspected = -1;\n  for (int i = 0; i < max_batch_size; i++) {\n    if (ENABLE_ASSERTIONS && first_iter_has_passed[worker_lid] == 1) {\n      assert(ops[i].op_meta.opcode == ST_OP_PUT ||\n             ops[i].op_meta.opcode == ST_OP_GET ||\n             (is_CR == 0 && ops[i].op_meta.opcode == ST_OP_RMW));\n      assert(ops[i].op_meta.state == ST_PUT_COMPLETE ||\n             ops[i].op_meta.state == ST_GET_COMPLETE ||\n             ops[i].op_meta.state == ST_PUT_SUCCESS ||\n             ops[i].op_meta.state == ST_REPLAY_SUCCESS ||\n             ops[i].op_meta.state == ST_NEW ||\n             ops[i].op_meta.state == ST_MISS ||\n             ops[i].op_meta.state == ST_PUT_STALL ||\n             ops[i].op_meta.state == ST_REPLAY_COMPLETE ||\n             ops[i].op_meta.state == ST_IN_PROGRESS_PUT ||\n             //<RMW>\n             ops[i].op_meta.state == ST_RMW_STALL ||\n             ops[i].op_meta.state == ST_RMW_ABORT ||\n             ops[i].op_meta.state == ST_RMW_SUCCESS ||\n             ops[i].op_meta.state == ST_RMW_COMPLETE ||\n             ops[i].op_meta.state == ST_IN_PROGRESS_RMW ||\n             //\t\t\t\t\t   ops[i].op_meta.state ==\n             // ST_IN_PROGRESS_PUT\n             //|| <RMW>\n             ops[i].op_meta.state == ST_IN_PROGRESS_GET ||\n             ops[i].op_meta.state == ST_IN_PROGRESS_REPLAY ||\n             ops[i].op_meta.state ==\n                 ST_OP_MEMBERSHIP_CHANGE ||  /// TODO check this\n             ops[i].op_meta.state ==\n                 ST_OP_MEMBERSHIP_COMPLETE ||  /// TODO check this\n             ops[i].op_meta.state == ST_PUT_COMPLETE_SEND_VALS ||\n             ops[i].op_meta.state == ST_GET_STALL);\n    }\n\n    if (first_iter_has_passed[worker_lid] == 0 ||\n        ops[i].op_meta.state == ST_MISS ||\n        ops[i].op_meta.state == ST_PUT_COMPLETE ||\n        ops[i].op_meta.state == ST_RMW_ABORT ||\n        ops[i].op_meta.state == ST_RMW_COMPLETE ||\n        ops[i].op_meta.state == 
ST_OP_MEMBERSHIP_COMPLETE ||\n        ops[i].op_meta.state == ST_GET_COMPLETE) {\n      if (first_iter_has_passed[worker_lid] != 0) {\n        if (ENABLE_REQ_PRINTS && worker_lid < MAX_THREADS_TO_PRINT)\n          colored_printf(\n              GREEN,\n              \"W%d--> Key Hash:%\" PRIu64\n              \"\\n\\t\\tType: %s, version %d, tie-b: %d, value(len-%d): %c\\n\",\n              worker_lid, ((uint64_t*)&ops[i].op_meta.key)[0],\n              code_to_str(ops[i].op_meta.state), ops[i].op_meta.ts.version,\n              ops[i].op_meta.ts.tie_breaker_id, ops[i].op_meta.val_len,\n              ops[i].value[0]);\n\n        /// Stats\n        if (ops[i].op_meta.state != ST_MISS) {\n          if (ops[i].op_meta.state != ST_RMW_ABORT)\n            w_stats[worker_lid].completed_ops_per_worker +=\n                ENABLE_COALESCE_OF_HOT_REQS ? ops[i].no_coales : 1;\n        } else\n          w_stats[worker_lid].reqs_missed_in_kvs++;\n\n        if (ops[i].op_meta.state == ST_PUT_COMPLETE)\n          w_stats[worker_lid].completed_wrs_per_worker++;\n        else if (ops[i].op_meta.state == ST_RMW_COMPLETE)\n          w_stats[worker_lid].completed_rmws_per_worker++;\n        else if (ops[i].op_meta.state == ST_RMW_ABORT)\n          w_stats[worker_lid].aborted_rmws_per_worker++;\n\n        // reset op bucket\n        ops[i].no_coales = 1;\n        ops[i].op_meta.state = ST_EMPTY;\n        ops[i].op_meta.opcode = ST_EMPTY;\n        refilled_per_ops_debug_cnt[i] = 0;\n        refilled_ops++;\n      }\n\n      if (ENABLE_ASSERTIONS)\n        assert(trace[*trace_iter].opcode == ST_OP_PUT ||\n               trace[*trace_iter].opcode == ST_OP_RMW ||\n               trace[*trace_iter].opcode == ST_OP_GET);\n\n      if (machine_id == 0 && worker_lid == worker_measuring_latency && i == 0)\n        start_latency_measurement(start);\n\n      /// INSERT new req(s) to ops\n      uint8_t key_id;\n      if (ENABLE_COALESCE_OF_HOT_REQS &&\n          trace[*trace_iter].opcode != 
ST_OP_RMW) {\n        // see if you could coalesce any requests\n        spacetime_op_t** n_hottest_keys_in_ops;\n        do {\n          key_id = trace[*trace_iter].key_id;\n          n_hottest_keys_in_ops = trace[*trace_iter].opcode == ST_OP_GET\n                                      ? n_hottest_keys_in_ops_get\n                                      : n_hottest_keys_in_ops_put;\n          // if we can coalesce (a hot) req\n          if (key_id < COALESCE_N_HOTTEST_KEYS &&  // is a hot key\n              n_hottest_keys_in_ops[key_id] !=\n                  NULL &&  // exists in the ops array\n              n_hottest_keys_in_ops[key_id]->op_meta.opcode ==\n                  trace[*trace_iter]\n                      .opcode)  // has the same code with the last inserted\n          {\n            n_hottest_keys_in_ops[key_id]->no_coales++;\n            *trace_iter =\n                trace[*trace_iter + 1].opcode != NOP ? *trace_iter + 1 : 0;\n          } else\n            break;\n        } while (1);\n\n        if (key_id < COALESCE_N_HOTTEST_KEYS)\n          n_hottest_keys_in_ops[key_id] = &ops[i];\n      }\n\n      ops[i].op_meta.state = ST_NEW;\n      ops[i].op_meta.opcode =\n          (uint8_t)(CR_ENABLE_ALL_NODES_GETS_EXCEPT_HEAD && machine_id != 0\n                        ? ST_OP_GET\n                        : trace[*trace_iter].opcode);\n      memcpy(&ops[i].op_meta.key, &trace[*trace_iter].key_hash,\n             sizeof(spacetime_key_t));\n\n      if (ops[i].op_meta.opcode == ST_OP_PUT ||\n          ops[i].op_meta.opcode == ST_OP_RMW)\n        memset(ops[i].value, ((uint8_t)'a' + machine_id), ST_VALUE_SIZE);\n\n      else if (ENABLE_READ_COMPLETE_AFTER_VAL_RECV_OF_HOT_REQS) {\n        // if its a read reset the timestamp\n        ops[i].op_meta.ts.version = 0;\n        ops[i].op_meta.ts.tie_breaker_id = 0;\n      }\n\n      ops[i].RMW_flag = ops[i].op_meta.opcode == ST_OP_RMW ? 
1 : 0;\n\n      ops[i].op_meta.val_len = (uint8)(ops[i].op_meta.opcode == ST_OP_GET\n                                           ? 0\n                                           : ST_VALUE_SIZE >> SHIFT_BITS);\n\n      // instead of MOD add\n      *trace_iter = trace[*trace_iter + 1].opcode != NOP ? *trace_iter + 1 : 0;\n\n      if (ENABLE_REQ_PRINTS && worker_lid < MAX_THREADS_TO_PRINT)\n        colored_printf(RED, \"W%d--> Op: %s, hash(1st 8B):%\" PRIu64 \"\\n\",\n                       worker_lid, code_to_str(ops[i].op_meta.opcode),\n                       ((uint64_t*)&ops[i].op_meta.key)[0]);\n\n    } else\n      refilled_per_ops_debug_cnt[i]++;\n  }\n\n  if (refilled_ops == 0) w_stats[worker_lid].wasted_loops++;\n\n  if (first_iter_has_passed[worker_lid] == 0)\n    first_iter_has_passed[worker_lid] = 1;\n\n  if (ENABLE_ASSERTIONS)\n    for (int i = 0; i < max_batch_size; i++)\n      assert(ops[i].op_meta.opcode == ST_OP_PUT ||\n             ops[i].op_meta.opcode == ST_OP_GET ||\n             (ops[i].op_meta.opcode == ST_OP_RMW && is_CR == 0));\n\n  return node_suspected;\n}\n#endif  // HERMES_INLINE_UTIL_H\n"
  },
  {
    "path": "include/hermes/spacetime.h",
    "content": "//\n// Created by akatsarakis on 04/05/18.\n//\n\n#ifndef HERMES_SPACETIME_H\n#define HERMES_SPACETIME_H\n\n// Optik Options\n#ifndef CORE_NUM\n#define DEFAULT\n#define CORE_NUM 8\n#endif\n\n#include \"../utils/bit_vector.h\"\n#include \"../utils/concur_ctrl.h\"\n#include \"config.h\"\n#include \"hrd.h\"\n#include \"mica.h\"\n\n#define SPACETIME_NUM_KEYS (1000 * 1000)\n#define SPACETIME_NUM_BKTS (2 * 1024 * 1024)\n#define SPACETIME_LOG_CAP (1024 * 1024 * 1024)\n\n//#define SPACETIME_NUM_KEYS (60 * 1000 * 1000)\n//#define SPACETIME_NUM_BKTS (64 * 1024 * 1024)\n//#define SPACETIME_LOG_CAP  (4 * ((unsigned long long) M_1024)) //(1024 * 1024\n//* 1024)\n\n#define ST_VALUE_SIZE (KVS_VALUE_SIZE - sizeof(spacetime_object_meta))\n\n// Special EMPTY opcodes\n#define NOP 150                   // trace\n#define LAST_WRITER_ID_EMPTY 127  // 255\n#define ST_OP_BUFFER_INDEX_EMPTY 255\n\n/////////////////////////////////////////////\n//// ENUMS\n/////////////////////////////////////////////\n/// WARNING the monotonically increasing assigned numbers to States are used for\n/// comparisons (do not reorder / change numbers)\n// States\ntypedef enum {\n  VALID_STATE = 1,\n  INVALID_STATE,\n  INVALID_WRITE_STATE,\n  WRITE_STATE,\n  REPLAY_STATE,\n} __attribute__((packed)) hermes_states_t;\n\n// Input Opcodes\ntypedef enum {\n  ST_OP_GET = 111,\n  ST_OP_PUT,\n  ST_OP_RMW,\n  ST_OP_INV,\n  ST_OP_ACK,\n  ST_OP_VAL,\n  ST_OP_CRD,\n  ST_OP_MEMBERSHIP_CHANGE,\n  ST_OP_MEMBERSHIP_COMPLETE  // 119\n\n} __attribute__((packed)) input_opcodes_t;\n\n// Response Opcodes\ntypedef enum {\n  ST_GET_COMPLETE = 121,\n  ST_PUT_SUCCESS,     // broadcast invs\n  ST_REPLAY_SUCCESS,  // broadcast invs\n  ST_INV_SUCCESS,     // send ack\n  ST_ACK_SUCCESS,\n  ST_LAST_ACK_SUCCESS,           // complete local write\n  ST_LAST_ACK_NO_BCAST_SUCCESS,  // complete local write\n  ST_PUT_COMPLETE,               // broadcast invs\n  ST_VAL_SUCCESS,                // 129\n\n  ST_MISS,  // 130\n  
ST_GET_STALL,\n  ST_PUT_STALL,\n  ST_PUT_COMPLETE_SEND_VALS,\n  ST_SEND_CRD,  // 134\n\n  // RMW opcodes\n  ST_RMW_SUCCESS,  // 135\n  ST_RMW_STALL,\n  ST_RMW_COMPLETE,\n  ST_RMW_ABORT,\n  ST_OP_INV_ABORT,  // 139 //send inv instead of ACK\n\n} __attribute__((packed)) response_opcodes_t;\n\n// ops bucket states\ntypedef enum {\n  ST_EMPTY = 140,\n  ST_NEW,\n  ST_COMPLETE,\n  ST_IN_PROGRESS_PUT,\n  ST_IN_PROGRESS_REPLAY,\n  ST_REPLAY_COMPLETE,\n  ST_IN_PROGRESS_GET,  // Used only in Chain Replication\n  ST_REPLAY_COMPLETE_SEND_VALS,\n  ST_IN_PROGRESS_RMW,\n  ST_RMW_COMPLETE_SEND_VALS  // 149\n} __attribute__((packed)) op_bucket_states_t;\n\n// failure detection (deprecated)\ntypedef enum {\n  ST_OP_HEARTBEAT = 151,  // WARNING: 150 opcode is used (see NOP define)!!\n  ST_OP_SUSPICION,\n  ST_INV_OUT_OF_GROUP\n} __attribute__((packed)) fs_ops_t;\n\n// receive_buff_types\ntypedef enum {\n  ST_INV_BUFF = 161,\n  ST_ACK_BUFF,\n  ST_VAL_BUFF,\n  ST_CRD_BUFF\n} __attribute__((packed)) rcv_buff_types_t;\n\n/////////////////////////////////////////////\n//// Hermes(msg and KV -- spacetime) structs\n/////////////////////////////////////////////\n\n// Fixed-size 8 (or 16) byte keys\ntypedef struct {\n  //    uint64 __unused; // This should be 8B ////// Uncomment this for\n  //    fixed-size 16 byte keys instead\n  uint64_t bkt : 48;\n  unsigned int tag : 16;\n} spacetime_key_t;\n\ntypedef volatile struct {\n  hermes_states_t state;\n  bit_vector_t ack_bv;\n  uint8_t RMW_flag : 1;\n  uint8_t last_writer_id : 7;\n  uint8_t op_buffer_index;  // TODO change to uint16_t for a buffer >= 256\n  conc_ctrl_t cctrl;\n  timestamp_t last_local_write_ts;\n} spacetime_object_meta;\n\ntypedef struct {\n  spacetime_key_t key; /* This must be the 1st field and 8B or 16B aligned */\n  uint8_t opcode;      // both recv / resp //TODO create a union\n  union {\n    uint8_t state;      // HERMES:  used by spacetime_op_t\n    uint8_t sender;     // HERMES:  used by spacetime_inv/ack/val_t\n    
uint8_t initiator;  // CR:  used by spacetime_inv/ack\n  };\n  union {\n    uint8_t val_len;   // HERMES: unused for spacetime_ack_t and spacetime_val_t\n                       // (align for using a single memcpy)\n    uint8_t buff_idx;  //    CR: used   for spacetime_ack_t buffer index of\n                       //    write initiated this req\n  };\n  timestamp_t ts;\n} spacetime_op_meta_t, spacetime_ack_t, spacetime_val_t;\n\ntypedef struct {\n  spacetime_op_meta_t op_meta;  // op_t/inv_t: uses the state/sender part of the\n                                // op_meta union (not sender/state)\n  union {\n    struct {                    // Hermes struct\n      uint8_t RMW_flag : 1;     // 1 indicates RMWs while 0 normal writes\n      uint16_t no_coales : 15;  // used only for skew optimizations\n    };\n    struct {              // CR struct\n      uint8_t buff_idx;   //    for spacetime_inv_t buffer index of write\n                          //    initiated this req\n      uint8_t initiator;  //    for spacetime_inv_t buffer index of write\n                          //    initiated this req\n    };\n  };\n  uint8_t value[ST_VALUE_SIZE];\n} spacetime_op_t, spacetime_inv_t;\n\ntypedef struct {\n  volatile uint8_t num_of_alive_remotes;\n  volatile bit_vector_t g_membership;\n  volatile bit_vector_t w_ack_init;\n  seqlock_t lock;\n} spacetime_group_membership;\n\nstruct spacetime_kv {\n  // TODO may add kvs stats\n  struct mica_kv hash_table;\n};\n\nstruct spacetime_trace_command {\n  spacetime_key_t key_hash;\n  uint8_t opcode;\n  uint8_t key_id;  // stores key ids 0-254 otherwise it is set to 255 to\n                   // indicate other key ids\n};\n\nvoid spacetime_init(int spacetime_id);\nvoid spacetime_populate_fixed_len(struct spacetime_kv* kv, int n, int val_len);\n\n///////////////////////////////////////\n//////////////////// Hermes\n///////////////////////////////////////\n\nenum hermes_batch_type_t {\n  local_ops,\n  local_ops_after_membership_change,\n  
invs,\n  acks,\n  vals\n};\n\nvoid hermes_batch_ops_to_KVS(enum hermes_batch_type_t type, uint8_t* op_array,\n                             int op_num, uint16_t sizeof_op_elem,\n                             spacetime_group_membership curr_membership,\n                             int* node_suspected,\n                             spacetime_op_t* read_write_ops, uint8_t thread_id);\n\n///////////////////////////////////////\n//////////////////// CR(AQ)\n///////////////////////////////////////\nenum cr_type_t {\n  Local_ops,      // All nodes\n  Remote_writes,  // Head\n  Remote_reads,   // Tail\n  Invs,           // All except Head\n  Acks            // All except Tail\n};\n\nvoid cr_batch_ops_to_KVS(enum cr_type_t cr_type, uint8_t* op_array, int op_num,\n                         uint16_t sizeof_op_elem,\n                         spacetime_op_t* read_write_op);\n\n///////////////////////////////////////\n//////////////////// Helpers\n///////////////////////////////////////\nstatic inline uint8_t\nis_last_ack(bit_vector_t gathered_acks,\n            spacetime_group_membership curr_g_membership)\n{\n  bv_and(&gathered_acks, curr_g_membership.g_membership);\n  return bv_are_equal(gathered_acks, curr_g_membership.g_membership);\n}\n\n// TODO: adapt and use the following functions to re-enable variable length\n// object support\nstatic inline uint8_t\nget_val_len(struct mica_op* op_t)\n{\n  return (op_t->val_len >> SHIFT_BITS) - sizeof(spacetime_op_meta_t);\n}\n\nstatic inline uint8_t\nset_val_len(spacetime_op_meta_t* op_t)\n{\n  return (op_t->val_len >> SHIFT_BITS) + sizeof(spacetime_op_meta_t);\n}\n\nextern struct spacetime_kv kv;\nextern spacetime_group_membership group_membership;\n\n#endif  // HERMES_SPACETIME_H\n"
  },
  {
    "path": "include/hermes/util.h",
    "content": "//\n// Created by akatsarakis on 15/03/18.\n//\n\n#ifndef HERMES_UTIL_H\n#define HERMES_UTIL_H\n\n#include <stdint.h>\n#include <stdio.h>\n#include <time.h>\n#include \"config.h\"\n#include \"hrd.h\"\n#include \"spacetime.h\"\n\nstruct worker_stats {\n  long long completed_ops_per_worker;\n  long long completed_wrs_per_worker;\n  long long completed_rmws_per_worker;\n  long long aborted_rmws_per_worker;\n  long long reqs_missed_in_kvs;\n\n  long long issued_invs_per_worker;\n  long long issued_acks_per_worker;\n  long long issued_vals_per_worker;\n  long long issued_crds_per_worker;\n\n  long long issued_packet_invs_per_worker;\n  long long issued_packet_acks_per_worker;\n  long long issued_packet_vals_per_worker;\n  long long issued_packet_crds_per_worker;\n\n  long long inv_ss_completions_per_worker;\n  long long ack_ss_completions_per_worker;\n  long long val_ss_completions_per_worker;\n  long long crd_ss_completions_per_worker;\n\n  long long received_invs_per_worker;\n  long long received_acks_per_worker;\n  long long received_vals_per_worker;\n  long long received_crds_per_worker;\n\n  long long received_packet_invs_per_worker;\n  long long received_packet_acks_per_worker;\n  long long received_packet_vals_per_worker;\n  long long received_packet_crds_per_worker;\n\n  long long received_acks_stalled;  // for faking tail-latency\n\n  long long stalled_time_per_worker;\n\n  long long wasted_loops;\n  long long total_loops;\n  double empty_reqs_per_trace;\n  long long cold_keys_per_trace;\n  double tot_empty_reqs_per_trace;\n};\n\nstruct stats {\n  double xput_per_worker[MAX_WORKERS_PER_MACHINE];\n  double rmw_xput_per_worker[MAX_WORKERS_PER_MACHINE];\n  double rmw_abort_rate_per_worker[MAX_WORKERS_PER_MACHINE];\n\n  double issued_invs_avg_coalesing[MAX_WORKERS_PER_MACHINE];\n  double issued_acks_avg_coalesing[MAX_WORKERS_PER_MACHINE];\n  double issued_vals_avg_coalesing[MAX_WORKERS_PER_MACHINE];\n  double 
issued_crds_avg_coalesing[MAX_WORKERS_PER_MACHINE];\n\n  double received_invs_avg_coalesing[MAX_WORKERS_PER_MACHINE];\n  double received_acks_avg_coalesing[MAX_WORKERS_PER_MACHINE];\n  double received_vals_avg_coalesing[MAX_WORKERS_PER_MACHINE];\n  double received_crds_avg_coalesing[MAX_WORKERS_PER_MACHINE];\n\n  double percentage_of_wasted_loops[MAX_WORKERS_PER_MACHINE];\n  double completed_reqs_per_loop[MAX_WORKERS_PER_MACHINE];\n\n  //\tlong long issued_packet_acks_per_worker;\n  double batch_size_per_worker[MAX_WORKERS_PER_MACHINE];\n  double empty_reqs_per_worker[MAX_WORKERS_PER_MACHINE];\n  double stalled_time_per_worker[MAX_WORKERS_PER_MACHINE];\n  double average_coalescing_per_worker[MAX_WORKERS_PER_MACHINE];\n\n  double acks_per_worker[MAX_WORKERS_PER_MACHINE];\n  double invs_per_worker[MAX_WORKERS_PER_MACHINE];\n  double updates_per_worker[MAX_WORKERS_PER_MACHINE];\n\n  double write_ratio_per_worker[MAX_WORKERS_PER_MACHINE];\n};\n\n// init all stats to 0\nstatic inline void\ninit_stats(struct worker_stats* w_stats)\n{\n  memset(w_stats, 0, sizeof(struct worker_stats) * MAX_WORKERS_PER_MACHINE);\n}\n\nvoid trace_init(struct spacetime_trace_command** trace, uint16_t worker_lid);\nvoid* run_worker(void* arg);\nvoid* print_stats_thread(void* no_arg);\nvoid dump_latency_stats(void);\n\n// Maybe inline these\nuint8_t is_state_code(uint8_t code);\nuint8_t is_input_code(uint8_t code);\nuint8_t is_response_code(uint8_t code);\nuint8_t is_bucket_state_code(uint8_t code);\n\nint spawn_stats_thread(void);\nchar* code_to_str(uint8_t code);\n\nvoid setup_kvs_buffs(spacetime_op_t** ops, spacetime_inv_t** inv_recv_ops,\n                     spacetime_ack_t** ack_recv_ops,\n                     spacetime_val_t** val_recv_ops);\n\nextern dbit_vector_t* g_share_qs_barrier;\nextern volatile struct worker_stats w_stats[MAX_WORKERS_PER_MACHINE];\n#endif  // HERMES_UTIL_H\n"
  },
  {
    "path": "include/mica-herd/city.h",
    "content": "// city.h - cityhash-c\n// CityHash on C\n// Copyright (c) 2011-2012, Alexander Nusov\n//\n// - original copyright notice -\n// Copyright (c) 2011 Google, Inc.\n//\n// Permission is hereby granted, free of charge, to any person obtaining a copy\n// of this software and associated documentation files (the \"Software\"), to deal\n// in the Software without restriction, including without limitation the rights\n// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n// copies of the Software, and to permit persons to whom the Software is\n// furnished to do so, subject to the following conditions:\n//\n// The above copyright notice and this permission notice shall be included in\n// all copies or substantial portions of the Software.\n//\n// THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN\n// THE SOFTWARE.\n//\n// CityHash, by Geoff Pike and Jyrki Alakuijala\n//\n// This file provides a few functions for hashing strings. On x86-64\n// hardware in 2011, CityHash64() is faster than other high-quality\n// hash functions, such as Murmur.  This is largely due to higher\n// instruction-level parallelism.  CityHash64() and CityHash128() also perform\n// well on hash-quality tests.\n//\n// CityHash128() is optimized for relatively long strings and returns\n// a 128-bit hash.  
For strings more than about 2000 bytes it can be\n// faster than CityHash64().\n//\n// Functions in the CityHash family are not suitable for cryptography.\n//\n// WARNING: This code has not been tested on big-endian platforms!\n// It is known to work well on little-endian platforms that have a small penalty\n// for unaligned reads, such as current Intel and AMD moderate-to-high-end CPUs.\n//\n// By the way, for some hash functions, given strings a and b, the hash\n// of a+b is easily derived from the hashes of a and b.  This property\n// doesn't hold for any hash functions in this file.\n\n#ifndef CITY_HASH_H_\n#define CITY_HASH_H_\n\n#include <stdint.h>\n#include <stdlib.h>\n\ntypedef uint8_t uint8;\ntypedef uint32_t uint32;\ntypedef uint64_t uint64;\n\ntypedef struct _uint128 uint128;\nstruct _uint128 {\n  uint64 first;\n  uint64 second;\n};\n\n#define Uint128Low64(x) (x).first\n#define Uint128High64(x) (x).second\n\n// Hash function for a byte array.\nuint64 CityHash64(const char* buf, size_t len);\n\n// Hash function for a byte array.  For convenience, a 64-bit seed is also\n// hashed into the result.\nuint64 CityHash64WithSeed(const char* buf, size_t len, uint64 seed);\n\n// Hash function for a byte array.  For convenience, two seeds are also\n// hashed into the result.\nuint64 CityHash64WithSeeds(const char* buf, size_t len, uint64 seed0,\n                           uint64 seed1);\n\n// Hash function for a byte array.\nuint128 CityHash128(const char* s, size_t len);\n\n// Hash function for a byte array.  For convenience, a 128-bit seed is also\n// hashed into the result.\nuint128 CityHash128WithSeed(const char* s, size_t len, uint128 seed);\n\n#endif  // CITY_HASH_H_\n"
  },
  {
    "path": "include/mica-herd/hrd.h",
    "content": "#ifndef HRD_H\n#define HRD_H\n\n#include <assert.h>\n#include <errno.h>\n#include <numaif.h>\n#include <stdarg.h>\n#include <stdio.h>\n#include <stdlib.h>\n#include <string.h>\n#include <sys/ipc.h>\n#include <sys/shm.h>\n#include <sys/time.h>\n#include <sys/types.h>\n#include <unistd.h>\n\n#include <infiniband/verbs.h>\n#include <libmemcached/memcached.h>\n#include <malloc.h>\n#include <time.h>\n#include \"sizes.h\"\n\n//<vasilis> Multicast\n// TODO we do not use hw multicast because it helps only on master-based\n// patterns\n//#include <rdma/rdma_cma.h>\n#include <arpa/inet.h>\n#include <byteswap.h>\n#include <netdb.h>\n#include <netinet/in.h>\n#include <sys/socket.h>\n// <vasilis>\n\n#define USE_BIG_OBJECTS 0\n#define EXTRA_CACHE_LINES 0\n#define BASE_VALUE_SIZE 46  // max is --> 46\n#define SHIFT_BITS \\\n  (USE_BIG_OBJECTS == 1 ? 3 : 0)  // number of bits to shift left or right to\n                                  // calculate the value length\n#define HRD_DEFAULT_PSN \\\n  3185 /* PSN for all queues */  // starting Packet Sequence Number\n#define HRD_DEFAULT_QKEY 0x11111111\n\n#define HRD_QP_NAME_SIZE 200 /* Size (in bytes) of a queue pair name */\n#define HRD_RESERVED_NAME_PREFIX \"__HRD_RESERVED_NAME_PREFIX\"\n\n#define KVS_VALUE_SIZE                                \\\n  (USE_BIG_OBJECTS == 1                               \\\n       ? ((EXTRA_CACHE_LINES * 64) + BASE_VALUE_SIZE) \\\n       : BASE_VALUE_SIZE)  //(169 + 64)// 46 + 64 + 64//32 //(46 + 64)\n\n#define HUGE_PAGE_SIZE 2097152\n#define LEVERAGE_TLB_COALESCING 1\n\n/*\n * Small max_inline_data reduces the QP's max WQE size, which reduces the\n * DMA size in doorbell method of WQE fetch.\n */\n#define HRD_MAX_INLINE \\\n  188  //(USE_BIG_OBJECTS == 1 ? 
((EXTRA_CACHE_LINES * 64) + 60) : 60) //60 is\n       // what kalia had here//\n\n// This is required for ROCE not sure yet why\n// <vasilis>\n#define IB_PHYS_PORT 1\n// </vasilis>\n// <akatsarakis>\n#define USE_HUGE_PAGES 1\n// </akatsarakis>\n\n#ifndef likely\n#define likely(x) __builtin_expect(!!(x), 1)\n#endif\n\n#ifndef unlikely\n#define unlikely(x) __builtin_expect(!!(x), 0)\n#endif\n\n/* Compare, print, and exit */\n#define CPE(val, msg, err_code)                \\\n  if (unlikely(val)) {                         \\\n    fprintf(stderr, msg);                      \\\n    fprintf(stderr, \" Error %d \\n\", err_code); \\\n    exit(err_code);                            \\\n  }\n\n/* vasilis added a ceiling and a MAX*/\n#define CEILING(x, y) (((x) + (y)-1) / (y))\n#define MAX(x, y) (x > y ? x : y)\n\nint is_roce;\nint machine_id;\nchar *remote_IP, *local_IP;\n\n/* Registry info about a QP */\nstruct hrd_qp_attr {\n  char name[HRD_QP_NAME_SIZE];\n\n  // ROCE\n  uint64_t\n      gid_global_interface_id;  // Store the gid fields separately because I\n  uint64_t gid_global_subnet_prefix;  // don't like unions. Needed for RoCE only\n\n  /* Info about the RDMA buffer associated with this QP */\n  uintptr_t buf_addr;\n  uint32_t buf_size;\n  uint32_t rkey;\n\n  int lid;\n  int qpn;\n  uint8_t sl;\n};\n\nstruct hrd_ud_ctrl_blk {\n  int local_hid; /* Local ID on the machine this process runs on */\n\n  /* Info about the device/port to use for this control block */\n  struct ibv_context* ctx;\n  int device_id;    /* Resovled by libhrd from @port_index */\n  int dev_port_id;  /* 1-based within dev @device_id. 
Resolved by libhrd */\n  int numa_node_id; /* NUMA node id */\n\n  struct ibv_pd* pd; /* A protection domain for this control block */\n\n  /* Datagram QPs */\n  int num_dgram_qps;\n  struct ibv_qp** dgram_qp;\n  struct ibv_cq **dgram_send_cq, **dgram_recv_cq;\n  volatile uint8_t* dgram_buf; /* A buffer for RECVs on dgram QPs */\n  int* recv_q_depth;\n  int* send_q_depth;\n  int dgram_buf_shm_key;\n  struct ibv_mr* dgram_buf_mr;\n};\n\n/* Major initialization functions */\n\nstruct hrd_ud_ctrl_blk* hrd_ud_ctrl_blk_init(\n    int local_hid, int port_index,\n    int numa_node_id, /* -1 means don't use hugepages */\n    int num_dgram_qps, int dgram_buf_size, int dgram_buf_shm_key,\n    int* recv_q_depth, int* send_q_depth);\n\nint hrd_ud_ctrl_blk_destroy(struct hrd_ud_ctrl_blk* cb);\n\n/* RDMA resolution functions */\nstruct ibv_device* hrd_resolve_port_index(struct hrd_ud_ctrl_blk* cb,\n                                          int port_index);\n\nuint16_t hrd_get_local_lid(struct ibv_context* ctx, int port_id);\n\nvoid hrd_create_dgram_qps(struct hrd_ud_ctrl_blk* cb);\n\n/* Fill @wc with @num_comps comps from this @cq. Exit on error. 
*/\nstatic inline uint32_t\nhrd_poll_cq(struct ibv_cq* cq, int num_comps, struct ibv_wc* wc)\n{\n  int comps = 0;\n  uint32_t debug_cnt = 0;\n  while (comps < num_comps) {\n    if (debug_cnt > M_256) {\n      printf(\"Someone is stuck waiting for a completion %d / %d  \\n\", comps,\n             num_comps);\n      debug_cnt = 0;\n    }\n    int new_comps = ibv_poll_cq(cq, num_comps - comps, &wc[comps]);\n    if (new_comps != 0) {\n      // printf(\"I see completions %d\\n\", new_comps);\n      /* Ideally, we should check from comps -> new_comps - 1 */\n      if (wc[comps].status != 0) {\n        fprintf(stderr, \"Bad wc status %d\\n\", wc[comps].status);\n        exit(0);\n      }\n      comps += new_comps;\n    }\n    debug_cnt++;\n  }\n  return debug_cnt;\n}\n\nstatic inline struct ibv_mr*\nregister_buffer(struct ibv_pd* pd, void* buf, uint32_t size)\n{\n  int ib_flags = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ |\n                 IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_ATOMIC;\n  struct ibv_mr* mr = ibv_reg_mr(pd, (char*)buf, size, ib_flags);\n  assert(mr != NULL);\n  return mr;\n}\n\n/* Registry functions */\nvoid hrd_publish(const char* key, void* value, int len);\nint hrd_get_published(const char* key, void** value);\n\n///* Publish the nth connected queue pair from this cb with this name */\n// void hrd_publish_conn_qp(struct hrd_ud_ctrl_blk *cb, int n, const char\n// *qp_name);\n\n/* Publish the nth datagram queue pair from this cb with this name */\nvoid hrd_publish_dgram_qp(struct hrd_ud_ctrl_blk* cb, int n,\n                          const char* qp_name, uint8_t sl);\n\nstruct hrd_qp_attr* hrd_get_published_qp(const char* qp_name);\n\n/* Utility functions */\nstatic inline uint32_t\nhrd_fastrand(uint64_t* seed)\n{\n  *seed = *seed * 1103515245 + 12345;\n  return (uint32_t)(*seed >> 32);\n}\n\nvoid* hrd_malloc_socket(int shm_key, uint64_t size, int socket_id);\nint hrd_free(int shm_key, void* shm_buf);\nchar* hrd_getenv(const char* name);\n\n// 
Like printf, but colorful. Limited to 1000 characters.\ntypedef enum { YELLOW = 0, RED, GREEN, CYAN } color_print_t;\nvoid colored_printf(color_print_t color, const char* format, ...);\n\nextern char dev_name[50];\n#endif /* HRD_H */\n"
  },
  {
    "path": "include/mica-herd/mica.h",
    "content": "#ifndef MICA_H\n#define MICA_H\n\n#include <stdint.h>\n#include \"city.h\"\n#include \"hrd.h\"\n\n/*\n * The polling logic in HERD requires the following:\n * 1. 0 < MICA_OP_GET < MICA_OP_PUT < HERD_OP_GET < HERD_OP_PUT\n * 2. HERD_OP_GET = MICA_OP_GET + HERD_MICA_OFFSET\n * 3. HERD_OP_PUT = MICA_OP_PUT + HERD_MICA_OFFSET\n *\n * This allows us to detect HERD requests by checking if the request region\n * opcode is more than MICA_OP_PUT. And then we can convert a HERD opcode to\n * a MICA opcode by subtracting HERD_MICA_OFFSET from it.\n */\n#define MICA_OP_PUT 112\n\n/* Ensure that a mica_op is cacheline aligned */\n#define MICA_OP_METADATA \\\n  (sizeof(struct mica_key) + sizeof(uint8_t) + sizeof(uint8_t))\n#define MICA_MIN_VALUE (64 - MICA_OP_METADATA)\n#define MICA_MAX_VALUE                                                \\\n  (USE_BIG_OBJECTS == 1 ? (MICA_MIN_VALUE + (EXTRA_CACHE_LINES * 64)) \\\n                        : MICA_MIN_VALUE)\n\n#define MICA_LOG_BITS 40\n\n#define MICA_INDEX_SHM_KEY 3185\n#define MICA_LOG_SHM_KEY 4185\n\n/*\n * Debug values:\n * 0: No safety checks on fast path\n * 1: Sanity checks for arguments\n * 2: Pretty print GET/PUT operations\n */\n\n#define MICA_DEBUG 0\n\nstruct mica_resp {\n  uint8_t type;\n  uint8_t val_len;\n  uint16_t unused[3]; /* Make val_ptr 8-byte aligned */\n  uint8_t* val_ptr;\n};\n\n/* Fixed-size 16 byte keys */\nstruct mica_key {\n  unsigned long long __unused : 64;\n  unsigned int bkt : 32;\n  unsigned int server : 16;\n  unsigned int tag : 16;\n};\n\nstruct mica_op {\n  struct mica_key key; /* This must be the 1st field and 16B aligned */\n  uint8_t opcode;\n  uint8_t val_len;\n  uint8_t value[MICA_MAX_VALUE];\n};\n\nstruct mica_slot {\n  uint32_t in_use : 1;\n  uint32_t tag : (64 - MICA_LOG_BITS - 1);\n  uint64_t offset : MICA_LOG_BITS;\n};\n\nstruct mica_bkt {\n  struct mica_slot slots[8];\n};\n\nstruct mica_kv {\n  struct mica_bkt* ht_index;\n  uint8_t* ht_log;\n\n  /* Metadata */\n  int 
instance_id; /* ID of this MICA instance. Used for shm keys */\n\n  uint64_t num_bkts; /* Number of buckets requested by user */\n  uint64_t bkt_mask; /* Mask down from a mica_key's @bkt to a bucket */\n\n  uint64_t log_cap;  /* Capacity of circular log in bytes */\n  uint64_t log_mask; /* Mask down from a slot's @offset to a log offset */\n\n  /* State */\n  uint64_t log_head;\n\n  /* Stats */\n  long long num_insert_op;       /* Number of PUT requests executed */\n  long long num_index_evictions; /* Number of entries evicted from index */\n};\n\nvoid mica_init(struct mica_kv* kv, int instance_id, int node_id, int num_bkts,\n               uint64_t log_cap);\n\n/* Single-key INSERT */\nvoid mica_insert_one(struct mica_kv* kv, struct mica_op* op,\n                     struct mica_resp* res);\n\n/* Helpers */\nuint128* mica_gen_keys(int n);\n\n///* Debug functions */\nvoid mica_print_op(struct mica_op* op);\n\n#endif\n"
  },
  {
    "path": "include/mica-herd/sizes.h",
    "content": "#define K_32 32768\n\n#define K_64 65536\n\n#define K_128 131072\n#define K_128_ 131071\n\n#define K_256 262144\n#define K_256_ 262143\n\n#define K_512 524288\n#define K_512_ 524287\n\n#define M_1 1048576\n#define M_1_ 1048575\n\n#define M_2 2097152\n#define M_2_ 2097151\n\n#define M_4 4194304\n#define M_4_ 4194303\n\n#define M_8 8388608\n#define M_8_ 8388607\n\n#define M_16 16777216\n#define M_16_ 16777215\n\n#define M_32 33554432\n#define M_32_ 33554431\n\n#define M_128 134217728\n#define M_128_ 134217727\n\n#define M_256 268435456\n#define M_256_ 268435455\n\n#define M_512 536870912\n#define M_512_ 536870911\n\n#define M_1024 1073741824\n#define M_1024_ 1073741823\n\n#define MILLION 1000000\n"
  },
  {
    "path": "include/utils/bit_vector.h",
    "content": "//\n// Created by akatsarakis on 11/12/18.\n//\n\n#ifndef HERMES_BIT_VECTOR_H\n#define HERMES_BIT_VECTOR_H\n\n#include <assert.h>\n#include <stdint.h>\n#include <stdio.h>\n#include <stdlib.h>\n#include <string.h>\n\n// Change accordingly\n#define BV_BIT_VECTOR_SIZE \\\n  8  // Set if you use statical bit vector (bit_vector_t)\n#define BV_ENABLE_BIT_VECTOR_ASSERTS 1\n\n// Do not change the following defines\n#define BV_CEILING(x, y) (((x) + (y)-1) / (y))\n#define BV_BITS_IN_A_BYTE 8\n\n#define BV_BIT_VECTOR_SIZE_IN_BYTES \\\n  BV_CEILING(BV_BIT_VECTOR_SIZE, BV_BITS_IN_A_BYTE)\n\n#define BV_BIT_SLOT(bit) (bit / BV_BITS_IN_A_BYTE)\n#define BV_BIT_MOD(bit) ((uint8_t)1 << bit % BV_BITS_IN_A_BYTE)\n\n// print binary numbers\n#define BYTE_TO_BINARY_PATTERN \"%c%c%c%c%c%c%c%c\"\n#define BYTE_TO_BINARY(byte)                                \\\n  (byte & 0x80 ? '1' : '0'), (byte & 0x40 ? '1' : '0'),     \\\n      (byte & 0x20 ? '1' : '0'), (byte & 0x10 ? '1' : '0'), \\\n      (byte & 0x08 ? '1' : '0'), (byte & 0x04 ? '1' : '0'), \\\n      (byte & 0x02 ? '1' : '0'), (byte & 0x01 ? 
'1' : '0')\n\ntypedef struct {\n  uint8_t bit_array[BV_BIT_VECTOR_SIZE_IN_BYTES];\n} bit_vector_t;\n\ntypedef struct {\n  uint8_t bv_size;     // in bits\n  uint8_t* bit_array;  // bit_array len == ceil(bv_size / 8)\n} dbit_vector_t;\n\n// returns the least number of bytes required to store x bits\nstatic inline uint16_t\nbv_bits_to_bytes(uint16_t bits)\n{\n  return (uint16_t)BV_CEILING(bits, BV_BITS_IN_A_BYTE);\n}\n\n/////////////////////////////////////////\n/// Internal Bitvector API functions (should not be called directly)\n/////////////////////////////////////////\n\nstatic inline void\nbv_init_internal(uint8_t* bit_array, uint16_t size_in_bits)\n{\n  for (int i = 0; i < bv_bits_to_bytes(size_in_bits); ++i)\n    bit_array[i] = 0;\n}\n\nstatic inline uint8_t\nbv_bit_get_internal(const uint8_t* bit_array, uint16_t size_in_bits,\n                    uint8_t bit)\n{\n  if (BV_ENABLE_BIT_VECTOR_ASSERTS) assert(bit < size_in_bits);\n\n  return (uint8_t)((bit_array[BV_BIT_SLOT(bit)] & BV_BIT_MOD(bit)) == 0 ? 
0\n                                                                        : 1);\n}\n\nstatic inline void\nbv_bit_set_internal(uint8_t* bit_array, uint16_t size_in_bits, uint8_t bit)\n{\n  if (BV_ENABLE_BIT_VECTOR_ASSERTS) assert(bit < size_in_bits);\n\n  bit_array[BV_BIT_SLOT(bit)] |= BV_BIT_MOD(bit);\n}\n\nstatic inline void\nbv_bit_reset_internal(uint8_t* bit_array, uint16_t size_in_bits, uint8_t bit)\n{\n  if (BV_ENABLE_BIT_VECTOR_ASSERTS) assert(bit < size_in_bits);\n\n  bit_array[BV_BIT_SLOT(bit)] &= ~(BV_BIT_MOD(bit));\n}\n\nstatic inline void\nbv_set_all_internal(uint8_t* bit_array, uint16_t size_in_bits)\n{\n  uint8_t bytes = (uint8_t)bv_bits_to_bytes(size_in_bits);\n  uint8_t unused = (uint8_t)(bytes * 8 - size_in_bits);\n  uint8_t last_byte = (uint8_t)(255 >> unused);\n\n  for (int i = 0; i < bytes - 1; ++i)\n    bit_array[i] = 255;\n\n  bit_array[bytes - 1] = last_byte;\n}\n\nstatic inline void\nbv_reset_all_internal(uint8_t* bit_array, uint16_t size_in_bits)\n{\n  for (int i = 0; i < bv_bits_to_bytes(size_in_bits); ++i)\n    bit_array[i] = 0;\n}\n\nstatic inline uint8_t\nbv_are_equal_internal(uint8_t* ba1, uint16_t size_in_bits1, uint8_t* ba2,\n                      uint16_t size_in_bits2)\n{\n  if (size_in_bits1 != size_in_bits2) return 0;\n\n  uint16_t size_in_bytes = bv_bits_to_bytes(size_in_bits1);\n\n  // shift the unused bits to avoid failing due to them\n  // (difference only in the unused bits)\n  uint8_t unused_ms_bits =\n      (uint8_t)(BV_BITS_IN_A_BYTE * size_in_bytes - size_in_bits1);\n  uint8_t last_byte1 = ba1[size_in_bytes - 1] << unused_ms_bits;\n  uint8_t last_byte2 = ba2[size_in_bytes - 1] << unused_ms_bits;\n\n  return (uint8_t)(memcmp(ba1, ba2, (size_t)(size_in_bytes - 1)) == 0 &&\n                           last_byte1 == last_byte2\n                       ? 
1\n                       : 0);\n}\n\nstatic inline void\nbv_copy_internal(uint8_t* ba_dst, uint16_t size_in_bits_dst, uint8_t* ba_src,\n                 uint16_t size_in_bits_src)\n{\n  // allow copy only if sizes match\n  if (size_in_bits_dst != size_in_bits_src) assert(0);\n\n  memcpy(ba_dst, ba_src, bv_bits_to_bytes(size_in_bits_src));\n}\n\nstatic inline uint8_t\nbv_no_setted_bits_internal(uint8_t* bit_array, uint16_t size_in_bits)\n{\n  uint8_t cnt = 0;\n  for (uint8_t i = 0; i < size_in_bits; ++i)\n    cnt += bv_bit_get_internal(bit_array, size_in_bits, i);\n  return cnt;\n}\n\n/// Bitvector Bitwise ops internal\n\nstatic inline void\nbv_reverse_internal(uint8_t* bit_array, uint16_t size_in_bits)\n{\n  for (int i = 0; i < bv_bits_to_bytes(size_in_bits); ++i)\n    bit_array[i] = ~bit_array[i];\n}\n\nstatic inline void\nbv_and_internal(uint8_t* ba_dst, uint16_t size_in_bits_dst,\n                const uint8_t* ba_src, uint16_t size_in_bits_src)\n{\n  // allow and only if sizes match\n  if (size_in_bits_dst != size_in_bits_src) assert(0);\n\n  for (int i = 0; i < bv_bits_to_bytes(size_in_bits_dst); ++i)\n    ba_dst[i] &= ba_src[i];\n}\n\nstatic inline void\nbv_or_internal(uint8_t* ba_dst, uint16_t size_in_bits_dst,\n               const uint8_t* ba_src, uint16_t size_in_bits_src)\n{\n  // allow or only if sizes match\n  if (size_in_bits_dst != size_in_bits_src) assert(0);\n\n  for (int i = 0; i < bv_bits_to_bytes(size_in_bits_dst); ++i)\n    ba_dst[i] |= ba_src[i];\n}\n\n/// Bitvector Print functions\n\nstatic inline void\nbv_print_internal(const uint8_t* bit_array, uint16_t size_in_bits)\n{\n  for (int i = bv_bits_to_bytes(size_in_bits) - 1; i >= 0; --i)\n    printf(BYTE_TO_BINARY_PATTERN, BYTE_TO_BINARY(bit_array[i]));\n}\n\nstatic inline void\nbv_print_enhanced_internal(const uint8_t* bit_array, uint16_t size_in_bits)\n{\n  printf(\"Bit vector: \");\n  bv_print_internal(bit_array, size_in_bits);\n  
printf(\"\\n\");\n}\n\n/////////////////////////////////////////\n/// Dynamic Bitvector API functions\n/////////////////////////////////////////\nstatic inline void\ndbv_init(dbit_vector_t** bv, uint8_t size)\n{\n  uint16_t bv_size_in_bytes = bv_bits_to_bytes(size);\n  *bv = malloc(sizeof(dbit_vector_t));\n  (*bv)->bit_array = malloc(bv_size_in_bytes * sizeof(uint8_t));\n  (*bv)->bv_size = size;\n  bv_init_internal((*bv)->bit_array, size);\n}\n\nstatic inline void\ndbv_destroy(dbit_vector_t* bv)\n{\n  free(bv->bit_array);\n  free(bv);\n}\n\nstatic inline uint8_t\ndbv_bit_get(dbit_vector_t bv, int bit)\n{\n  return bv_bit_get_internal(bv.bit_array, bv.bv_size, bit);\n}\n\nstatic inline void\ndbv_bit_set(dbit_vector_t* bv, uint8_t bit)\n{\n  bv_bit_set_internal(bv->bit_array, bv->bv_size, bit);\n}\n\nstatic inline void\ndbv_bit_reset(dbit_vector_t* bv, uint8_t bit)\n{\n  bv_bit_reset_internal(bv->bit_array, bv->bv_size, bit);\n}\n\nstatic inline void\ndbv_set_all(dbit_vector_t* bv)\n{\n  bv_set_all_internal(bv->bit_array, bv->bv_size);\n}\n\nstatic inline void\ndbv_reset_all(dbit_vector_t* bv)\n{\n  bv_reset_all_internal(bv->bit_array, bv->bv_size);\n}\n\nstatic inline uint8_t\ndbv_no_setted_bits(dbit_vector_t bv)\n{\n  return bv_no_setted_bits_internal(bv.bit_array, bv.bv_size);\n}\n\nstatic inline uint8_t\ndbv_are_equal(dbit_vector_t bv1, dbit_vector_t bv2)\n{\n  return bv_are_equal_internal(bv1.bit_array, bv1.bv_size, bv2.bit_array,\n                               bv2.bv_size);\n}\n\nstatic inline void\ndbv_copy(dbit_vector_t* bv_dst, dbit_vector_t bv_src)\n{\n  bv_copy_internal(bv_dst->bit_array, bv_dst->bv_size, bv_src.bit_array,\n                   bv_src.bv_size);\n}\n\nstatic inline uint8_t\ndbv_is_all_set(dbit_vector_t bv)\n{\n  dbit_vector_t* bv_tmp;\n  dbv_init(&bv_tmp, bv.bv_size);\n  dbv_set_all(bv_tmp);\n  return dbv_are_equal(bv, *bv_tmp);\n}\n\n/// Bitvector bitwise ops\nstatic inline void\ndbv_reverse(dbit_vector_t* bv)\n{\n  
bv_reverse_internal(bv->bit_array, bv->bv_size);\n}\n\nstatic inline void\ndbv_and(dbit_vector_t* bv_dst, dbit_vector_t bv_src)\n{\n  bv_and_internal(bv_dst->bit_array, bv_dst->bv_size, bv_src.bit_array,\n                  bv_src.bv_size);\n}\n\nstatic inline void\ndbv_or(dbit_vector_t* bv_dst, dbit_vector_t bv_src)\n{\n  bv_or_internal(bv_dst->bit_array, bv_dst->bv_size, bv_src.bit_array,\n                 bv_src.bv_size);\n}\n\n/// Bitvector Print functions\n\nstatic inline void\ndbv_print(dbit_vector_t bv)\n{\n  bv_print_internal(bv.bit_array, bv.bv_size);\n}\n\nstatic inline void\ndbv_print_enhanced(dbit_vector_t bv)\n{\n  bv_print_enhanced_internal(bv.bit_array, bv.bv_size);\n}\n\nstatic inline void\ndbv_unit_test(void)\n{\n  dbit_vector_t* bv;\n  dbit_vector_t* bv_set__all;\n  dbv_init(&bv, 22);\n  dbv_init(&bv_set__all, 22);\n  dbv_set_all(bv_set__all);\n\n  for (uint8_t i = 0; i < bv->bv_size; ++i)\n    dbv_bit_set(bv, i);\n  assert(dbv_are_equal(*bv, *bv_set__all) == 1);\n\n  for (uint8_t i = 0; i < bv->bv_size; ++i)\n    dbv_bit_reset(bv, i);\n  dbv_reverse(bv);\n  assert(dbv_are_equal(*bv, *bv_set__all) == 1);\n\n  for (uint8_t i = 0; i < bv->bv_size; ++i)\n    if (i % 2 == 0) {\n      dbv_bit_reset(bv, i);\n      assert(dbv_bit_get(*bv, i) == 0);\n    } else {\n      dbv_bit_set(bv, i);\n      assert(dbv_bit_get(*bv, i) == 1);\n    }\n\n  dbv_reset_all(bv);\n  assert(dbv_are_equal(*bv, *bv_set__all) == 0);\n\n  dbv_set_all(bv);\n  dbv_and(bv, *bv_set__all);\n  assert(dbv_are_equal(*bv, *bv_set__all) == 1);\n\n  dbv_copy(bv, *bv_set__all);\n  assert(dbv_are_equal(*bv, *bv_set__all) == 1);\n\n  dbv_reset_all(bv);\n  dbv_or(bv, *bv_set__all);\n  assert(dbv_are_equal(*bv, *bv_set__all) == 1);\n  printf(\"Dynamic Bit Vector Unit Test was Successful!\\n\");\n}\n\n/////////////////////////////////////////\n/// Static Bitvector API functions\n/////////////////////////////////////////\n\nstatic inline void\nbv_init(bit_vector_t* bv)\n{\n  
bv_init_internal(bv->bit_array, BV_BIT_VECTOR_SIZE);\n}\n\nstatic inline uint8_t\nbv_bit_get(bit_vector_t bv, int bit)\n{\n  return bv_bit_get_internal(bv.bit_array, BV_BIT_VECTOR_SIZE, bit);\n}\n\nstatic inline void\nbv_bit_set(bit_vector_t* bv, uint8_t bit)\n{\n  bv_bit_set_internal(bv->bit_array, BV_BIT_VECTOR_SIZE, bit);\n}\n\nstatic inline void\nbv_bit_reset(bit_vector_t* bv, uint8_t bit)\n{\n  bv_bit_reset_internal(bv->bit_array, BV_BIT_VECTOR_SIZE, bit);\n}\n\nstatic inline void\nbv_set_all(bit_vector_t* bv)\n{\n  bv_set_all_internal(bv->bit_array, BV_BIT_VECTOR_SIZE);\n}\n\nstatic inline void\nbv_reset_all(bit_vector_t* bv)\n{\n  bv_reset_all_internal(bv->bit_array, BV_BIT_VECTOR_SIZE);\n}\n\nstatic inline uint8_t\nbv_no_setted_bits(bit_vector_t bv)\n{\n  return bv_no_setted_bits_internal(bv.bit_array, BV_BIT_VECTOR_SIZE);\n}\n\nstatic inline uint8_t\nbv_are_equal(bit_vector_t bv1, bit_vector_t bv2)\n{\n  return bv_are_equal_internal(bv1.bit_array, BV_BIT_VECTOR_SIZE, bv2.bit_array,\n                               BV_BIT_VECTOR_SIZE);\n}\n\nstatic inline void\nbv_copy(bit_vector_t* bv_dst, bit_vector_t bv_src)\n{\n  bv_copy_internal(bv_dst->bit_array, BV_BIT_VECTOR_SIZE, bv_src.bit_array,\n                   BV_BIT_VECTOR_SIZE);\n}\n\n/// Bitvector bitwise ops\nstatic inline void\nbv_reverse(bit_vector_t* bv)\n{\n  bv_reverse_internal(bv->bit_array, BV_BIT_VECTOR_SIZE);\n}\n\nstatic inline void\nbv_and(bit_vector_t* bv_dst, bit_vector_t bv_src)\n{\n  bv_and_internal(bv_dst->bit_array, BV_BIT_VECTOR_SIZE, bv_src.bit_array,\n                  BV_BIT_VECTOR_SIZE);\n}\n\nstatic inline void\nbv_or(bit_vector_t* bv_dst, bit_vector_t bv_src)\n{\n  bv_or_internal(bv_dst->bit_array, BV_BIT_VECTOR_SIZE, bv_src.bit_array,\n                 BV_BIT_VECTOR_SIZE);\n}\n\n/// Bitvector Print functions\n\nstatic inline void\nbv_print(bit_vector_t bv)\n{\n  bv_print_internal(bv.bit_array, BV_BIT_VECTOR_SIZE);\n}\n\nstatic inline void\nbv_print_enhanced(bit_vector_t bv)\n{\n  
bv_print_enhanced_internal(bv.bit_array, BV_BIT_VECTOR_SIZE);\n}\n\n/////////////////////////////////////////\n/// Bitvector unit test functions\n/////////////////////////////////////////\nstatic inline void\nbv_unit_test(void)\n{\n  bit_vector_t bv;\n  bit_vector_t bv_set__all;\n  bv_init(&bv);\n  bv_set_all(&bv_set__all);\n\n  dbv_unit_test();\n\n  for (uint8_t i = 0; i < BV_BIT_VECTOR_SIZE; ++i)\n    bv_bit_set(&bv, i);\n  assert(bv_are_equal(bv, bv_set__all) == 1);\n\n  for (uint8_t i = 0; i < BV_BIT_VECTOR_SIZE; ++i)\n    bv_bit_reset(&bv, i);\n  bv_reverse(&bv);\n  assert(bv_are_equal(bv, bv_set__all) == 1);\n\n  for (uint8_t i = 0; i < BV_BIT_VECTOR_SIZE; ++i)\n    if (i % 2 == 0) {\n      bv_bit_reset(&bv, i);\n      assert(bv_bit_get(bv, i) == 0);\n    } else {\n      bv_bit_set(&bv, i);\n      assert(bv_bit_get(bv, i) == 1);\n    }\n\n  bv_reset_all(&bv);\n  assert(bv_are_equal(bv, bv_set__all) == 0);\n\n  bv_set_all(&bv);\n  bv_and(&bv, bv_set__all);\n  assert(bv_are_equal(bv, bv_set__all) == 1);\n\n  bv_copy(&bv, bv_set__all);\n  assert(bv_are_equal(bv, bv_set__all) == 1);\n\n  bv_reset_all(&bv);\n  bv_or(&bv, bv_set__all);\n  assert(bv_are_equal(bv, bv_set__all) == 1);\n  printf(\"Static  Bit Vector Unit Test was Successful!\\n\");\n}\n\n#endif  // HERMES_BIT_VECTOR_H\n"
  },
  {
    "path": "include/utils/concur_ctrl.h",
    "content": "//\n// Created by akatsarakis on 11/12/18.\n//\n\n#ifndef HERMES_SEQLOCK_H\n#define HERMES_SEQLOCK_H\n\n#include <assert.h>\n#include <stdint.h>\n\n#define ENABLE_LOCK_ASSERTS 1\n\n#define TIE_BREAKER_ID_EMPTY 255\n#define SEQLOCK_LOCKED 0x1\n#define SEQLOCK_FREE 0x0\n\n#define LOCK_PAUSE() asm volatile(\"mfence\");\n\n#define COMPILER_BARRIER() asm volatile(\"\" ::: \"memory\")\n\n#if !defined(COMPILER_NO_REORDER)\n#define COMPILER_NO_REORDER(exec) \\\n  COMPILER_BARRIER();             \\\n  exec;                           \\\n  COMPILER_BARRIER()\n#endif\n\ntypedef volatile struct {\n  uint8_t tie_breaker_id;\n  uint32_t version;\n} __attribute__((packed)) timestamp_t;\n\ntypedef struct {\n  uint8_t lock;\n  uint32_t version;  /// for lock-free reads\n} __attribute__((packed)) seqlock_t;\n\ntypedef volatile struct {\n  uint8_t lock;\n  timestamp_t\n      ts;  /// ts.version used for both lock-free reads & as part of timestamp\n} __attribute__((packed)) conc_ctrl_t;\n\n/////////////////////////////////////////\n/// Timestamp  comparison  functions\n/////////////////////////////////////////\nstatic inline void\ntimestamp_init(timestamp_t* ts)\n{\n  ts->version = 0;\n  ts->tie_breaker_id = TIE_BREAKER_ID_EMPTY;\n}\n\nstatic inline int\ntimestamp_is_equal(uint32_t v1, uint8_t tie_breaker1, uint32_t v2,\n                   uint8_t tie_breaker2)\n{\n  return (v1 == v2 && tie_breaker1 == tie_breaker2);\n}\n\nstatic inline int\ntimestamp_is_smaller(uint32_t v1, uint8_t tie_breaker1, uint32_t v2,\n                     uint8_t tie_breaker2)\n{\n  return (v1 < v2 || (v1 == v2 && tie_breaker1 < tie_breaker2));\n}\n\n/////////////////////////////////////////\n/// seqlock locking / unlocking functions\n/////////////////////////////////////////\n\nstatic inline void\nseqlock_init(seqlock_t* seqlock)\n{\n  seqlock->version = 0;\n  seqlock->lock = SEQLOCK_FREE;\n}\n\nstatic inline int\nseqlock_lock(seqlock_t* seqlock)\n{\n  do {\n    // Spin until the seqlock is 
unlocked\n    while (seqlock->lock == SEQLOCK_LOCKED) {\n      LOCK_PAUSE();\n    }\n\n    // try to atomically get the lock via a CAS\n    if (__sync_val_compare_and_swap(&seqlock->lock, 0, 1) == 0) {\n      seqlock->version++;\n      break;\n    }\n\n  } while (1);  // retry if CAS failed\n\n  return 1;\n}\n\nstatic inline void\nseqlock_unlock(seqlock_t* seqlock)\n{\n  if (ENABLE_LOCK_ASSERTS) {\n    assert(seqlock->lock == SEQLOCK_LOCKED);\n    assert(seqlock->version % 2 == 1);\n  }\n\n  COMPILER_NO_REORDER(seqlock->version++);\n  COMPILER_NO_REORDER(seqlock->lock = SEQLOCK_FREE);\n}\n\n// This is used to validate a lock-free read\n// i.e. --> do { <Lock free read>  } while\n// (!(seqlock_version_is_same_and_valid(...));\nstatic inline int\nseqlock_version_is_same_and_valid(seqlock_t* seqlock1, seqlock_t* seqlock2)\n{\n  return (seqlock1->version == seqlock2->version && seqlock1->version % 2 == 0);\n}\n\n/////////////////////////////////////////\n/// cctrl locking / unlocking functions\n/////////////////////////////////////////\n\nstatic inline void\ncctrl_init(conc_ctrl_t* cctrl)\n{\n  timestamp_init(&cctrl->ts);\n  cctrl->lock = SEQLOCK_FREE;\n}\n\nstatic inline int\ncctrl_lock(conc_ctrl_t* cctrl)\n{\n  do {\n    // Spin until the seqlock is unlocked\n    while (cctrl->lock == SEQLOCK_LOCKED) {\n      LOCK_PAUSE();\n    }\n\n    // try to atomically get the lock via a CAS\n    if (__sync_val_compare_and_swap(&cctrl->lock, 0, 1) == 0) {\n      cctrl->ts.version++;\n      break;\n    }\n\n  } while (1);  // retry if CAS failed\n\n  return 1;\n}\n\nstatic inline void\ncctrl_unlock_custom_version(conc_ctrl_t* cctrl, uint8_t cid, uint32_t version)\n{\n  if (ENABLE_LOCK_ASSERTS) {\n    assert(cctrl->lock == SEQLOCK_LOCKED);\n    assert(cctrl->ts.version % 2 == 1);\n  }\n\n  cctrl->ts.tie_breaker_id = cid;\n  COMPILER_NO_REORDER(cctrl->ts.version = version);\n  COMPILER_NO_REORDER(cctrl->lock = SEQLOCK_FREE);\n}\n\nstatic inline 
void\ncctrl_unlock_inc_version_by_three(conc_ctrl_t* cctrl, uint8_t cid,\n                                  uint32_t* resp_version)\n{\n  if (ENABLE_LOCK_ASSERTS) {\n    assert(cctrl->lock == SEQLOCK_LOCKED);\n    assert(cctrl->ts.version % 2 == 1);\n  }\n\n  cctrl->ts.tie_breaker_id = cid;\n  COMPILER_NO_REORDER(cctrl->ts.version += 3);\n  COMPILER_NO_REORDER(*resp_version = cctrl->ts.version);\n  COMPILER_NO_REORDER(cctrl->lock = SEQLOCK_FREE);\n}\n\nstatic inline void\ncctrl_unlock_inc_version(conc_ctrl_t* cctrl, uint8_t cid,\n                         uint32_t* resp_version)\n{\n  if (ENABLE_LOCK_ASSERTS) {\n    assert(cctrl->lock == SEQLOCK_LOCKED);\n    assert(cctrl->ts.version % 2 == 1);\n  }\n\n  cctrl->ts.tie_breaker_id = cid;\n  COMPILER_NO_REORDER(*resp_version = ++cctrl->ts.version);\n  COMPILER_NO_REORDER(cctrl->lock = SEQLOCK_FREE);\n}\n\nstatic inline void\ncctrl_unlock_dec_version(conc_ctrl_t* cctrl)\n{\n  if (ENABLE_LOCK_ASSERTS) {\n    assert(cctrl->lock == SEQLOCK_LOCKED);\n    assert(cctrl->ts.version % 2 == 1);\n  }\n\n  // keep same ts.tie_breaker_id\n  COMPILER_NO_REORDER(cctrl->ts.version--);\n  COMPILER_NO_REORDER(cctrl->lock = SEQLOCK_FREE);\n}\n\n// This is used to validate a lock-free read\n// i.e. --> do { <Lock free read>  } while\n// (!(cctrl_timestamp_is_same_and_valid(...));\nstatic inline int\ncctrl_timestamp_is_same_and_valid(volatile conc_ctrl_t* cctrl1,\n                                  volatile conc_ctrl_t* cctrl2)\n{\n  return cctrl1->ts.version % 2 == 0 &&\n         timestamp_is_equal(cctrl1->ts.version, cctrl1->ts.tie_breaker_id,\n                            cctrl2->ts.version, cctrl2->ts.tie_breaker_id);\n}\n\n#endif  // HERMES_SEQLOCK_H\n"
  },
  {
    "path": "include/utils/time_rdtsc.h",
    "content": "\n#ifndef HERMES_TIME_H\n#define HERMES_TIME_H\n#include <assert.h>\n#include <stdint.h> /* for uint64_t */\n#include <stdio.h>\n#include <time.h> /* for struct timespec */\n\n#define ENABLE_STATIC_TICKS_PER_NS 1\n#define RDTSC_TYPICAL_TICKS_PER_NS 2.2\n\ndouble g_ticks_per_ns;\n\n// assembly code to read the TSC\nstatic inline uint64_t\nRDTSC()\n{\n  unsigned int hi, lo;\n  __asm__ volatile(\"rdtsc\" : \"=a\"(lo), \"=d\"(hi));\n  return ((uint64_t)hi << 32) | lo;\n}\n\nstatic const int NANO_SECONDS_IN_SEC = 1000000000;\n// returns a static buffer of struct timespec with the time difference of\n// ts1 and ts2 ts1 is assumed to be greater than ts2\nstatic struct timespec*\ntimespec_diff(struct timespec* ts1, struct timespec* ts2)\n{\n  static struct timespec ts;\n  ts.tv_sec = ts1->tv_sec - ts2->tv_sec;\n  ts.tv_nsec = ts1->tv_nsec - ts2->tv_nsec;\n  if (ts.tv_nsec < 0) {\n    ts.tv_sec--;\n    ts.tv_nsec += NANO_SECONDS_IN_SEC;\n  }\n  return &ts;\n}\n\nstatic void\ncalibrate_ticks()\n{\n  struct timespec begin_ts, end_ts;\n  printf(\"Start RDTSC calibration: patience is a virtue\\n\");\n  clock_gettime(CLOCK_MONOTONIC, &begin_ts);\n  uint64_t begin = RDTSC();\n  // do something CPU intensive\n  for (volatile unsigned long long i = 0; i < 1000000000ULL; ++i)\n    ;\n  uint64_t end = RDTSC();\n  clock_gettime(CLOCK_MONOTONIC, &end_ts);\n  struct timespec* tmp_ts = timespec_diff(&end_ts, &begin_ts);\n  uint64_t ns_elapsed =\n      (uint64_t)(tmp_ts->tv_sec * 1000000000LL + tmp_ts->tv_nsec);\n  g_ticks_per_ns = (double)(end - begin) / (double)ns_elapsed;\n  printf(\"RDTSC calibration is done (ticks_per_ns: %.2f)\\n\", g_ticks_per_ns);\n}\n\n// Call once (it is not thread safe) before using RDTSC, has side effect of\n// binding process to CPU1\nstatic inline void\ninit_rdtsc(uint8_t auto_calibration, double ticks_per_ns)\n{\n  if (auto_calibration > 0)\n    calibrate_ticks();\n  else {\n    assert(ticks_per_ns > 0);\n    g_ticks_per_ns = ticks_per_ns;\n 
 }\n}\n\nstatic inline void\nget_timespec(struct timespec* ts, uint64_t nsecs)\n{\n  ts->tv_sec = nsecs / NANO_SECONDS_IN_SEC;\n  ts->tv_nsec = nsecs % NANO_SECONDS_IN_SEC;\n}\n\n// ts will be filled with time converted from TSC reading\nstatic inline void\nget_rdtsc_timespec(struct timespec* ts)\n{\n  get_timespec(ts, (uint64_t)(RDTSC() / g_ticks_per_ns));\n}\n\nstatic inline double\ntime_elapsed_in_us(struct timespec start)\n{\n  struct timespec now, *diff;\n  get_rdtsc_timespec(&now);\n  diff = timespec_diff(&now, &start);\n  return diff->tv_sec * 1000000 + diff->tv_nsec / 1000;\n}\n\nstatic inline double\ntime_elapsed_in_ms(struct timespec start)\n{\n  struct timespec now, *diff;\n  get_rdtsc_timespec(&now);\n  diff = timespec_diff(&now, &start);\n  return diff->tv_sec * 1000 + diff->tv_nsec / 1000000;\n}\n\nstatic inline double\ntime_elapsed_in_sec(struct timespec start)\n{\n  struct timespec now, *diff;\n  get_rdtsc_timespec(&now);\n  diff = timespec_diff(&now, &start);\n  return diff->tv_sec + diff->tv_nsec / NANO_SECONDS_IN_SEC;\n}\n\n#endif  // HERMES_TIME_H\n"
  },
  {
    "path": "include/wings/wings.h",
    "content": "//\n// Created by akatsarakis on 06/02/19.\n//\n\n#ifndef WINGS_INTERNAL_INLINES_H\n#define WINGS_INTERNAL_INLINES_H\n\n#include \"wings_api.h\"\n/// WARNING!!\n/// \tFunctions starting with underscore (i.e. \"_wings_*\")\n/// \tare internal and should not be called directly\n\nvoid wings_reconfigure_wrs_ah(ud_channel_t* ud_c, uint8_t endpoint_id);\n\n/* --------------------------------------------------------------------------\n--------------------------------- Helper Functions --------------------------\n---------------------------------------------------------------------------*/\nstatic inline void\n_wings_assert_binary(uint8_t var)\n{\n  assert(var == 0 || var == 1);\n}\n\nstatic inline uint16_t\n_wings_ud_recv_max_pkt_size(ud_channel_t* ud_c)\n{\n  if (WINGS_ENABLE_ASSERTIONS)\n    assert(ud_c->type != CRD && ud_c->is_header_only == 0);\n  // TODO add assertion that this must be smaller than max_MTU\n  assert(ud_c->max_msg_size > 0 && ud_c->max_coalescing > 0);\n  return sizeof(wings_ud_recv_pkt_t) +\n         ud_c->max_msg_size * ud_c->max_coalescing;\n}\n\nstatic inline uint16_t\n_wings_ud_send_max_pkt_size(ud_channel_t* ud_c)\n{\n  if (WINGS_ENABLE_ASSERTIONS)\n    assert(ud_c->type != CRD && ud_c->is_header_only == 0);\n  // TODO add assertion that this must be smaller than max_MTU\n  assert(ud_c->max_msg_size > 0 && ud_c->max_coalescing > 0);\n  return sizeof(wings_ud_send_pkt_t) +\n         ud_c->max_msg_size * ud_c->max_coalescing;\n}\n\nstatic inline void\n_wings_assertions(ud_channel_t* ud_channel)\n{\n  _wings_assert_binary(ud_channel->expl_crd_ctrl);\n  _wings_assert_binary(ud_channel->is_bcast_channel);\n  _wings_assert_binary(ud_channel->is_inlining_enabled);\n\n  assert(ud_channel->num_channels > 1);\n  assert(ud_channel->max_msg_size > 0);\n  assert(ud_channel->max_coalescing > 0);\n  assert(_wings_ud_send_max_pkt_size(ud_channel) < MAX_MTU_SIZE);\n  assert(ud_channel->send_q_depth > 0 || ud_channel->recv_q_depth > 0);\n  
assert(ud_channel->channel_providing_crds != NULL ||\n         ud_channel->disable_crd_ctrl);\n}\n\nstatic inline uint8_t*\n_wings_get_n_msg_ptr_from_send_pkt(ud_channel_t* ud_c, wings_ud_send_pkt_t* pkt,\n                                   uint8_t n)\n{\n  if (WINGS_ENABLE_ASSERTIONS)\n    assert(ud_c->type != CRD && ud_c->is_header_only == 0);\n  assert(ud_c->max_coalescing > n && pkt->req_num >= n);\n  //    return &pkt->reqs[n * ud_c->max_msg_size];\n  return &pkt->reqs[n * ud_c->small_msg_size];\n}\n\nstatic inline uint8_t*\n_wings_get_n_msg_ptr_from_recv_pkt(ud_channel_t* ud_c,\n                                   wings_ud_recv_pkt_t* recv_pkt, uint8_t n)\n{\n  if (WINGS_ENABLE_ASSERTIONS)\n    assert(ud_c->type != CRD && ud_c->is_header_only == 0);\n  return _wings_get_n_msg_ptr_from_send_pkt(ud_c, &recv_pkt->pkt, n);\n}\n\nstatic inline wings_ud_send_pkt_t*\n_wings_get_nth_pkt_ptr_from_send_buff(ud_channel_t* ud_c, uint16_t n)\n{\n  if (WINGS_ENABLE_ASSERTIONS)\n    assert(ud_c->type != CRD && ud_c->is_header_only == 0);\n  return (wings_ud_send_pkt_t*)&(\n      (uint8_t*)ud_c->send_pkt_buff)[n * _wings_ud_send_max_pkt_size(ud_c)];\n}\n\nstatic inline wings_ud_recv_pkt_t*\n_wings_get_nth_pkt_ptr_from_recv_buff(ud_channel_t* ud_c, uint16_t n)\n{\n  if (WINGS_ENABLE_ASSERTIONS)\n    assert(ud_c->type != CRD && ud_c->is_header_only == 0);\n  return (wings_ud_recv_pkt_t*)&ud_c\n      ->recv_pkt_buff[n * _wings_ud_recv_max_pkt_size(ud_c)];\n}\n\nstatic inline wings_ud_send_pkt_t*\n_wings_curr_send_pkt_ptr(ud_channel_t* ud_c)\n{\n  if (WINGS_ENABLE_ASSERTIONS)\n    assert(ud_c->type != CRD && ud_c->is_header_only == 0);\n  return _wings_get_nth_pkt_ptr_from_send_buff(ud_c,\n                                               (uint16_t)ud_c->send_push_ptr);\n}\n\nstatic inline void\n_wings_inc_send_push_ptr(ud_channel_t* ud_c)\n{\n  if (ud_c->is_header_only) return;\n\n  if (WINGS_ENABLE_ASSERTIONS)\n    assert(ud_c->type != CRD && ud_c->is_header_only == 0);\n\n  if 
(ud_c->is_bcast_channel)\n    WINGS_MOD_ADD(ud_c->send_push_ptr,\n                  ud_c->send_pkt_buff_len);  // TODO change this to deal with\n                                             // failures; see comment below\n  //      WINGS_MOD_ADD(*inv_push_ptr, INV_SEND_OPS_SIZE / MAX_REMOTE_MACHINES *\n  //                               last_g_membership.num_of_alive_remotes);\n  //                               //go to the next \"packet\" + dealing with\n  //                               failures\n  else\n    WINGS_MOD_ADD(ud_c->send_push_ptr, ud_c->send_pkt_buff_len);\n  _wings_curr_send_pkt_ptr(ud_c)->req_num =\n      0;  // Reset data left from previous unicasts / bcasts\n}\n\nstatic inline void\n_wings_inc_recv_push_ptr(ud_channel_t* ud_c)\n{\n  if (WINGS_ENABLE_ASSERTIONS)\n    assert(ud_c->type != CRD && ud_c->is_header_only == 0);\n  WINGS_MOD_ADD(ud_c->recv_push_ptr, ud_c->recv_q_depth);\n}\n\nstatic inline void\n_wings_inc_recv_pull_ptr(ud_channel_t* ud_c)\n{\n  if (WINGS_ENABLE_ASSERTIONS)\n    assert(ud_c->type != CRD && ud_c->is_header_only == 0);\n  WINGS_MOD_ADD(ud_c->recv_pull_ptr, ud_c->recv_pkt_buff_len);\n}\n\n/* ---------------------------------------------------------------------------\n----------------------------------- RECVs ------------------------------------\n---------------------------------------------------------------------------*/\nstatic inline void\n_wings_post_hdr_only_recvs(ud_channel_t* ud_c, uint16_t num_recvs)\n{\n  if (WINGS_ENABLE_ASSERTIONS)\n    assert(ud_c->is_header_only || ud_c->type == CRD);\n\n  struct ibv_recv_wr* bad_recv_wr;\n  for (uint16_t i = 0; i < num_recvs; ++i)\n    ud_c->recv_wr[i].next = (i == num_recvs - 1) ? 
NULL : &ud_c->recv_wr[i + 1];\n\n  int ret = ibv_post_recv(ud_c->qp, ud_c->recv_wr, &bad_recv_wr);\n  CPE(ret, \"ibv_post_recv error: posting recvs for credits\", ret);\n}\n\nstatic inline void\n_wings_post_recvs(ud_channel_t* ud_c, uint16_t num_of_receives)\n{\n  if (WINGS_ENABLE_ASSERTIONS)\n    assert(ud_c->type != CRD && ud_c->is_header_only == 0);\n\n  void* next_buff_addr;\n\n  if (WINGS_ENABLE_ASSERTIONS) assert(num_of_receives <= ud_c->max_recv_wrs);\n\n  int req_size = _wings_ud_recv_max_pkt_size(ud_c);\n  for (int i = 0; i < num_of_receives; ++i) {\n    next_buff_addr =\n        (void*)(ud_c->recv_pkt_buff) + (ud_c->recv_push_ptr * req_size);\n    // TODO optimize by resetting only the req_num of wings_recv_pkt\n    memset(next_buff_addr, 0,\n           (size_t)req_size);  // reset the buffer before posting the receive\n\n    if (WINGS_ENABLE_BATCH_POST_RECVS_TO_NIC)\n      ud_c->recv_wr[i].sg_list->addr = (uintptr_t)next_buff_addr;\n    else\n      assert(0);\n    //\t\t\thrd_post_dgram_recv(ud_c->qp, next_buff_addr, req_size,\n    // cb->dgram_buf_mr->lkey);\n\n    _wings_inc_recv_push_ptr(ud_c);\n  }\n\n  if (WINGS_ENABLE_BATCH_POST_RECVS_TO_NIC) {\n    ud_c->recv_wr[num_of_receives - 1].next = NULL;\n    if (WINGS_ENABLE_ASSERTIONS) {\n      for (int i = 0; i < num_of_receives; i++) {\n        assert(ud_c->recv_wr[i].num_sge == 1);\n        assert(ud_c->recv_wr[i].sg_list->length == req_size);\n        // TODO add\n        //\t\t\t\tassert(ud_c->recv_wr[i].sg_list->lkey ==\n        // cb->dgram_buf_mr->lkey);\n        assert(i == num_of_receives - 1 ||\n               ud_c->recv_wr[i].next == &ud_c->recv_wr[i + 1]);\n      }\n      assert(ud_c->recv_wr[num_of_receives - 1].next == NULL);\n    }\n\n    struct ibv_recv_wr* bad_recv_wr;\n    int ret = ibv_post_recv(ud_c->qp, ud_c->recv_wr, &bad_recv_wr);\n    CPE(ret, \"ibv_post_recv error: while posting recvs\", ret);\n\n    // restore the next ptr of the last posted wr (re-link it to the\n    // following wr, or NULL at the end of the wr array)\n    ud_c->recv_wr[num_of_receives - 
1].next =\n        (ud_c->max_recv_wrs == num_of_receives - 1)\n            ? NULL\n            : &ud_c->recv_wr[num_of_receives];\n  }\n}\n\nstatic inline void\n_wings_poll_crds_and_post_recvs(ud_channel_t* ud_c)\n{\n  if (WINGS_ENABLE_ASSERTIONS) assert(ud_c->type == CRD);\n\n  int crd_pkts_found =\n      ibv_poll_cq(ud_c->recv_cq, ud_c->max_recv_wrs, ud_c->recv_wc);\n\n  if (crd_pkts_found > 0) {\n    if (unlikely(ud_c->recv_wc[crd_pkts_found - 1].status != 0)) {\n      fprintf(stderr,\n              \"Bad wc status when polling for credits to send a broadcast %d\\n\",\n              ud_c->recv_wc[crd_pkts_found - 1].status);\n      exit(0);\n    }\n\n    if (ud_c->enable_stats) ud_c->stats.recv_total_pkts += crd_pkts_found;\n\n    if (WINGS_ENABLE_RECV_PRINTS && ud_c->enable_prints)\n      colored_printf(GREEN, \"^^^ Polled reqs: %s  %d, (total: %d)!\\n\",\n                     ud_c->qp_name, crd_pkts_found,\n                     ud_c->stats.recv_total_pkts);\n\n    for (int i = 0; i < crd_pkts_found; i++) {\n      wings_crd_t* crd_ptr = (wings_crd_t*)&ud_c->recv_wc[i].imm_data;\n\n      if (ud_c->enable_stats) ud_c->stats.recv_total_msgs += crd_ptr->crd_num;\n      ud_c->channel_providing_crds->credits_per_channels[crd_ptr->sender_id] +=\n          crd_ptr->crd_num;\n\n      if (WINGS_ENABLE_ASSERTIONS)\n        assert(ud_c->channel_providing_crds->num_crds_per_channel >=\n               ud_c->channel_providing_crds\n                   ->credits_per_channels[crd_ptr->sender_id]);\n\n      if (WINGS_ENABLE_CREDIT_PRINTS && ud_c->enable_prints)\n        printf(\n            \"$$$ Credits: %s \\033[1m\\033[32mincremented\\033[0m to %d (for \"\n            \"endpoint %d)\\n\",\n            ud_c->channel_providing_crds->qp_name,\n            ud_c->channel_providing_crds\n                ->credits_per_channels[crd_ptr->sender_id],\n            crd_ptr->sender_id);\n    }\n\n    if (WINGS_ENABLE_POST_RECV_PRINTS && ud_c->enable_prints)\n      colored_printf(YELLOW, 
\"vvv Post Receives: %s %d\\n\", ud_c->qp_name,\n                     crd_pkts_found);\n\n    _wings_post_hdr_only_recvs(ud_c, (uint16_t)crd_pkts_found);\n\n  } else if (unlikely(crd_pkts_found < 0)) {\n    printf(\"ERROR In the credit CQ\\n\");\n    exit(0);\n  }\n}\n\nstatic inline void\n_wings_enque_to_overflown_msgs(ud_channel_t* ud_c, uint8_t* msg_ptr)\n{\n  if (WINGS_ENABLE_ASSERTIONS) {\n    assert(ud_c->is_header_only == 0);\n    assert(ud_c->enable_overflow_msgs);\n    assert(ud_c->num_overflow_msgs < ud_c->max_coalescing);\n  }\n\n  uint8_t* dst_ptr =\n      &ud_c->overflow_msg_buff[ud_c->num_overflow_msgs * ud_c->max_msg_size];\n\n  memcpy(dst_ptr, msg_ptr, ud_c->max_msg_size);\n  ud_c->num_overflow_msgs++;\n}\n\nstatic inline uint16_t\n_wings_deque_from_overflown_msgs(ud_channel_t* ud_c, uint16_t max_msgs_to_poll,\n                                 uint8_t* recv_ops)\n{\n  if (WINGS_ENABLE_ASSERTIONS) assert(ud_c->is_header_only == 0);\n\n  uint8_t msgs_to_copy = (uint8_t)(ud_c->num_overflow_msgs <= max_msgs_to_poll\n                                       ? 
ud_c->num_overflow_msgs\n                                       : max_msgs_to_poll);\n\n  if (ud_c->num_overflow_msgs > 0) {\n    ud_c->num_overflow_msgs -= msgs_to_copy;\n\n    // Copy msgs from overflow_buff to recv_ops\n    memcpy(recv_ops, ud_c->overflow_msg_buff,\n           msgs_to_copy * ud_c->max_msg_size);\n\n    if (msgs_to_copy == max_msgs_to_poll)\n      // Move rest of overflown msgs to the top of the (FIFO) buffer\n      for (int i = 0; i < ud_c->num_overflow_msgs; ++i) {\n        uint8_t* dst_ptr = &ud_c->overflow_msg_buff[ud_c->max_msg_size * i];\n        uint8_t* src_ptr =\n            &ud_c->overflow_msg_buff[ud_c->max_msg_size * (i + msgs_to_copy)];\n        memcpy(dst_ptr, src_ptr, ud_c->max_msg_size);\n      }\n  }\n\n  return msgs_to_copy;\n}\n\nstatic inline uint16_t\nwings_poll_buff_and_post_recvs(ud_channel_t* ud_c, uint16_t max_msgs_to_poll,\n                               uint8_t* recv_ops)\n{\n  if (WINGS_ENABLE_ASSERTIONS) assert(ud_c->type != CRD);\n\n  int index = 0;\n  uint8_t sender = 0;\n  uint16_t msgs_polled = 0;\n  uint8_t *next_packet_reqs, *recv_op_ptr, *next_req, *next_packet_req_num_ptr;\n\n  uint16_t dequed_msgs = 0;\n  uint16_t remaining_msgs_to_poll = max_msgs_to_poll;\n\n  if (max_msgs_to_poll < 1) return 0;\n\n  if (ud_c->enable_overflow_msgs) {\n    dequed_msgs =\n        _wings_deque_from_overflown_msgs(ud_c, max_msgs_to_poll, recv_ops);\n\n    if (max_msgs_to_poll == dequed_msgs) return max_msgs_to_poll;\n\n    recv_ops = &recv_ops[dequed_msgs * ud_c->max_msg_size];\n    remaining_msgs_to_poll -= dequed_msgs;\n  }\n\n  uint16_t max_pkts_to_poll =\n      (uint16_t)((remaining_msgs_to_poll / ud_c->max_coalescing) +\n                 (ud_c->enable_overflow_msgs ? 
1 : 0));\n\n  // poll completion q\n  uint16_t pkts_polled =\n      (uint16_t)ibv_poll_cq(ud_c->recv_cq, max_pkts_to_poll, ud_c->recv_wc);\n\n  for (int i = 0; i < pkts_polled; ++i) {\n    if (ud_c->is_header_only) {\n      recv_op_ptr = &recv_ops[i * ud_c->max_msg_size];\n      memcpy(recv_op_ptr, &ud_c->recv_wc[i].imm_data, ud_c->max_msg_size);\n\n      msgs_polled++;\n\n      sender = ((wings_hdr_only_t*)&ud_c->recv_wc[i].imm_data)->sender_id;\n      if (!ud_c->disable_crd_ctrl)\n        ud_c->channel_providing_crds\n            ->credits_per_channels[sender]++;  // increment packet credits\n\n    } else {\n      uint16_t max_req_size = _wings_ud_recv_max_pkt_size(ud_c);\n      index = (ud_c->recv_pull_ptr + 1) % ud_c->recv_q_depth;\n      wings_ud_recv_pkt_t* next_packet =\n          (wings_ud_recv_pkt_t*)&ud_c->recv_pkt_buff[index * max_req_size];\n\n      sender = next_packet->pkt.sender_id;\n      next_packet_reqs = next_packet->pkt.reqs;\n      next_packet_req_num_ptr = &next_packet->pkt.req_num;\n\n      if (WINGS_ENABLE_ASSERTIONS)\n        assert(next_packet->pkt.req_num > 0 &&\n               next_packet->pkt.req_num <= ud_c->max_coalescing);\n\n      // TODO add membership and functionality\n      //        if(node_is_in_membership(last_group_membership, sender))\n\n      uint16_t msg_size = next_packet->pkt.only_small_msgs == 1\n                              ? 
ud_c->small_msg_size\n                              : ud_c->max_msg_size;\n      for (int j = 0; j < next_packet->pkt.req_num; ++j) {\n        next_req = &next_packet_reqs[j * msg_size];\n\n        if (msgs_polled >= remaining_msgs_to_poll)\n          _wings_enque_to_overflown_msgs(ud_c, next_req);\n        else {\n          recv_op_ptr = &recv_ops[msgs_polled * ud_c->max_msg_size];\n          memcpy(recv_op_ptr, next_req, msg_size);\n        }\n\n        msgs_polled++;\n        if (!ud_c->disable_crd_ctrl)\n          ud_c->channel_providing_crds\n              ->credits_per_channels[sender]++;  // increment packet credits\n      }\n\n      *next_packet_req_num_ptr =\n          0;  // TODO can be removed since we already reset on posting receives\n      _wings_inc_recv_pull_ptr(ud_c);\n    }\n\n    if (WINGS_ENABLE_ASSERTIONS)\n      if (!ud_c->disable_crd_ctrl)\n        assert(ud_c->channel_providing_crds->credits_per_channels[sender] <=\n               ud_c->channel_providing_crds->num_crds_per_channel);\n  }\n\n  if (pkts_polled > 0) {\n    // Refill recvs\n    if (ud_c->is_header_only)\n      _wings_post_hdr_only_recvs(ud_c, pkts_polled);\n    else\n      _wings_post_recvs(ud_c, pkts_polled);\n\n    if (WINGS_ENABLE_STAT_COUNTING) {\n      ud_c->stats.recv_total_msgs += msgs_polled;\n      ud_c->stats.recv_total_pkts += pkts_polled;\n    }\n\n    if (WINGS_ENABLE_RECV_PRINTS && ud_c->enable_prints)\n      colored_printf(\n          GREEN,\n          \"^^^ Polled msgs: %d packets %s %d, (total pkts: %d, msgs %d)!\\n\",\n          pkts_polled, ud_c->qp_name, msgs_polled, ud_c->stats.recv_total_pkts,\n          ud_c->stats.recv_total_msgs);\n    if (WINGS_ENABLE_CREDIT_PRINTS && ud_c->enable_prints &&\n        !ud_c->disable_crd_ctrl)\n      printf(\n          \"$$$ Credits: %s \\033[1m\\033[32mincremented\\033[0m to %d (for \"\n          \"machine %d)\\n\",\n          ud_c->channel_providing_crds->qp_name,\n          
ud_c->channel_providing_crds->credits_per_channels[sender], sender);\n    if (WINGS_ENABLE_POST_RECV_PRINTS && ud_c->enable_prints)\n      colored_printf(YELLOW, \"vvv Post Receives: %s %d\\n\", ud_c->qp_name,\n                     pkts_polled);\n\n    if (WINGS_ENABLE_ASSERTIONS)\n      assert(ud_c->max_coalescing != 1 || pkts_polled == msgs_polled);\n  }\n\n  return msgs_polled + dequed_msgs >= max_msgs_to_poll\n             ? max_msgs_to_poll\n             : msgs_polled + dequed_msgs;\n}\n\n/* ---------------------------------------------------------------------------\n----------------------------------- CREDITS ----------------------------------\n---------------------------------------------------------------------------*/\nstatic inline uint8_t\n_wings_node_is_in_membership(uint8_t node_id, bit_vector_t membership)\n{\n  if (WINGS_ENABLE_ASSERTIONS) assert(node_id < 8);\n\n  return bv_bit_get(membership, node_id) == 1 ? 1 : 0;\n}\n\n// For all the CREDIT functions --> if its a bcast channel endpoint_id is\n// ignored\nstatic inline uint8_t\n_wings_has_sufficient_crds_no_polling_membership(ud_channel_t* ud_c,\n                                                 uint8_t endpoint_id,\n                                                 bit_vector_t* membership)\n{\n  uint8_t check_membership = membership == NULL ? 
0 : 1;\n\n  if (ud_c->disable_crd_ctrl)\n    return 1;\n\n  else if (!ud_c->is_bcast_channel)\n    return (uint8_t)(ud_c->credits_per_channels[endpoint_id] > 0);\n\n  else\n    for (int i = 0; i < ud_c->num_channels; ++i) {\n      if (i == ud_c->channel_id) continue;\n      if (check_membership == 1 &&\n          !_wings_node_is_in_membership(i, *membership))\n        continue;  // skip machine if not in membership\n      if (ud_c->credits_per_channels[i] <= 0) return 0;\n    }\n\n  return 1;\n}\n\n// For all the CREDIT functions --> if its a bcast channel endpoint_id is\n// ignored\nstatic inline uint8_t\n_wings_has_sufficient_crds_no_polling(ud_channel_t* ud_c, uint8_t endpoint_id)\n{\n  return _wings_has_sufficient_crds_no_polling_membership(ud_c, endpoint_id,\n                                                          NULL);\n}\n\nstatic inline uint8_t\n_wings_has_sufficient_crds_membership(ud_channel_t* ud_c, uint8_t endpoint_id,\n                                      bit_vector_t* membership)\n{\n  if (_wings_has_sufficient_crds_no_polling_membership(ud_c, endpoint_id,\n                                                       membership))\n    return 1;\n\n  if (ud_c->expl_crd_ctrl) {\n    _wings_poll_crds_and_post_recvs(ud_c->channel_providing_crds);\n\n    if (_wings_has_sufficient_crds_no_polling_membership(ud_c, endpoint_id,\n                                                         membership))\n      return 1;\n  }\n  return 0;\n}\n\nstatic inline uint8_t\n_wings_has_sufficient_crds(ud_channel_t* ud_c, uint8_t endpoint_id)\n{\n  if (_wings_has_sufficient_crds_no_polling(ud_c, endpoint_id)) return 1;\n\n  if (ud_c->expl_crd_ctrl) {\n    _wings_poll_crds_and_post_recvs(ud_c->channel_providing_crds);\n\n    if (_wings_has_sufficient_crds_no_polling(ud_c, endpoint_id)) return 1;\n  }\n  return 0;\n}\n\nstatic inline void\n_wings_dec_crds_membership(ud_channel_t* ud_c, uint8_t endpoint_id,\n                           bit_vector_t* membership)\n{\n  uint8_t 
check_membership = membership == NULL ? 0 : 1;\n\n  if (ud_c->disable_crd_ctrl) return;\n\n  if (WINGS_ENABLE_ASSERTIONS)\n    assert(_wings_has_sufficient_crds_no_polling_membership(ud_c, endpoint_id,\n                                                            membership));\n\n  if (!ud_c->is_bcast_channel)\n    ud_c->credits_per_channels[endpoint_id]--;\n  else\n    for (int i = 0; i < ud_c->num_channels; ++i) {\n      if (i == ud_c->channel_id) continue;\n      if (check_membership == 1 &&\n          !_wings_node_is_in_membership(i, *membership))\n        continue;  // skip machine if not in membership\n      ud_c->credits_per_channels[i]--;\n    }\n\n  if (WINGS_ENABLE_CREDIT_PRINTS && ud_c->enable_prints) {\n    if (ud_c->is_bcast_channel)\n      endpoint_id = (uint8_t)(ud_c->channel_id == 0 ? 1 : 0);\n\n    printf(\"$$$ Credits: %s \\033[31mdecremented\\033[0m to %d\", ud_c->qp_name,\n           ud_c->credits_per_channels[endpoint_id]);\n\n    if (ud_c->is_bcast_channel)\n      printf(\" (all endpoints)\\n\");\n    else\n      printf(\" (for endpoint %d)\\n\", endpoint_id);\n  }\n}\n\nstatic inline void\n_wings_dec_crds(ud_channel_t* ud_c, uint8_t endpoint_id)\n{\n  _wings_dec_crds_membership(ud_c, endpoint_id, NULL);\n}\n\nstatic inline void\nwings_reset_credits(ud_channel_t* ud_c, uint8_t endpoint_id)\n{\n  ud_c->credits_per_channels[endpoint_id] =\n      (uint16_t)ud_c->channel_providing_crds->num_crds_per_channel;\n}\n\n/* ---------------------------------------------------------------------------\n----------------------------------- SENDs ------------------------------------\n---------------------------------------------------------------------------*/\nstatic inline void\n_wings_forge_crd_wr(ud_channel_t* ud_c, uint16_t dst_qp_id,\n                    uint16_t crd_pkts_to_send, uint16_t crd_to_send)\n{\n  if (WINGS_ENABLE_ASSERTIONS) assert(ud_c->type == CRD);\n\n  ud_c->send_wr[crd_pkts_to_send].send_flags = IBV_SEND_INLINE;\n  
ud_c->send_wr[crd_pkts_to_send].wr.ud.ah = ud_c->remote_qps[dst_qp_id].ah;\n  ud_c->send_wr[crd_pkts_to_send].wr.ud.remote_qpn =\n      ud_c->remote_qps[dst_qp_id].qpn;\n\n  ((wings_crd_t*)&ud_c->send_wr[crd_pkts_to_send].imm_data)->crd_num =\n      crd_to_send;\n\n  if (ud_c->enable_stats) ud_c->stats.send_total_msgs += crd_to_send;\n\n  if (crd_pkts_to_send > 0)\n    ud_c->send_wr[crd_pkts_to_send - 1].next = &ud_c->send_wr[crd_pkts_to_send];\n\n  // Selective Signaling --> Do a Signaled Send every ss_granularity pkts\n  if (ud_c->total_pkts_send % ud_c->ss_granularity == 0) {\n    // if not the first SS --> poll the previous SS completion\n    if (ud_c->total_pkts_send > 0) {\n      struct ibv_wc signal_send_wc;\n      hrd_poll_cq(ud_c->send_cq, 1, &signal_send_wc);\n\n      if (ud_c->enable_stats) ud_c->stats.ss_completions++;\n\n      if (WINGS_ENABLE_SS_PRINTS && ud_c->enable_prints)\n        colored_printf(RED, \"^^^ Polled SS completion: %s %d (total %d)\\n\",\n                       ud_c->qp_name, 1, ud_c->stats.ss_completions);\n    }\n\n    ud_c->send_wr[crd_pkts_to_send].send_flags |= IBV_SEND_SIGNALED;\n    if (WINGS_ENABLE_SS_PRINTS && ud_c->enable_prints)\n      colored_printf(RED, \"vvv Send SS: %s\\n\", ud_c->qp_name);\n  }\n  ud_c->total_pkts_send++;\n}\n\nstatic inline void\n_wings_forge_wr(ud_channel_t* ud_c, uint8_t dst_qp_id, uint8_t* req_to_copy,\n                uint16_t pkts_in_batch, uint16_t* msgs_in_batch,\n                copy_and_modify_input_elem_t copy_and_modify_elem,\n                uint8_t is_small_msg)\n// dst_qp_id is ignored if its a bcast channel\n{\n  struct ibv_wc signal_send_wc;\n\n  uint8_t curr_req_num = 1;\n  uint8_t* next_req_ptr;\n\n  if (ud_c->is_header_only)\n    next_req_ptr = ((wings_hdr_only_t*)&ud_c->send_wr[pkts_in_batch].imm_data)\n                       ->inlined_payload;\n  else {\n    wings_ud_send_pkt_t* curr_pkt_ptr = _wings_curr_send_pkt_ptr(ud_c);\n    next_req_ptr = 
_wings_get_n_msg_ptr_from_send_pkt(ud_c, curr_pkt_ptr,\n                                                      curr_pkt_ptr->req_num);\n    curr_req_num = ++curr_pkt_ptr->req_num;\n    curr_pkt_ptr->sender_id = ud_c->channel_id;\n    uint16_t msg_size =\n        is_small_msg == 1 ? ud_c->small_msg_size : ud_c->max_msg_size;\n    ud_c->send_sgl[pkts_in_batch].length =\n        sizeof(wings_ud_send_pkt_t) +\n        //                                               ud_c->max_msg_size *\n        //                                               curr_pkt_ptr->req_num;\n        msg_size * curr_pkt_ptr->req_num;\n    if (WINGS_ENABLE_ASSERTIONS)\n      assert(is_small_msg == 1 ||\n             curr_req_num == 1);  // we only do coalescing for small msgs\n\n    if (curr_req_num == 1) {\n      ud_c->send_sgl[pkts_in_batch].addr = (uint64_t)curr_pkt_ptr;\n#if WINGS_ENABLE_TWO_MSG_SIZES == 1\n      curr_pkt_ptr->only_small_msgs = is_small_msg == 1 ? 1 : 0;\n#endif\n    }\n  }\n\n  //<Copy & modify elem!> --> callback func that copies and manipulates data\n  // from req_to_copy buff\n  copy_and_modify_elem(next_req_ptr, req_to_copy);\n\n  if (WINGS_ENABLE_ASSERTIONS) {\n    assert(dst_qp_id != machine_id || ud_c->is_bcast_channel);\n    assert(curr_req_num <= ud_c->max_coalescing);\n  }\n\n  if (ud_c->enable_stats) ud_c->stats.send_total_msgs++;\n\n  if (curr_req_num == 1) {\n    if (!ud_c->is_bcast_channel) {  // set the dst qp\n      ud_c->send_wr[pkts_in_batch].wr.ud.ah = ud_c->remote_qps[dst_qp_id].ah;\n      ud_c->send_wr[pkts_in_batch].wr.ud.remote_qpn =\n          ud_c->remote_qps[dst_qp_id].qpn;\n    }\n\n    uint16_t wr_idx =\n        (uint16_t)(pkts_in_batch *\n                   (ud_c->is_bcast_channel ? ud_c->num_channels - 1 : 1));\n    ud_c->send_wr[wr_idx].send_flags =\n        ud_c->is_inlining_enabled ? 
IBV_SEND_INLINE : 0;\n\n    if (wr_idx > 0)  // set previous send_wr to point to curr\n      ud_c->send_wr[wr_idx - 1].next = &ud_c->send_wr[wr_idx];\n\n    // Selective Signaling --> Do a Signaled Send every ss_granularity pkts\n    if (ud_c->total_pkts_send % ud_c->ss_granularity == 0) {\n      // if not the first SS --> poll the previous SS completion\n      if (ud_c->total_pkts_send > 0) {\n        hrd_poll_cq(ud_c->send_cq, 1, &signal_send_wc);\n\n        if (ud_c->enable_stats) ud_c->stats.ss_completions++;\n\n        if (WINGS_ENABLE_SS_PRINTS && ud_c->enable_prints)\n          colored_printf(RED, \"^^^ Polled SS completion: %s %d (total %d)\\n\",\n                         ud_c->qp_name, 1, ud_c->stats.ss_completions);\n      }\n\n      ud_c->send_wr[wr_idx].send_flags |= IBV_SEND_SIGNALED;\n      if (WINGS_ENABLE_SS_PRINTS && ud_c->enable_prints)\n        colored_printf(RED, \"vvv Send SS: %s\\n\", ud_c->qp_name);\n    }\n    ud_c->total_pkts_send++;\n  }\n\n  (*msgs_in_batch)++;\n}\n\nstatic inline void\n_wings_batch_pkts_2_NIC(ud_channel_t* ud_c, uint16_t pkts_in_batch,\n                        uint16_t msgs_in_batch)\n{\n  int ret;\n  struct ibv_send_wr* bad_send_wr;\n\n  if (ud_c->enable_stats) ud_c->stats.send_total_pkts += pkts_in_batch;\n\n  uint16_t remote_channels = (uint16_t)(ud_c->num_channels - 1);\n  uint16_t wr_idx = (uint16_t)(pkts_in_batch *\n                               (ud_c->is_bcast_channel ? 
remote_channels : 1));\n  ud_c->send_wr[wr_idx - 1].next = NULL;\n\n  if (WINGS_ENABLE_ASSERTIONS) {\n    assert(pkts_in_batch <= ud_c->max_send_wrs);\n    assert(pkts_in_batch <= ud_c->send_pkt_buff_len);\n    assert(ud_c->type == CRD || ud_c->max_coalescing > 1 ||\n           msgs_in_batch == pkts_in_batch);\n    assert(ud_c->type == CRD || ud_c->max_coalescing > 1 ||\n           ud_c->stats.send_total_msgs == ud_c->stats.send_total_pkts);\n\n    assert(ud_c->send_wr[wr_idx - 1].next == NULL);\n    for (int i = 0; i < wr_idx; ++i) {\n      uint16_t sgl_idx =\n          (uint16_t)(i / (ud_c->is_bcast_channel ? remote_channels : 1));\n\n      if (ud_c->type != CRD && !ud_c->is_header_only) {\n        assert(ud_c->send_wr[i].num_sge == 1);\n        assert(ud_c->send_wr[i].opcode == IBV_WR_SEND);\n        assert(ud_c->send_wr[i].sg_list == &ud_c->send_sgl[sgl_idx]);\n\n        wings_ud_send_pkt_t* curr_send_pkt =\n            (wings_ud_send_pkt_t*)ud_c->send_sgl[sgl_idx].addr;\n        assert(curr_send_pkt->req_num > 0);\n      } else {\n        assert(ud_c->send_wr[i].num_sge == 0);\n        assert(ud_c->send_wr[i].sg_list->length == 0);\n        assert(ud_c->send_wr[i].opcode == IBV_WR_SEND_WITH_IMM);\n        if (ud_c->type == CRD) {\n          assert(((wings_crd_t*)&(ud_c->send_wr[i].imm_data))->crd_num > 0);\n          assert(((wings_crd_t*)&(ud_c->send_wr[i].imm_data))->sender_id ==\n                 ud_c->channel_id);\n        } else\n          assert(((wings_hdr_only_t*)&(ud_c->send_wr[i].imm_data))->sender_id ==\n                 ud_c->channel_id);\n      }\n\n      assert(ud_c->send_wr[i].wr.ud.remote_qkey == HRD_DEFAULT_QKEY);\n      assert(i == wr_idx - 1 || ud_c->send_wr[i].next == &ud_c->send_wr[i + 1]);\n      assert(!ud_c->is_inlining_enabled ||\n             ud_c->send_wr[i].send_flags == IBV_SEND_INLINE ||\n             ud_c->send_wr[i].send_flags ==\n                 (IBV_SEND_INLINE | IBV_SEND_SIGNALED));\n    }\n  }\n\n  if 
(WINGS_ENABLE_SEND_PRINTS &&\n      ud_c->enable_prints)  // TODO make this work w/ bcasts\n    colored_printf(CYAN,\n                   \">>> Send: %d packets %s %d (Total packets: %d, msgs: %d)\\n\",\n                   pkts_in_batch, ud_c->qp_name, msgs_in_batch,\n                   ud_c->stats.send_total_pkts, ud_c->stats.send_total_msgs);\n\n  ret = ibv_post_send(ud_c->qp, ud_c->send_wr, &bad_send_wr);\n  CPE(ret, \"ibv_post_send error while sending msgs to the NIC\", ret);\n}\n\nstatic inline void\n_wings_check_if_batch_n_inc_pkt_ptr(ud_channel_t* ud_c,\n                                    uint16_t* pkts_in_batch_ptr,\n                                    uint16_t* msgs_in_batch_ptr)\n{\n  (*pkts_in_batch_ptr)++;\n  uint16_t send_pkts = *pkts_in_batch_ptr;\n  uint16_t total_msgs_in_batch = *msgs_in_batch_ptr;\n  uint16_t max_pkt_batch =\n      ud_c->is_bcast_channel ? ud_c->max_pcie_bcast_batch : ud_c->max_send_wrs;\n\n  if (send_pkts == max_pkt_batch) {\n    _wings_batch_pkts_2_NIC(ud_c, send_pkts, total_msgs_in_batch);\n    *pkts_in_batch_ptr = 0;\n    *msgs_in_batch_ptr = 0;\n  }\n\n  _wings_inc_send_push_ptr(ud_c);  // go to the next pkt\n}\n\nstatic inline uint8_t\nwings_set_sender_id_n_msg_type(uint8_t sender_id, uint8_t is_small_msg)\n{\n  if (WINGS_ENABLE_ASSERTIONS) {\n    assert(sender_id < 128);\n    assert(is_small_msg == 0 || is_small_msg == 1);\n  }\n  return (is_small_msg == 0) ? sender_id + 128 : sender_id;\n}\n\nstatic inline uint8_t\n_wings_get_sender_id_n_msg_type(uint8_t skip_or_sender_id,\n                                uint8_t* is_small_msg)\n{\n  if (WINGS_ENABLE_ASSERTIONS) assert(skip_or_sender_id < 258);\n  *is_small_msg = (skip_or_sender_id >= 128) ? 0 : 1;\n  return (skip_or_sender_id >= 128) ? 
skip_or_sender_id - 128\n                                    : skip_or_sender_id;\n}\n\nstatic inline uint8_t\nwings_issue_pkts(ud_channel_t* ud_c, bit_vector_t* membership,\n                 uint8_t* input_array_of_elems, uint16_t input_array_len,\n                 uint16_t size_of_input_elems,\n                 uint16_t* input_array_rolling_idx,\n                 skip_input_elem_or_get_dst_id_t skip_or_get_sender_id_func_ptr,\n                 modify_input_elem_after_send_t modify_elem_after_send,\n                 copy_and_modify_input_elem_t copy_and_modify_elem)\n{\n  uint8_t curr_msg_dst;\n  uint8_t is_small_msg = 0;\n  uint8_t last_msg_dst = 255;\n  uint8_t has_outstanding_msgs = 0;\n  uint16_t msgs_in_batch = 0, pkts_in_batch = 0, idx = 0;\n\n  if (WINGS_ENABLE_ASSERTIONS)\n    assert(ud_c->is_header_only ||\n           _wings_curr_send_pkt_ptr(ud_c)->req_num == 0);\n\n  for (int i = 0; i < input_array_len; i++) {\n    idx = (uint16_t)(input_array_rolling_idx == NULL\n                         ? 
i\n                         : (i + *input_array_rolling_idx) % input_array_len);\n\n    // Skip or Respond (copy and send ?)\n    uint8_t* curr_elem = &input_array_of_elems[idx * size_of_input_elems];\n    int skip_or_sender_id = skip_or_get_sender_id_func_ptr(curr_elem);\n    if (skip_or_sender_id < 0) continue;\n\n    if (WINGS_ENABLE_ASSERTIONS) assert(skip_or_sender_id < 258);\n\n    curr_msg_dst =\n        _wings_get_sender_id_n_msg_type(skip_or_sender_id, &is_small_msg);\n    if (ud_c->is_header_only) is_small_msg = 1;\n\n    // Break if we do not have sufficient credits\n    if (!_wings_has_sufficient_crds_membership(ud_c, curr_msg_dst,\n                                               membership)) {\n      has_outstanding_msgs = 1;\n      if (ud_c->enable_stats) ud_c->stats.no_stalls_due_to_credits++;\n\n      if (input_array_rolling_idx != NULL) *input_array_rolling_idx = idx;\n      break;  // we need to break for broadcast (let's assume it is ok to break\n              // for unicasts as well since it may only harm perf)\n    }\n\n    _wings_dec_crds_membership(ud_c, curr_msg_dst, membership);\n\n    if ((!ud_c->is_bcast_channel && !ud_c->is_header_only) ||\n        is_small_msg == 0) {\n      // Move on to a new pkt if we cannot coalesce into the current one\n      // (large msg or different endpoint)\n      if (_wings_curr_send_pkt_ptr(ud_c)->req_num > 0 &&\n          (is_small_msg == 0 || curr_msg_dst != last_msg_dst))\n        _wings_check_if_batch_n_inc_pkt_ptr(ud_c, &pkts_in_batch,\n                                            &msgs_in_batch);\n    }\n\n    last_msg_dst = curr_msg_dst;\n\n    // Create the messages\n    _wings_forge_wr(ud_c, curr_msg_dst, curr_elem, pkts_in_batch,\n                    &msgs_in_batch, copy_and_modify_elem, is_small_msg);\n\n    modify_elem_after_send(curr_elem);  // E.g. 
Change the state of the element\n                                        // which triggered a send\n\n    // Check if we should send a batch since we might have reached the max batch\n    // size\n    if (is_small_msg == 0 || ud_c->is_header_only ||\n        _wings_curr_send_pkt_ptr(ud_c)->req_num == ud_c->max_coalescing) {\n      _wings_check_if_batch_n_inc_pkt_ptr(ud_c, &pkts_in_batch, &msgs_in_batch);\n    }\n  }\n\n  // Even if the last pkt is not full do the appropriate actions and incl to NIC\n  // batch\n  wings_ud_send_pkt_t* curr_pkt_ptr = NULL;\n\n  if (!ud_c->is_header_only && is_small_msg == 1) {\n    curr_pkt_ptr = _wings_curr_send_pkt_ptr(ud_c);\n    if (curr_pkt_ptr->req_num > 0 &&\n        curr_pkt_ptr->req_num < ud_c->max_coalescing)\n      pkts_in_batch++;\n  }\n\n  // Force a batch to send the last set of requests (even < max batch size)\n  if (pkts_in_batch > 0)\n    _wings_batch_pkts_2_NIC(ud_c, pkts_in_batch, msgs_in_batch);\n\n  if (!ud_c->is_header_only && is_small_msg == 1)\n    // Move to next packet and reset data left from previous bcasts/unicasts\n    if (curr_pkt_ptr->req_num > 0 &&\n        curr_pkt_ptr->req_num < ud_c->max_coalescing)\n      _wings_inc_send_push_ptr(ud_c);\n\n  return has_outstanding_msgs;\n}\n\nstatic inline void\nwings_issue_credits(\n    ud_channel_t* ud_c, bit_vector_t* membership, uint8_t* input_array_of_elems,\n    uint16_t input_array_len, uint16_t size_of_input_elems,\n    skip_input_elem_or_get_dst_id_t skip_or_get_sender_id_func_ptr,\n    modify_input_elem_after_send_t modify_elem_after_send)\n{\n  if (WINGS_ENABLE_ASSERTIONS) assert(ud_c->type == CRD);\n\n  for (int i = 0; i < ud_c->num_channels; ++i)\n    ud_c->no_crds_to_send_per_endpoint[i] = 0;\n\n  for (int i = 0; i < input_array_len; ++i) {\n    // Skip or Respond (copy and send ?)\n    uint8_t* curr_elem = &input_array_of_elems[i * size_of_input_elems];\n    int skip_or_sender_id = skip_or_get_sender_id_func_ptr(curr_elem);\n    if 
(WINGS_ENABLE_ASSERTIONS) assert(skip_or_sender_id < 255);\n\n    if (skip_or_sender_id < 0) continue;\n    uint8_t curr_msg_dst = (uint8_t)skip_or_sender_id;\n\n    // Check if we have sufficient credits --> (we should always have enough\n    // credits for CRDs)\n    if (!_wings_has_sufficient_crds_membership(ud_c, curr_msg_dst, membership))\n      assert(0);\n    if (ud_c->no_crds_to_send_per_endpoint[curr_msg_dst] == 0 &&\n        ud_c->credits_per_channels[curr_msg_dst] == 0)\n      assert(0);\n\n    _wings_dec_crds_membership(ud_c, curr_msg_dst, membership);\n\n    ud_c->no_crds_to_send_per_endpoint[curr_msg_dst]++;\n\n    modify_elem_after_send(curr_elem);  // E.g. Change the state of the element\n                                        // which triggered a send\n  }\n\n  uint16_t send_crd_packets = 0, total_credits_to_send = 0;\n  for (uint16_t i = 0; i < ud_c->num_channels; ++i) {\n    if (i == ud_c->channel_id) continue;\n\n    if (ud_c->no_crds_to_send_per_endpoint[i] > 0) {\n      _wings_forge_crd_wr(ud_c, i, send_crd_packets,\n                          ud_c->no_crds_to_send_per_endpoint[i]);\n      send_crd_packets++;\n      total_credits_to_send += ud_c->no_crds_to_send_per_endpoint[i];\n\n      if (send_crd_packets == ud_c->max_send_wrs) {\n        _wings_batch_pkts_2_NIC(ud_c, send_crd_packets, total_credits_to_send);\n        send_crd_packets = 0;\n        total_credits_to_send = 0;\n      }\n    }\n  }\n\n  if (send_crd_packets > 0)\n    _wings_batch_pkts_2_NIC(ud_c, send_crd_packets, total_credits_to_send);\n}\n\n#endif  // WINGS_INTERNAL_INLINES_H\n"
  },
  {
    "path": "include/wings/wings_api.h",
    "content": "//\n// Created by akatsarakis on 06/02/19.\n//\n\n#ifndef WINGS_API_H\n#define WINGS_API_H\n#include \"../utils/bit_vector.h\"\n#include \"hrd.h\"\n\n/// WARNING!!\n/// \tAccessible functions not defined below (in wings_api.h but exist only in\n/// wings.h) and starting with underscore\n///\t\t(i.e. \"_wings_*\") are internal and should not be called directly\n/// by the application\n\n#define WINGS_ENABLE_ASSERTIONS 0\n#define WINGS_MAX_SUPPORTED_INLINING 187\n#define WINGS_ENABLE_BATCH_POST_RECVS_TO_NIC 1\n\n#define WINGS_ENABLE_STAT_COUNTING 1\n\n#define WINGS_MIN_PCIE_BCAST_BATCH 1\n#define WINGS_MIN(x, y) (x < y ? x : y)\n\n#define WINGS_ENABLE_PRINTS 0\n#define WINGS_ENABLE_SS_PRINTS (1 && WINGS_ENABLE_PRINTS)\n#define WINGS_ENABLE_SEND_PRINTS (1 && WINGS_ENABLE_PRINTS)\n#define WINGS_ENABLE_RECV_PRINTS (1 && WINGS_ENABLE_PRINTS)\n#define WINGS_ENABLE_CREDIT_PRINTS (1 && WINGS_ENABLE_PRINTS)\n#define WINGS_ENABLE_POST_RECV_PRINTS (1 && WINGS_ENABLE_PRINTS)\n\n#define WINGS_IS_ROCE 0\n#define MAX_MTU_SIZE 4096\n\n/* Useful when `x = (x + 1) % N` is done in a loop */\n#define WINGS_MOD_ADD(x, N) \\\n  do {                      \\\n    x = x + 1;              \\\n    if (x == N) x = 0;      \\\n  } while (0)\n\n/* ah pointer and qpn are accessed together in the critical path\n   so we are putting them in the same cache line */\ntypedef struct {\n  struct ibv_ah* ah;\n  uint32_t qpn;\n  // no padding needed- false sharing is not an issue, only fragmentation\n} qp_info_t;\n\ntypedef struct {\n  uint8_t only_small_msgs : 1;  // support for up to 256 unique senders per\n                                // instance (e.g. thread)\n  uint8_t sender_id : 7;  // support for up to 128 unique senders per instance\n                          // (e.g. 
thread)\n  uint8_t req_num;        // <= max_coalescing of a channel\n  uint8_t reqs[];         // sizeof(req_num * req_size)\n} wings_pkt_t, wings_ud_send_pkt_t;\n\n// Packets with GRH\ntypedef struct {\n  struct ibv_grh grh;\n  wings_pkt_t pkt;\n} __attribute__((packed))\nwings_ud_recv_pkt_t;  // rcved rdma ud pkts come with a grh padding\n\ntypedef struct {\n  uint8_t sender_id;  // support for up to 256 unique senders per instance (e.g.\n                      // thread)\n  uint16_t crd_num;   // credit num\n} __attribute__((packed)) wings_crd_t;  // always send as inlined_payload\n\ntypedef struct {\n  uint8_t sender_id;  // support for up to 256 unique senders per instance (e.g.\n                      // thread)\n  uint8_t inlined_payload[3];  // available space to be used by the application\n} __attribute__((packed)) wings_hdr_only_t;  // always send as inlined_payload\n\nstatic_assert(sizeof(wings_hdr_only_t) == 4 * sizeof(uint8_t), \"\");\n\ntypedef struct {\n  uint64_t send_total_msgs;\n  uint64_t send_total_pkts;\n  uint64_t send_total_pcie_batches;\n\n  uint64_t ss_completions;\n  uint64_t recv_total_msgs;\n  uint64_t recv_total_pkts;\n\n  uint64_t\n      no_stalls_due_to_credits;  // number of stalls due to not enough credits\n} ud_channel_stats_t;\n\nenum channel_type { REQ, RESP, CRD };\n\ntypedef struct _ud_channel_t {\n  struct ibv_qp* qp;\n\n  enum channel_type type;\n  uint8_t max_coalescing;\n  uint8_t expl_crd_ctrl;\n  uint8_t disable_crd_ctrl;\n  uint8_t is_header_only;\n  uint8_t is_bcast_channel;\n  uint8_t is_inlining_enabled;\n  struct _ud_channel_t* channel_providing_crds;\n\n  char* qp_name;\n  uint16_t qp_id;  // id of qp in cb\n  uint16_t max_msg_size;\n  uint16_t small_msg_size;\n\n  uint8_t channel_id;     // id of the curr channel (e.g. local node id)\n  uint16_t num_channels;  // e.g. 
remote nodes + local node\n  uint16_t num_crds_per_channel;\n  uint16_t* credits_per_channels;  // array size of num_channels denoting\n                                   // available space on remote sides\n  /// Credits refer to msgs irrespective if coalesced or not --> a remote buffer\n  /// must be able to handle max_number_of_msgs * max_coalescing\n\n  volatile uint8_t* recv_pkt_buff;  /// Intermediate buffs where reqs are copied\n                                    /// when pkts are received\n  wings_ud_send_pkt_t* send_pkt_buff;  /// Intermediate buffs where reqs are\n                                       /// copied when pkts are sent\n\n  uint16_t send_pkt_buff_len;\n  uint16_t recv_pkt_buff_len;\n\n  uint16_t max_send_wrs;\n  uint16_t max_recv_wrs;\n\n  uint16_t send_q_depth;\n  uint16_t recv_q_depth;\n\n  uint16_t ss_granularity;  // selective signaling granularity\n  uint16_t max_pcie_bcast_batch;\n\n  uint64_t total_pkts_send;  // used for selective signaling\n\n  int send_push_ptr;\n  int recv_push_ptr;\n  int recv_pull_ptr;\n\n  struct ibv_send_wr* send_wr;\n  struct ibv_recv_wr* recv_wr;  // Used only to batch post recvs to the NIC\n\n  struct ibv_sge* send_sgl;\n  struct ibv_sge* recv_sgl;  // Used only to batch post recvs to the NIC\n\n  struct ibv_cq* send_cq;\n  struct ibv_cq* recv_cq;\n  struct ibv_wc* recv_wc;  // (size of max_recv_wrs) Used on polling recv req cq\n                           // (only for immediates)\n\n  /// Send wcs are omitted since they are only used for selective signaling\n  /// (within send function calls)\n\n  struct ibv_mr* send_mem_region;  // NULL if inlining is enabled\n\n  struct ibv_pd* pd;  // A protection domain for this ud channel\n\n  // Remote QPs\n  qp_info_t* remote_qps;\n\n  // Used only for type == CRD\n  uint16_t* no_crds_to_send_per_endpoint;\n\n  // Stats\n  ud_channel_stats_t stats;\n\n  uint8_t enable_overflow_msgs;\n  uint8_t num_overflow_msgs;   // msgs in overflow_msg_buff always <=\n                
               // max_coalescing - 1\n  uint8_t* overflow_msg_buff;  // used to keep messages in case of polling\n                               // a pkt that doesn't fit in the recv array\n\n  // Toggles\n  uint8_t enable_stats;\n  uint8_t enable_prints;\n} ud_channel_t;\n\n// Define some function pointers used when issuing pkts\ntypedef void (*modify_input_elem_after_send_t)(uint8_t*);\ntypedef int (*skip_input_elem_or_get_dst_id_t)(\n    uint8_t*);  // Should return -1 to skip otherwise returns the sender id\ntypedef void (*copy_and_modify_input_elem_t)(uint8_t* msg_to_send,\n                                             uint8_t* triggering_req);\n\nstatic inline void\nwings_NOP_modify_elem_after_send(uint8_t* req)\n{ /*Do not change anything*/\n}\n\n/// Init and Util functions\nvoid wings_print_ud_c_overview(ud_channel_t* ud_c);\n\nvoid wings_ud_channel_destroy(\n    ud_channel_t* ud_c);  // This must be used to destroy all ud_c (both CRD and\n                          // typical ud_c)\n\n// This is used to init only non-CRD channels (CRDs are initialized internally)\nvoid wings_ud_channel_init(ud_channel_t* ud_c, char* qp_name,\n                           enum channel_type type, uint8_t max_coalescing,\n                           uint16_t max_req_size, uint16_t small_req_size,\n                           uint8_t enable_inlining, uint8_t is_header_only,\n                           uint8_t is_bcast,\n                           // Credits\n                           uint8_t disable_crd_ctrl, uint8_t expl_crd_ctrl,\n                           ud_channel_t* linked_channel,\n                           uint16_t crds_per_channel, uint16_t num_channels,\n                           uint8_t channel_id,\n                           // Toggles\n                           uint8_t stats_on, uint8_t prints_on);\n\nvoid wings_setup_channel_qps_and_recvs(ud_channel_t** ud_c_array,\n                                       uint16_t ud_c_num,\n                                      
 dbit_vector_t* shared_rdy_var,\n                                       uint16_t worker_lid);\n\n/// Main functions\nstatic inline uint16_t wings_poll_buff_and_post_recvs(ud_channel_t* ud_c,\n                                                      uint16_t max_pkts_to_poll,\n                                                      uint8_t* recv_buff_space);\n\nstatic inline uint8_t wings_issue_pkts(\n    ud_channel_t* ud_c, bit_vector_t* membership, uint8_t* input_array_of_elems,\n    uint16_t input_array_len, uint16_t size_of_input_elems,\n    uint16_t* input_array_rolling_idx,\n    skip_input_elem_or_get_dst_id_t skip_or_get_sender_id_func_ptr,\n    modify_input_elem_after_send_t modify_elem_after_send,\n    copy_and_modify_input_elem_t copy_and_modify_elem);\n\nstatic inline void wings_issue_credits(\n    ud_channel_t* ud_c, bit_vector_t* membership, uint8_t* input_array_of_elems,\n    uint16_t input_array_len, uint16_t size_of_input_elems,\n    skip_input_elem_or_get_dst_id_t skip_or_get_sender_id_func_ptr,\n    modify_input_elem_after_send_t modify_elem_after_send);\n\n#endif  // WINGS_API_H\n"
  },
  {
    "path": "src/CR/crKV.c",
    "content": "//\n// Created by akatsarakis on 07/03/19.\n//\n\n#include <spacetime.h>\n#include <util.h>\n\n//////////////////////////////////////////////////\n////////////////////  Chain Replication / CRAQ KVS\n//////////////////////////////////////////////////\n\n//////////////////////////////////////////////////\n//////////// Helper functions ////////////////////\nstatic inline uint8_t\nhead_id()\n{\n  return 0;\n}\n\nstatic inline uint8_t\ntail_id()\n{\n  return machine_num - 1;\n}\n\n//////////// Assertion functions\nstatic inline void\ncr_assertions_inv(spacetime_inv_t* inv_ptr)\n{\n  assert(inv_ptr->op_meta.ts.version % 2 == 0);\n  assert(inv_ptr->op_meta.opcode == ST_OP_INV ||\n         inv_ptr->op_meta.opcode == ST_OP_MEMBERSHIP_CHANGE);\n  assert(inv_ptr->op_meta.val_len == ST_VALUE_SIZE);\n}\n\n//////////// Skip functions\n\nstatic inline uint8_t\ncr_skip_op(spacetime_op_t* op_ptr)\n{\n  return (uint8_t)((op_ptr->op_meta.state == ST_PUT_SUCCESS ||\n                    op_ptr->op_meta.state == ST_IN_PROGRESS_GET ||\n                    op_ptr->op_meta.state == ST_IN_PROGRESS_PUT)\n                       ? 1\n                       : 0);\n}\n\nstatic inline uint8_t\ncr_skip_inv(spacetime_inv_t* inv_ptr)\n{\n  return (uint8_t)(inv_ptr->op_meta.opcode == ST_OP_MEMBERSHIP_CHANGE ? 1 : 0);\n}\n\nstatic inline uint8_t\ncr_skip_ack(spacetime_ack_t* ack_ptr)\n{\n  return (uint8_t)(ack_ptr->opcode == ST_OP_MEMBERSHIP_CHANGE ? 1 : 0);\n}\n\nstatic inline uint8_t\ncr_skip_remote_reads(spacetime_op_t* op_ptr)\n{\n  return (uint8_t)((op_ptr->op_meta.state == ST_EMPTY) ? 1 : 0);\n}\n\nstatic inline uint8_t\ncr_skip_remote_writes(spacetime_op_t* op_ptr)\n{\n  return (uint8_t)((op_ptr->op_meta.state == ST_EMPTY ||\n                    op_ptr->op_meta.state == ST_PUT_SUCCESS ||\n                    op_ptr->op_meta.state == ST_IN_PROGRESS_PUT)\n                       ? 
1\n                       : 0);\n}\n\n//////////// Exec functions\nstatic inline void\ncr_exec_write(spacetime_op_t* op_ptr, struct mica_op* kv_ptr)\n{\n  spacetime_object_meta* curr_meta = (spacetime_object_meta*)kv_ptr->value;\n  uint8_t* kv_value_ptr = (uint8_t*)&curr_meta[1];\n\n  if (ENABLE_ASSERTIONS) {\n    assert(machine_id == head_id());  // Only head must exec writes\n    assert(op_ptr->op_meta.opcode == ST_OP_PUT);\n    assert(op_ptr->op_meta.val_len == ST_VALUE_SIZE);\n  }\n\n  op_ptr->op_meta.state = ST_EMPTY;\n\n  cctrl_lock(&curr_meta->cctrl);\n  switch (curr_meta->state) {\n    case INVALID_STATE:\n      // Do not initiate a new write until you get to valid state\n      if (CR_ENABLE_BLOCKING_INVALID_WRITES_ON_HEAD) {\n        cctrl_unlock_dec_version(&curr_meta->cctrl);\n        op_ptr->op_meta.state = ST_PUT_STALL;\n        break;\n      }\n    case VALID_STATE:\n      curr_meta->state = INVALID_STATE;\n      memcpy(kv_value_ptr, op_ptr->value, ST_VALUE_SIZE);\n      kv_ptr->val_len = op_ptr->op_meta.val_len + sizeof(spacetime_object_meta);\n\n      cctrl_unlock_inc_version(&curr_meta->cctrl, (uint8_t)machine_id,\n                               (uint32_t*)&(op_ptr->op_meta.ts.version));\n\n      op_ptr->op_meta.state = ST_PUT_SUCCESS;\n      op_ptr->op_meta.ts.tie_breaker_id = (uint8_t)machine_id;\n      break;\n    default:\n      assert(0);\n  }\n}\n\nstatic inline void\ncr_exec_remote_reads(spacetime_op_t* op_ptr, struct mica_op* kv_ptr)\n{\n  if (ENABLE_ASSERTIONS) {\n    assert(machine_id == tail_id());\n    assert(op_ptr->op_meta.opcode == ST_OP_GET);\n  }\n\n  // the following variables used to validate atomicity between a lock-free read\n  // of an object\n  spacetime_object_meta prev_meta;\n  spacetime_object_meta* curr_meta = (spacetime_object_meta*)kv_ptr->value;\n  uint8_t* kv_value_ptr = (uint8_t*)&curr_meta[1];\n\n  do {\n    prev_meta = *curr_meta;\n    // switch template with all states\n    switch (curr_meta->state) {\n      case 
VALID_STATE:\n        memcpy(op_ptr->value, kv_value_ptr, ST_VALUE_SIZE);\n        op_ptr->op_meta.state = ST_GET_COMPLETE;\n        op_ptr->op_meta.val_len =\n            kv_ptr->val_len - sizeof(spacetime_object_meta);\n        break;\n      case INVALID_STATE:\n      default:\n        assert(0);\n    }\n  } while (\n      !cctrl_timestamp_is_same_and_valid(&prev_meta.cctrl, &curr_meta->cctrl));\n}\n\nstatic inline void\ncr_exec_op(spacetime_op_t* op_ptr, struct mica_op* kv_ptr, uint8_t idx)\n{\n  if (ENABLE_ASSERTIONS) assert(idx < max_batch_size);\n\n  // the following variables used to validate atomicity between a lock-free read\n  // of an object\n  spacetime_object_meta prev_meta;\n  spacetime_object_meta* curr_meta = (spacetime_object_meta*)kv_ptr->value;\n  uint8_t* kv_value_ptr = (uint8_t*)&curr_meta[1];\n\n  if (op_ptr->op_meta.opcode == ST_OP_GET) {\n    // Lock free reads through versioning (successful when version is even)\n    op_ptr->op_meta.state = ST_EMPTY;\n\n    do {\n      prev_meta = *curr_meta;\n      // switch template with all states\n      switch (curr_meta->state) {\n        case VALID_STATE:\n          memcpy(op_ptr->value, kv_value_ptr, ST_VALUE_SIZE);\n          op_ptr->op_meta.state = ST_GET_COMPLETE;\n          op_ptr->op_meta.val_len =\n              kv_ptr->val_len - sizeof(spacetime_object_meta);\n          break;\n        case INVALID_STATE:\n          if (ENABLE_ASSERTIONS)\n            assert(machine_id != tail_id());  // tail should always be valid\n          op_ptr->op_meta.state = ST_GET_STALL;\n          break;\n        default:\n          assert(0);\n      }\n    } while (!cctrl_timestamp_is_same_and_valid(&prev_meta.cctrl,\n                                                &curr_meta->cctrl));\n\n    if (op_ptr->op_meta.state == ST_GET_STALL) op_ptr->buff_idx = idx;\n\n  }\n\n  else if (op_ptr->op_meta.opcode == ST_OP_PUT) {\n    if (machine_id == head_id())  // if it is head\n      cr_exec_write(op_ptr, kv_ptr);\n    
else\n      op_ptr->op_meta.state = ST_PUT_SUCCESS;\n\n    if (op_ptr->op_meta.state == ST_PUT_SUCCESS)\n      // Set idx that we cannot set while dispatching the req\n      op_ptr->buff_idx = idx;\n  }\n}\n\nstatic inline void\ncr_complete_local_write(spacetime_op_t* read_write_op, uint8_t idx,\n                        const uint64_t* key)\n{\n  /// completed read / write --> remove it from the ops buffer\n  if (ENABLE_ASSERTIONS) {\n    assert(read_write_op[idx].op_meta.state == ST_IN_PROGRESS_PUT);\n    assert(((uint64_t*)&read_write_op[idx].op_meta.key)[0] == key[0]);\n  }\n\n  if (read_write_op[idx].op_meta.opcode == ST_OP_PUT)\n    read_write_op[idx].op_meta.state = ST_PUT_COMPLETE;\n  else\n    assert(0);\n}\n\nstatic inline void\ncr_exec_inv(spacetime_inv_t* inv_ptr, struct mica_op* kv_ptr,\n            spacetime_op_t* read_write_op)\n{\n  // the following variables used to validate atomicity between a lock-free read\n  // of an object\n  spacetime_object_meta lock_free_meta;\n  spacetime_object_meta* curr_meta = (spacetime_object_meta*)kv_ptr->value;\n  uint8_t* kv_value_ptr = (uint8_t*)&curr_meta[1];\n  if (ENABLE_ASSERTIONS) assert(inv_ptr->op_meta.opcode == ST_OP_INV);\n\n  uint32_t debug_cntr = 0;\n  do {  // Lock free read of keys meta\n    if (ENABLE_ASSERTIONS) {\n      debug_cntr++;\n      if (debug_cntr == M_4) {\n        printf(\"Worker stuck on a lock-free read (for INV)\\n\");\n        debug_cntr = 0;\n      }\n    }\n    lock_free_meta = *curr_meta;\n  } while (!cctrl_timestamp_is_same_and_valid(&lock_free_meta.cctrl,\n                                              &curr_meta->cctrl));\n\n  // lock and proceed iff remote.TS >= local.TS\n  // inv TS >= local timestamp\n  if (!timestamp_is_smaller(inv_ptr->op_meta.ts.version,\n                            inv_ptr->op_meta.ts.tie_breaker_id,\n                            lock_free_meta.cctrl.ts.version,\n                            lock_free_meta.cctrl.ts.tie_breaker_id)) {\n    // Lock and check 
again if inv TS > local timestamp\n    cctrl_lock(&curr_meta->cctrl);\n    /// Warning: use op.version + 1 below since optik_lock() increases\n    /// curr_meta->version by 1\n    if (timestamp_is_smaller(\n            curr_meta->cctrl.ts.version - 1, curr_meta->cctrl.ts.tie_breaker_id,\n            inv_ptr->op_meta.ts.version, inv_ptr->op_meta.ts.tie_breaker_id)) {\n      //\t\t\t\t\t\t\tprintf(\"Received\n      // an invalidation with >= timestamp\\n\");\n      /// Update Value, TS and last_writer_id\n      //\t\t\t\tcurr_meta->last_writer_id =\n      // inv_ptr->op_meta.sender;\n      kv_ptr->val_len =\n          inv_ptr->op_meta.val_len + sizeof(spacetime_object_meta);\n      if (ENABLE_ASSERTIONS) {\n        //\t\t\t\t\tassert(kv_ptr->val_len ==\n        // KVS_VALUE_SIZE\n        //>> SHIFT_BITS);\n        assert(inv_ptr->op_meta.val_len == ST_VALUE_SIZE >> SHIFT_BITS);\n      }\n      memcpy(kv_value_ptr, inv_ptr->value, ST_VALUE_SIZE);\n      /// Update state\n\n      switch (curr_meta->state) {\n        case VALID_STATE:\n          if (machine_id != tail_id())  // Tail never gets invalid\n            curr_meta->state = INVALID_STATE;\n          break;\n        case INVALID_STATE:\n          break;\n        default:\n          assert(0);\n      }\n      cctrl_unlock_custom_version(&curr_meta->cctrl,\n                                  inv_ptr->op_meta.ts.tie_breaker_id,\n                                  inv_ptr->op_meta.ts.version);\n    } else if (timestamp_is_equal(curr_meta->cctrl.ts.version - 1,\n                                  curr_meta->cctrl.ts.tie_breaker_id,\n                                  inv_ptr->op_meta.ts.version,\n                                  inv_ptr->op_meta.ts.tie_breaker_id))\n      assert(0);\n    else\n      cctrl_unlock_dec_version(&curr_meta->cctrl);\n  }\n  inv_ptr->op_meta.opcode = ST_INV_SUCCESS;\n\n  if (inv_ptr->op_meta.initiator == machine_id && machine_id == tail_id())\n    cr_complete_local_write(read_write_op, 
inv_ptr->buff_idx,\n                            (uint64_t*)&inv_ptr->op_meta.key);\n\n  if (ENABLE_ASSERTIONS) assert(inv_ptr->op_meta.opcode == ST_INV_SUCCESS);\n}\n\nstatic inline void\ncr_exec_ack(spacetime_ack_t* ack_ptr, struct mica_op* kv_ptr,\n            spacetime_op_t* read_write_op)\n{\n  if (ENABLE_ASSERTIONS) assert(machine_id != tail_id());\n\n  // the following variables used to validate atomicity between a lock-free read\n  // of an object\n  spacetime_object_meta lock_free_read_meta;\n  spacetime_object_meta* curr_meta = (spacetime_object_meta*)kv_ptr->value;\n  if (ack_ptr->opcode != ST_OP_ACK) assert(0);\n\n  uint32_t debug_cntr = 0;\n  do {  // Lock free read of keys meta\n    if (ENABLE_ASSERTIONS) {\n      debug_cntr++;\n      if (debug_cntr == M_4) {\n        printf(\"Worker stuck on a lock-free read (for ACK)\\n\");\n        debug_cntr = 0;\n      }\n    }\n    lock_free_read_meta = *curr_meta;\n  } while (!cctrl_timestamp_is_same_and_valid(&lock_free_read_meta.cctrl,\n                                              &curr_meta->cctrl));\n\n  if (ENABLE_ASSERTIONS)\n    assert(!timestamp_is_smaller(lock_free_read_meta.cctrl.ts.version,\n                                 lock_free_read_meta.cctrl.ts.tie_breaker_id,\n                                 ack_ptr->ts.version,\n                                 ack_ptr->ts.tie_breaker_id));\n\n  if (timestamp_is_equal(ack_ptr->ts.version, ack_ptr->ts.tie_breaker_id,\n                         lock_free_read_meta.cctrl.ts.version,\n                         lock_free_read_meta.cctrl.ts.tie_breaker_id)) {\n    /// Lock and check again if ack TS == last local write\n    cctrl_lock(&curr_meta->cctrl);\n    if (timestamp_is_equal(ack_ptr->ts.version, ack_ptr->ts.tie_breaker_id,\n                           curr_meta->cctrl.ts.version - 1,\n                           curr_meta->cctrl.ts.tie_breaker_id)) {\n      switch (curr_meta->state) {\n        case INVALID_STATE:\n          curr_meta->state = VALID_STATE;\n    
      ack_ptr->opcode = ST_LAST_ACK_SUCCESS;\n          break;\n        case VALID_STATE:\n        default:\n          assert(0);\n      }\n    }\n    cctrl_unlock_dec_version(&curr_meta->cctrl);\n  }\n\n  if (machine_id == ack_ptr->initiator)\n    cr_complete_local_write(read_write_op, ack_ptr->buff_idx,\n                            (uint64_t*)&ack_ptr->key);\n\n  ack_ptr->opcode = ST_LAST_ACK_SUCCESS;\n}\n\n//////////// Dispatcher functions\n\nstatic inline uint8_t\ncr_skip_dispatcher(enum cr_type_t cr_type, void* ptr)\n{\n  switch (cr_type) {\n    case Local_ops:\n      return cr_skip_op(ptr);\n    case Invs:\n      return cr_skip_inv(ptr);\n    case Acks:\n      return cr_skip_ack(ptr);\n    case Remote_reads:\n      return cr_skip_remote_reads(ptr);\n    case Remote_writes:\n      return cr_skip_remote_writes(ptr);\n    default:\n      assert(0);\n  }\n}\n\nstatic inline void\ncr_assertions_dispatcher(enum cr_type_t cr_type, void* ptr)\n{\n  if (ENABLE_ASSERTIONS) switch (cr_type) {\n      case Invs:\n        cr_assertions_inv(ptr);\n      case Acks:\n      case Remote_writes:\n      case Local_ops:\n      case Remote_reads:\n        break;\n      default:\n        assert(0);\n    }\n}\n\nstatic inline void\ncr_exec_dispatcher(enum cr_type_t cr_type, void* op_ptr, struct mica_op* kv_ptr,\n                   uint8_t idx, spacetime_op_t* read_write_op)\n{\n  switch (cr_type) {\n    case Invs:\n      cr_exec_inv(op_ptr, kv_ptr, read_write_op);\n      break;\n    case Acks:\n      cr_exec_ack(op_ptr, kv_ptr, read_write_op);\n      break;\n    case Remote_writes:\n      cr_exec_write(op_ptr, kv_ptr);\n      break;\n    case Local_ops:\n      cr_exec_op(op_ptr, kv_ptr, idx);\n      break;\n    case Remote_reads:\n      cr_exec_remote_reads(op_ptr, kv_ptr);\n      break;\n    default:\n      assert(0);\n  }\n}\n\n//////////////////////////////////////////////////\n//////////// Batch function //////////////////////\nvoid\ncr_batch_ops_to_KVS(enum cr_type_t cr_type, 
uint8_t* op_array, int op_num,\n                    uint16_t sizeof_op_elem, spacetime_op_t* read_write_op)\n{\n#if SPACETIME_DEBUG == 1\n  // assert(kv.hash_table != NULL);\n  assert(op_array != NULL);\n  assert(op_num > 0 && op_num <= CACHE_BATCH_SIZE);\n  assert(resp != NULL);\n#endif\n\n#if SPACETIME_DEBUG == 2\n  for (I = 0; I < op_num; I++)\n    mica_print_op(&(*op_array)[I]);\n#endif\n  int key_in_store[CR_MAX_BATCH_SIZE];  // Is this key in the datastore?\n  unsigned int tag[CR_MAX_BATCH_SIZE];\n  //\tunsigned int bkt[CR_MAX_BATCH_SIZE];\n  uint64_t bkt[CR_MAX_BATCH_SIZE];\n  struct mica_bkt* bkt_ptr[CR_MAX_BATCH_SIZE];\n  struct mica_op* kv_ptr[CR_MAX_BATCH_SIZE];  // Ptr to KV item in log\n\n  if (ENABLE_ASSERTIONS) assert(read_write_op != NULL || cr_type != Acks);\n\n  // We first lookup the key in the datastore.\n  // The first two @I loops work for both GETs and PUTs.\n  for (int I = 0; I < op_num; I++) {\n    spacetime_op_meta_t* op_ptr =\n        (spacetime_op_meta_t*)&op_array[sizeof_op_elem * I];\n    cr_assertions_dispatcher(cr_type, op_ptr);\n    if (cr_skip_dispatcher(cr_type, op_ptr)) continue;\n\n    bkt[I] = op_ptr->key.bkt & kv.hash_table.bkt_mask;\n    bkt_ptr[I] = &kv.hash_table.ht_index[bkt[I]];\n    __builtin_prefetch(bkt_ptr[I], 0, 0);\n    tag[I] = op_ptr->key.tag;\n\n    key_in_store[I] = 0;\n    kv_ptr[I] = NULL;\n  }\n\n  for (int I = 0; I < op_num; I++) {\n    spacetime_op_meta_t* op_ptr =\n        (spacetime_op_meta_t*)&op_array[sizeof_op_elem * I];\n    if (cr_skip_dispatcher(cr_type, op_ptr)) continue;\n    for (int j = 0; j < 8; j++) {\n      if (bkt_ptr[I]->slots[j].in_use == 1 &&\n          bkt_ptr[I]->slots[j].tag == tag[I]) {\n        uint64_t log_offset =\n            bkt_ptr[I]->slots[j].offset & kv.hash_table.log_mask;\n        // We can interpret the log entry as mica_op, even though it\n        // may not contain the full MICA_MAX_VALUE value.\n        kv_ptr[I] = (struct mica_op*)&kv.hash_table.ht_log[log_offset];\n\n 
       // Small values (1--64 bytes) can span 2 cache lines\n        __builtin_prefetch(kv_ptr[I], 0, 0);\n        __builtin_prefetch((uint8_t*)kv_ptr[I] + 64, 0, 0);\n\n        // Detect if the head has wrapped around for this index entry\n        if (kv.hash_table.log_head - bkt_ptr[I]->slots[j].offset >=\n            kv.hash_table.log_cap) {\n          kv_ptr[I] = NULL;  // If so, we mark it \"not found\"\n        }\n\n        break;\n      }\n    }\n  }\n\n  for (int I = 0; I < op_num; I++) {\n    spacetime_op_meta_t* op_ptr =\n        (spacetime_op_meta_t*)&op_array[sizeof_op_elem * I];\n    if (cr_skip_dispatcher(cr_type, op_ptr)) continue;\n    if (kv_ptr[I] != NULL) {\n      // We had a tag match earlier. Now compare log entry.\n      long long* key_ptr_log = (long long*)kv_ptr[I];\n      long long* key_ptr_req = (long long*)&op_ptr->key;\n\n      if (key_ptr_log[1] == key_ptr_req[0]) {  // Key Found 8 Byte keys\n        key_in_store[I] = 1;\n        cr_exec_dispatcher(cr_type, op_ptr, kv_ptr[I], (uint8_t)I,\n                           read_write_op);\n      }\n    }\n\n    if (key_in_store[I] ==\n        0)  // KVS miss --> We get here if either tag or log key match failed\n      op_ptr->state = ST_MISS;\n  }\n\n  if (ENABLE_ASSERTIONS)\n    if (cr_type == Acks)\n      for (int I = 0; I < max_batch_size; I++)\n        assert(read_write_op[I].op_meta.opcode == ST_OP_GET ||\n               read_write_op[I].op_meta.state == ST_MISS ||\n               read_write_op[I].op_meta.state == ST_EMPTY ||\n               read_write_op[I].op_meta.state == ST_PUT_STALL ||\n               read_write_op[I].op_meta.state == ST_PUT_SUCCESS ||\n               read_write_op[I].op_meta.state == ST_PUT_COMPLETE ||\n               read_write_op[I].op_meta.state == ST_IN_PROGRESS_PUT ||\n               read_write_op[I].op_meta.state ==\n                   ST_OP_MEMBERSHIP_CHANGE ||  /// TODO check this\n               read_write_op[I].op_meta.state == ST_IN_PROGRESS_REPLAY);\n}\n"
  },
  {
    "path": "src/CR/cr_worker.c",
    "content": "#include <spacetime.h>\n#include <time.h>\n#include \"../../include/utils/concur_ctrl.h\"\n#include \"inline-util.h\"\n#include \"util.h\"\n\n///\n#include \"../../include/utils/time_rdtsc.h\"\n#include \"../../include/wings/wings.h\"\n///\n\nstatic inline uint8_t\nhead_id(void)\n{\n  return (uint8_t)0;\n}\n\nstatic inline uint8_t\ntail_id(void)\n{\n  return machine_num - 1;\n}\n\nstatic inline uint8_t\nnext_node_in_chain(void)\n{\n  return (uint8_t)((machine_id + 1) % machine_num);\n}\n\nstatic inline uint8_t\nprev_node_in_chain(void)\n{\n  return (uint8_t)(machine_id == 0 ? tail_id() : machine_id - 1);\n}\n\nint\ninv_skip_or_fwd_to_next_node(uint8_t* req)\n{\n  spacetime_inv_t* inv_req = (spacetime_inv_t*)req;\n  return inv_req->op_meta.opcode == ST_INV_SUCCESS\n             ? next_node_in_chain()\n             : -1;  // invs should only be fwded to next node\n}\n\nvoid\ninv_fwd_modify_elem_after_send(uint8_t* req)\n{\n  spacetime_inv_t* inv_req = (spacetime_inv_t*)req;\n\n  // empty inv buffer\n  if (inv_req->op_meta.opcode == ST_INV_SUCCESS ||\n      inv_req->op_meta.opcode == ST_OP_MEMBERSHIP_CHANGE)\n    inv_req->op_meta.opcode = ST_EMPTY;\n\n  else\n    assert(0);\n}\n\nvoid\ninv_fwd_copy_and_modify_elem(uint8_t* msg_to_send, uint8_t* triggering_req)\n{\n  spacetime_inv_t* inv_recv = (spacetime_inv_t*)triggering_req;\n  spacetime_inv_t* inv_to_send = (spacetime_inv_t*)msg_to_send;\n\n  // Copy op to inv and set opcode\n  memcpy(inv_to_send, inv_recv, sizeof(spacetime_inv_t));\n  inv_to_send->op_meta.opcode = ST_OP_INV;\n}\n\nint\ninv_skip_or_get_sender_id(uint8_t* req)\n{\n  spacetime_op_t* op_req = (spacetime_op_t*)req;\n\n  if (ENABLE_ASSERTIONS) {\n    assert(is_input_code(op_req->op_meta.opcode));\n    assert(is_response_code(op_req->op_meta.state) ||\n           is_bucket_state_code(op_req->op_meta.state));\n  }\n\n  return op_req->op_meta.state == ST_PUT_SUCCESS\n             ? 
next_node_in_chain()\n             : -1;  // since invs should only be fwded to next node\n}\n\nvoid\ninv_modify_elem_after_send(uint8_t* req)\n{\n  spacetime_op_t* op_req = (spacetime_op_t*)req;\n\n  if (op_req->op_meta.state == ST_PUT_SUCCESS)\n    op_req->op_meta.state = ST_IN_PROGRESS_PUT;\n  else\n    assert(0);\n}\n\nvoid\ninv_copy_and_modify_elem(uint8_t* msg_to_send, uint8_t* triggering_req)\n{\n  if (ENABLE_ASSERTIONS) assert(machine_id == head_id());\n\n  spacetime_op_t* op = (spacetime_op_t*)triggering_req;\n  spacetime_inv_t* inv_to_send = (spacetime_inv_t*)msg_to_send;\n\n  // Copy op to inv, set sender and opcode\n  memcpy(inv_to_send, op, sizeof(spacetime_inv_t));\n\n  inv_to_send->op_meta.opcode = ST_OP_INV;\n  inv_to_send->op_meta.initiator = (uint8_t)machine_id;\n}\n\nint\nremote_write_skip_or_get_sender_id(uint8_t* req)\n{\n  spacetime_op_t* op_req = (spacetime_op_t*)req;\n\n  if (ENABLE_ASSERTIONS) {\n    assert(is_input_code(op_req->op_meta.opcode));\n    assert(is_response_code(op_req->op_meta.state) ||\n           is_bucket_state_code(op_req->op_meta.state));\n  }\n\n  return op_req->op_meta.state == ST_PUT_SUCCESS\n             ? 
head_id()\n             : -1;  // send remote writes to head\n}\n\nvoid\nremote_write_copy_and_modify_elem(uint8_t* msg_to_send, uint8_t* triggering_req)\n{\n  if (ENABLE_ASSERTIONS) assert(machine_id != head_id());\n\n  spacetime_op_t* op = (spacetime_op_t*)triggering_req;\n  spacetime_inv_t* inv_to_send = (spacetime_inv_t*)msg_to_send;\n\n  // Copy op to inv, set sender and opcode\n  memcpy(inv_to_send, op, sizeof(spacetime_inv_t));\n\n  inv_to_send->op_meta.state = ST_NEW;\n  inv_to_send->op_meta.opcode = ST_OP_PUT;\n  inv_to_send->initiator = (uint8_t)machine_id;\n  inv_to_send->op_meta.initiator = (uint8_t)machine_id;\n}\n\nint\nremote_write_head_skip_or_get_sender_id(uint8_t* req)\n{\n  spacetime_op_t* op_req = (spacetime_op_t*)req;\n\n  if (ENABLE_ASSERTIONS) {\n    assert(machine_id == head_id());\n    assert(is_input_code(op_req->op_meta.opcode) ||\n           op_req->op_meta.opcode == ST_EMPTY);\n    assert(is_response_code(op_req->op_meta.state) ||\n           is_bucket_state_code(op_req->op_meta.state));\n  }\n\n  return op_req->op_meta.state == ST_PUT_SUCCESS\n             ? 
next_node_in_chain()\n             : -1;  // remote writes must always be fwded to head\n}\n\nvoid\nremote_write_head_copy_and_modify_elem(uint8_t* msg_to_send,\n                                       uint8_t* triggering_req)\n{\n  spacetime_op_t* op = (spacetime_op_t*)triggering_req;\n  spacetime_inv_t* inv_to_send = (spacetime_inv_t*)msg_to_send;\n\n  // Copy op to inv, set sender and opcode\n  memcpy(inv_to_send, op, sizeof(spacetime_inv_t));\n\n  inv_to_send->op_meta.opcode = ST_OP_INV;\n  inv_to_send->op_meta.initiator = op->initiator;\n}\n\nvoid\nremote_write_head_modify_elem_after_send(uint8_t* req)\n{\n  spacetime_op_t* op_req = (spacetime_op_t*)req;\n\n  if (op_req->op_meta.state == ST_PUT_SUCCESS)\n    op_req->op_meta.state = ST_SEND_CRD;\n  else\n    assert(0);\n}\n\nvoid\nack_fwd_modify_elem_after_send(uint8_t* req)\n{\n  spacetime_ack_t* ack_req = (spacetime_ack_t*)req;\n\n  if (ENABLE_ASSERTIONS) assert(ack_req->opcode == ST_LAST_ACK_SUCCESS);\n\n  ack_req->opcode = ST_EMPTY;\n}\n\nint\nack_fwd_skip_or_get_sender_id(uint8_t* req)\n{\n  spacetime_ack_t* ack_req = (spacetime_ack_t*)req;\n  if (ack_req->opcode == ST_ACK_SUCCESS) {\n    ack_req->opcode = ST_EMPTY;\n    return -1;\n  } else if (ack_req->opcode == ST_EMPTY)\n    return -1;\n\n  if (ENABLE_ASSERTIONS) assert(ack_req->opcode == ST_LAST_ACK_SUCCESS);\n\n  return prev_node_in_chain();\n}\n\nvoid\nack_fwd_copy_and_modify_elem(uint8_t* msg_to_send, uint8_t* triggering_req)\n{\n  spacetime_ack_t* ack_to_send = (spacetime_ack_t*)msg_to_send;\n  memcpy(ack_to_send, triggering_req,\n         sizeof(spacetime_ack_t));  // copy req to next_req_ptr\n\n  ack_to_send->opcode = ST_OP_ACK;\n}\n\nint\nack_skip_or_get_sender_id(uint8_t* req)\n{\n  spacetime_inv_t* inv_req = (spacetime_inv_t*)req;\n\n  if (ENABLE_ASSERTIONS)\n    assert(inv_req->op_meta.opcode == ST_INV_SUCCESS ||\n           inv_req->op_meta.opcode == ST_EMPTY);\n\n  return prev_node_in_chain();\n}\n\nvoid\nack_modify_elem_after_send(uint8_t* 
req)\n{\n  spacetime_inv_t* inv_req = (spacetime_inv_t*)req;\n\n  // empty inv buffer\n  if (inv_req->op_meta.opcode == ST_INV_SUCCESS ||\n      inv_req->op_meta.opcode == ST_OP_MEMBERSHIP_CHANGE)\n    inv_req->op_meta.opcode = ST_EMPTY;\n  else\n    assert(0);\n}\n\nvoid\nack_copy_and_modify_elem(uint8_t* msg_to_send, uint8_t* triggering_req)\n{\n  spacetime_ack_t* ack_to_send = (spacetime_ack_t*)msg_to_send;\n  spacetime_inv_t* inv_ptr = (spacetime_inv_t*)triggering_req;\n\n  memcpy(ack_to_send, inv_ptr,\n         sizeof(spacetime_ack_t));  // copy req to next_req_ptr\n\n  ack_to_send->opcode = ST_OP_ACK;\n  ack_to_send->buff_idx = inv_ptr->buff_idx;\n}\n\nint\nrem_write_crd_skip_or_get_sender_id(uint8_t* req)\n{\n  spacetime_op_t* op_ptr = (spacetime_op_t*)req;\n\n  if (ENABLE_ASSERTIONS)\n    assert(op_ptr->op_meta.state == ST_EMPTY ||\n           op_ptr->op_meta.state == ST_SEND_CRD ||\n           op_ptr->op_meta.state == ST_PUT_STALL ||\n           op_ptr->op_meta.state == ST_PUT_SUCCESS);\n\n  return op_ptr->op_meta.state == ST_SEND_CRD ? op_ptr->initiator : -1;\n}\n\nvoid\nrem_write_crd_modify_elem_after_send(uint8_t* req)\n{\n  spacetime_op_t* op = (spacetime_op_t*)req;\n\n  // empty inv buffer\n  if (op->op_meta.state == ST_SEND_CRD)\n    op->op_meta.state = ST_EMPTY;\n  else\n    assert(0);\n}\n\nint\ninv_crd_skip_or_get_sender_id(uint8_t* req)\n{\n  spacetime_inv_t* op_ptr = (spacetime_inv_t*)req;\n\n  if (ENABLE_ASSERTIONS)\n    assert(op_ptr->op_meta.opcode == ST_EMPTY ||\n           op_ptr->op_meta.opcode == ST_INV_SUCCESS);\n\n  return op_ptr->op_meta.opcode == ST_INV_SUCCESS ? 
prev_node_in_chain() : -1;\n}\n\nvoid\ninv_crd_modify_elem_after_send(uint8_t* req)\n{\n  if (ENABLE_ASSERTIONS) {\n    spacetime_inv_t* op = (spacetime_inv_t*)req;\n    assert(op->op_meta.opcode == ST_INV_SUCCESS);\n  }\n}\n\nint\nremote_read_skip_or_get_sender_id(uint8_t* req)\n{\n  spacetime_op_t* op_req = (spacetime_op_t*)req;\n\n  if (ENABLE_ASSERTIONS) {\n    assert(is_input_code(op_req->op_meta.opcode));\n    assert(is_response_code(op_req->op_meta.state) ||\n           is_bucket_state_code(op_req->op_meta.state));\n  }\n\n  return op_req->op_meta.state == ST_GET_STALL\n             ? tail_id()\n             : -1;  // send remote writes to head\n}\n\nvoid\nremote_read_modify_elem_after_send(uint8_t* req)\n{\n  spacetime_op_t* op_req = (spacetime_op_t*)req;\n\n  if (op_req->op_meta.state == ST_GET_STALL)\n    op_req->op_meta.state = ST_IN_PROGRESS_GET;\n  else\n    assert(0);\n}\n\nvoid\nremote_read_copy_and_modify_elem(uint8_t* msg_to_send, uint8_t* triggering_req)\n{\n  if (ENABLE_ASSERTIONS) assert(machine_id != tail_id());\n\n  spacetime_op_t* op = (spacetime_op_t*)triggering_req;\n  spacetime_op_t* op_to_send = (spacetime_op_t*)msg_to_send;\n\n  // Copy op to inv, set sender and opcode\n  memcpy(op_to_send, op, sizeof(spacetime_op_t));\n\n  op_to_send->op_meta.state = ST_NEW;\n  op_to_send->op_meta.opcode = ST_OP_GET;\n  op_to_send->initiator = (uint8_t)machine_id;\n  op_to_send->op_meta.initiator = (uint8_t)machine_id;\n}\n\nint\nremote_read_resp_skip_or_get_sender_id(uint8_t* req)\n{\n  spacetime_op_t* op_req = (spacetime_op_t*)req;\n\n  if (ENABLE_ASSERTIONS) {\n    if (op_req->op_meta.opcode != ST_OP_GET) {\n      printf(\"Opcode: %d, state: %d\\n\", op_req->op_meta.opcode,\n             op_req->op_meta.state);\n      printf(\"Opcode: %s, state: %s\\n\", code_to_str(op_req->op_meta.opcode),\n             code_to_str(op_req->op_meta.state));\n    }\n    assert(op_req->op_meta.opcode == ST_OP_GET);\n    assert(op_req->op_meta.state == 
ST_GET_COMPLETE);\n  }\n\n  return op_req->initiator;  // send remote writes to head\n}\n\nvoid\nremote_read_resp_modify_elem_after_send(uint8_t* req)\n{\n  spacetime_op_t* op_req = (spacetime_op_t*)req;\n\n  if (op_req->op_meta.state == ST_GET_COMPLETE)\n    op_req->op_meta.state = ST_EMPTY;\n  else {\n    printf(\"St_opcode: %s\\n\", code_to_str(op_req->op_meta.state));\n    assert(0);\n  }\n}\n\nvoid\nremote_read_resp_copy_and_modify_elem(uint8_t* msg_to_send,\n                                      uint8_t* triggering_req)\n{\n  if (ENABLE_ASSERTIONS) assert(machine_id == tail_id());\n\n  spacetime_op_t* op = (spacetime_op_t*)triggering_req;\n  spacetime_op_t* op_to_send = (spacetime_op_t*)msg_to_send;\n\n  // Copy op to inv, set sender and opcode\n  memcpy(op_to_send, op, sizeof(spacetime_op_t));\n}\n\nvoid\nprint_ops_and_remote_write_ops(spacetime_op_t* ops,\n                               spacetime_op_t* remote_writes)\n{\n  //\tfor(int i = 0; i < MAX_BATCH_KVS_OPS_SIZE; ++i)\n  for (int i = 0; i < max_batch_size; ++i)\n    printf(\"ops[%d]: state-> %s, key-> %lu \\n\", i,\n           code_to_str(ops[i].op_meta.state),\n           *((uint64_t*)&ops[i].op_meta.key));\n\n  if (machine_id == head_id())\n    //\t\tfor(int i = 0; i < MAX_BATCH_KVS_OPS_SIZE; ++i)\n    for (int i = 0; i < max_batch_size; ++i)\n      printf(\"remote_writes[%d]: state-> %s, key-> %lu \\n\", i,\n             code_to_str(remote_writes[i].op_meta.state),\n             *((uint64_t*)&remote_writes[i].op_meta.key));\n}\n\nvoid\nprint_total_stalls_due_to_credits(ud_channel_t* inv_ud_c,\n                                  ud_channel_t* ack_ud_c,\n                                  ud_channel_t* rem_writes_ud_c,\n                                  ud_channel_t* rem_reads_ud_c)\n{\n  // Stalls\n  colored_printf(GREEN, \"$$$ CRD STALLs : %s %d, %s %d, %s %d,\",\n                 inv_ud_c->qp_name, inv_ud_c->stats.send_total_msgs,\n                 ack_ud_c->qp_name, 
ack_ud_c->stats.send_total_msgs,\n                 rem_writes_ud_c->qp_name,\n                 rem_writes_ud_c->stats.send_total_msgs);\n  if (CR_ENABLE_REMOTE_READS)\n    colored_printf(GREEN, \", %s %d\\n\", rem_reads_ud_c->qp_name,\n                   rem_reads_ud_c->stats.send_total_msgs);\n  else\n    printf(\"\\n\");\n}\n\nvoid\nprint_total_send_recv_msgs_n_credits(\n    ud_channel_t* inv_ud_c, ud_channel_t* inv_crd_ud_c, ud_channel_t* ack_ud_c,\n    ud_channel_t* rem_writes_ud_c, ud_channel_t* crd_ud_c,\n    ud_channel_t* rem_reads_ud_c, ud_channel_t* rem_read_resp_ud_c)\n{\n  // Sends\n  colored_printf(GREEN, \"--> Total Send: %s %d\", inv_ud_c->qp_name,\n                 inv_ud_c->stats.send_total_msgs);\n  if (CR_ENABLE_EARLY_INV_CRDS)\n    colored_printf(GREEN, \", %s %d\", inv_crd_ud_c->qp_name,\n                   inv_crd_ud_c->stats.send_total_msgs);\n  colored_printf(GREEN, \", %s %d, %s %d, %s %d\", ack_ud_c->qp_name,\n                 ack_ud_c->stats.send_total_msgs, rem_writes_ud_c->qp_name,\n                 rem_writes_ud_c->stats.send_total_msgs, crd_ud_c->qp_name,\n                 crd_ud_c->stats.send_total_msgs);\n  if (CR_ENABLE_REMOTE_READS)\n    colored_printf(GREEN, \", %s %d, %s %d\\n\", rem_reads_ud_c->qp_name,\n                   rem_reads_ud_c->stats.send_total_msgs,\n                   rem_read_resp_ud_c->qp_name,\n                   rem_read_resp_ud_c->stats.send_total_msgs);\n  else\n    printf(\"\\n\");\n\n  // Receives\n  colored_printf(GREEN, \"vvv Total Recv: %s %d\", inv_ud_c->qp_name,\n                 inv_ud_c->stats.recv_total_msgs);\n  if (CR_ENABLE_EARLY_INV_CRDS)\n    colored_printf(GREEN, \", %s %d\", inv_crd_ud_c->qp_name,\n                   inv_crd_ud_c->stats.recv_total_msgs);\n  colored_printf(GREEN, \", %s %d, %s %d, %s %d\", ack_ud_c->qp_name,\n                 ack_ud_c->stats.recv_total_msgs, rem_writes_ud_c->qp_name,\n                 rem_writes_ud_c->stats.recv_total_msgs, crd_ud_c->qp_name,\n                 
crd_ud_c->stats.recv_total_msgs);\n  if (CR_ENABLE_REMOTE_READS)\n    colored_printf(GREEN, \", %s %d, %s %d\\n\", rem_reads_ud_c->qp_name,\n                   rem_reads_ud_c->stats.recv_total_msgs,\n                   rem_read_resp_ud_c->qp_name,\n                   rem_read_resp_ud_c->stats.recv_total_msgs);\n  else\n    printf(\"\\n\");\n\n  // Credits\n  uint8_t remote_node =\n      (uint8_t)(machine_id == head_id() ? next_node_in_chain() : head_id());\n  printf(\"Inv credits: %d, ack credits: %d, remote_write_crds: %d\\n\",\n         inv_ud_c->credits_per_channels[remote_node],\n         ack_ud_c->credits_per_channels[remote_node],\n         rem_writes_ud_c->credits_per_channels[head_id()]);\n}\n\nstatic inline void\ncr_complete_local_reads(spacetime_op_t* remote_reads_resps,\n                        uint16_t remote_read_resps_polled, spacetime_op_t* ops)\n{\n  for (int i = 0; i < remote_read_resps_polled; ++i) {\n    uint16_t idx = remote_reads_resps[i].buff_idx;\n    /// completed read / write --> remove it from the ops buffer\n    if (ENABLE_ASSERTIONS) {\n      assert(ops[idx].op_meta.state == ST_IN_PROGRESS_GET);\n      assert(((uint64_t*)&ops[idx].op_meta.key)[0] ==\n             ((uint64_t*)&remote_reads_resps[i].op_meta.key)[0]);\n    }\n\n    if (ops[idx].op_meta.opcode == ST_OP_GET)\n      ops[idx].op_meta.state = ST_GET_COMPLETE;\n    else\n      assert(0);\n  }\n}\n\n// returns first free slot within a range [start_pos, end_pos) or -1 if all are\n// occupied\nstatic inline int\nget_first_free_slot(const uint8_t* free_slot_array, uint16_t start_pos,\n                    uint16_t end_pos)\n{\n  if (ENABLE_ASSERTIONS) assert(end_pos > start_pos);\n\n  for (int i = start_pos; i < end_pos; ++i)\n    if (free_slot_array[i] == 1) return i;\n  return -1;\n}\n\nstatic inline uint16_t\ncr_move_stalled_writes_to_top_n_return_free_space(spacetime_op_t* remote_writes)\n{\n  uint8_t free_slot_array[MAX_BATCH_KVS_OPS_SIZE] = {0};\n  uint16_t free_slots = 0;\n  
uint16_t last_free_slot =\n      0;  // used to avoid re-iterating already non-empty slots\n  for (int i = 0; i < max_batch_size; ++i) {\n    if (ENABLE_ASSERTIONS)\n      assert(remote_writes[i].op_meta.state == ST_EMPTY ||\n             remote_writes[i].op_meta.state == ST_PUT_STALL ||\n             remote_writes[i].op_meta.state == ST_PUT_SUCCESS);\n\n    if (remote_writes[i].op_meta.state == ST_EMPTY) {\n      free_slots++;\n      free_slot_array[i] = 1;\n\n    } else if (free_slots > 0 &&\n               (remote_writes[i].op_meta.state == ST_PUT_STALL ||\n                remote_writes[i].op_meta.state == ST_PUT_SUCCESS)) {\n      int next_free_slot =\n          get_first_free_slot(free_slot_array, last_free_slot, (uint16_t)i);\n\n      if (next_free_slot > -1) {\n        free_slot_array[i] = 1;\n        free_slot_array[next_free_slot] = 0;\n        last_free_slot = (uint16_t)next_free_slot;\n        // swap stalled request to the first free slot\n        memcpy(&remote_writes[next_free_slot], &remote_writes[i],\n               sizeof(spacetime_op_t));\n\n        // empty this slot\n        remote_writes[i].op_meta.state = ST_EMPTY;\n        remote_writes[i].op_meta.opcode = ST_EMPTY;\n      }\n    }\n  }\n\n  if (ENABLE_ASSERTIONS)\n    for (int i = 0; i < max_batch_size; ++i) {\n      if (i < max_batch_size - free_slots)\n        assert(remote_writes[i].op_meta.state == ST_PUT_STALL ||\n               remote_writes[i].op_meta.state == ST_PUT_SUCCESS);\n      else\n        assert(remote_writes[i].op_meta.state == ST_EMPTY);\n    }\n\n  return free_slots;\n}\n\nstatic inline void\ndebugg(spacetime_op_t* ops, uint16_t worker_lid, int line_no)\n{\n  if (w_stats[worker_lid].total_loops > 0)\n    for (int i = 0; i < max_batch_size; ++i) {\n      if (!(ops[i].op_meta.opcode == ST_OP_PUT ||\n            ops[i].op_meta.opcode == ST_OP_GET))\n        printf(\"Line[%d]--> Op[%d]: %s, loop iter: %llu\\n\", line_no, i,\n               code_to_str(ops[i].op_meta.opcode),\n 
              w_stats[worker_lid].total_loops);\n      assert(ops[i].op_meta.opcode == ST_OP_PUT ||\n             ops[i].op_meta.opcode == ST_OP_GET);\n    }\n}\n\nvoid*\nrun_worker(void* arg)\n{\n  assert(rmw_ratio == 0);\n  assert(is_CR == 1);\n  assert(credits_num % machine_num == 0);  // CR ONLY\n  assert(ENABLE_COALESCE_OF_HOT_REQS == 0);\n\n  /// WARNING: only defines (no dynamically passed cli arguments) work for cr\n  /// worker\n  assert(max_coalesce <= MAX_REQ_COALESCE);\n  assert(num_workers <= MAX_WORKERS_PER_MACHINE);\n  assert(max_batch_size <= MAX_BATCH_KVS_OPS_SIZE);\n  assert(credits_num <= MAX_CREDITS_PER_REMOTE_WORKER_CR);\n  const uint16_t credit_num = MAX_CREDITS_PER_REMOTE_WORKER_CR;\n\n  struct thread_params params = *(struct thread_params*)arg;\n  uint16_t worker_lid = (uint16_t)params.id;  // Local ID of this worker thread\n  uint16_t worker_gid =\n      (uint16_t)(machine_id * num_workers +\n                 params.id);  // Global ID of this worker thread\n  // TODO check if the previous assignment (below is the correct one)\n  //    uint16_t worker_gid = (uint16_t) (machine_id * MAX_WORKERS_PER_MACHINE +\n  //    params.id);\t// Global ID of this worker thread\n\n  /* --------------------------------------------------------\n  ------------------- RDMA WINGS DECLARATIONS---------------\n  ---------------------------------------------------------*/\n  ud_channel_t ud_channels[CR_TOTAL_WORKER_UD_QPs];\n  ud_channel_t* ud_channel_ptrs[CR_TOTAL_WORKER_UD_QPs];\n\n  for (int i = 0; i < CR_TOTAL_WORKER_UD_QPs; ++i)\n    ud_channel_ptrs[i] = &ud_channels[i];\n\n  ud_channel_t* inv_ud_c = ud_channel_ptrs[CR_INV_UD_QP_ID];\n  ud_channel_t* inv_crd_ud_c = ud_channel_ptrs[CR_INV_CRD_UD_QP_ID];\n  ud_channel_t* ack_ud_c = ud_channel_ptrs[CR_ACK_UD_QP_ID];\n  ud_channel_t* rem_reads_ud_c = ud_channel_ptrs[CR_REMOTE_READS_UD_QP_ID];\n  ud_channel_t* rem_read_resp_ud_c =\n      ud_channel_ptrs[CR_REMOTE_READS_RESP_UD_QP_ID];\n  ud_channel_t* 
rem_writes_ud_c = ud_channel_ptrs[CR_REMOTE_WRITES_UD_QP_ID];\n  ud_channel_t* rem_writes_crd_ud_c =\n      ud_channel_ptrs[CR_REMOTE_WRITE_CRD_UD_QP_ID];\n\n  const uint8_t is_bcast = 0;\n  const uint8_t stats_on = 1;\n  const uint8_t prints_on = 1;\n  const uint8_t is_hdr_only = 0;\n  const uint8_t expl_crd_ctrl = 0;\n  const uint8_t disable_crd_ctrl = 0;\n\n  char inv_qp_name[200], ack_qp_name[200], rem_writes_qp_name[200],\n      rem_reads_qp_name[200], rem_read_resps_qp_name[200];\n  sprintf(inv_qp_name, \"%s[%d]\", \"\\033[31mINV\\033[0m\", worker_lid);\n  sprintf(ack_qp_name, \"%s[%d]\", \"\\033[33mACK\\033[0m\", worker_lid);\n  sprintf(rem_writes_qp_name, \"%s[%d]\", \"\\033[1m\\033[32mREMOTE_WRITES\\033[0m\",\n          worker_lid);\n  sprintf(rem_reads_qp_name, \"%s[%d]\", \"\\033[1m\\033[32mREMOTE_READS\\033[0m\",\n          worker_lid);\n  sprintf(rem_read_resps_qp_name, \"%s[%d]\",\n          \"\\033[1m\\033[32mREMOTE_READ_RESPS\\033[0m\", worker_lid);\n\n  uint8_t inv_inlining =\n      (DISABLE_INLINING == 0 &&\n       max_coalesce * sizeof(spacetime_inv_t) < WINGS_MAX_SUPPORTED_INLINING)\n          ? 1\n          : 0;\n  uint8_t ack_inlining =\n      (DISABLE_INLINING == 0 &&\n       max_coalesce * sizeof(spacetime_ack_t) < WINGS_MAX_SUPPORTED_INLINING)\n          ? 
1\n          : 0;\n  uint8_t rem_writes_inlining = inv_inlining;\n  uint8_t rem_reads_inlining = inv_inlining;\n\n  if (CR_ENABLE_EARLY_INV_CRDS) {\n    wings_ud_channel_init(inv_ud_c, inv_qp_name, REQ, MAX_REQ_COALESCE,\n                          sizeof(spacetime_inv_t), 0, inv_inlining, is_hdr_only,\n                          is_bcast, disable_crd_ctrl, 1, inv_crd_ud_c,\n                          credit_num, machine_num, (uint8_t)machine_id,\n                          stats_on, prints_on);\n\n    wings_ud_channel_init(\n        ack_ud_c, ack_qp_name, RESP, MAX_REQ_COALESCE, sizeof(spacetime_ack_t),\n        0, ack_inlining, is_hdr_only, is_bcast, 1, expl_crd_ctrl, NULL,\n        CR_ACK_CREDITS, machine_num, (uint8_t)machine_id, stats_on, prints_on);\n  } else {\n    wings_ud_channel_init(inv_ud_c, inv_qp_name, REQ, MAX_REQ_COALESCE,\n                          sizeof(spacetime_inv_t), 0, inv_inlining, is_hdr_only,\n                          is_bcast, disable_crd_ctrl, expl_crd_ctrl, ack_ud_c,\n                          credit_num, machine_num, (uint8_t)machine_id,\n                          stats_on, prints_on);\n\n    wings_ud_channel_init(ack_ud_c, ack_qp_name, RESP, MAX_REQ_COALESCE,\n                          sizeof(spacetime_ack_t), 0, ack_inlining, is_hdr_only,\n                          is_bcast, disable_crd_ctrl, expl_crd_ctrl, inv_ud_c,\n                          credit_num, machine_num, (uint8_t)machine_id,\n                          stats_on, prints_on);\n  }\n\n  const uint16_t cr_remote_write_credits = credit_num / machine_num;\n  wings_ud_channel_init(\n      rem_writes_ud_c, rem_writes_qp_name, REQ, MAX_REQ_COALESCE,\n      sizeof(spacetime_op_t), 0, rem_writes_inlining, is_hdr_only, is_bcast,\n      disable_crd_ctrl, 1, rem_writes_crd_ud_c, cr_remote_write_credits,\n      machine_num, (uint8_t)machine_id, stats_on, prints_on);\n\n  ///////////////\n  ///< 4th stage>\n  if (CR_ENABLE_REMOTE_READS) {\n    wings_ud_channel_init(rem_reads_ud_c, 
rem_reads_qp_name, REQ,\n                          MAX_REQ_COALESCE, sizeof(spacetime_op_t), 0,\n                          rem_reads_inlining, is_hdr_only, is_bcast,\n                          disable_crd_ctrl, expl_crd_ctrl, rem_read_resp_ud_c,\n                          CR_REMOTE_READS_CREDITS, machine_num,\n                          (uint8_t)machine_id, stats_on, prints_on);\n\n    wings_ud_channel_init(rem_read_resp_ud_c, rem_read_resps_qp_name, RESP,\n                          MAX_REQ_COALESCE, sizeof(spacetime_op_t), 0,\n                          rem_reads_inlining, is_hdr_only, is_bcast,\n                          disable_crd_ctrl, expl_crd_ctrl, rem_reads_ud_c,\n                          CR_REMOTE_READS_CREDITS, machine_num,\n                          (uint8_t)machine_id, stats_on, prints_on);\n  }\n  ///</4th stage>\n  ///////////////\n\n  wings_setup_channel_qps_and_recvs(ud_channel_ptrs, CR_TOTAL_WORKER_UD_QPs,\n                                    g_share_qs_barrier, worker_lid);\n\n  /* -------------------------------------------------------\n  ------------------- OTHER DECLARATIONS--------------------\n  ---------------------------------------------------------*/\n  // Intermediate buffs where reqs are copied from incoming_* buffs in order to\n  // get passed to the KVS\n  spacetime_op_t* ops;\n  spacetime_inv_t* inv_recv_ops;\n  spacetime_ack_t* ack_recv_ops;\n  spacetime_val_t* val_recv_ops;  // UNUSED!\n  uint32_t coh_ops_len =\n      (uint32_t)(credits_num * machine_num *\n                 max_coalesce);  // credits * remote_machines * max_req_coalesce\n\n  setup_kvs_buffs(&ops, &inv_recv_ops, &ack_recv_ops, &val_recv_ops);\n\n  // Remote writes init\n  spacetime_op_t* remote_writes =\n      memalign(4096, max_batch_size * (sizeof(spacetime_op_t)));\n  memset(remote_writes, 0, max_batch_size * (sizeof(spacetime_op_t)));\n  for (int i = 0; i < max_batch_size; ++i) {\n    remote_writes[i].op_meta.state = ST_EMPTY;\n    remote_writes[i].op_meta.opcode 
= ST_EMPTY;\n  }\n\n  ///////////////\n  ///< 4th stage>\n  // Remote reads buffer: used for polling remote reads on tail & remote read\n  // responses on the rest nodes\n  spacetime_op_t* remote_reads =\n      memalign(4096, max_batch_size * (sizeof(spacetime_op_t)));\n  memset(remote_reads, 0, max_batch_size * (sizeof(spacetime_op_t)));\n  for (int i = 0; i < max_batch_size; ++i) {\n    remote_reads[i].op_meta.state = ST_EMPTY;\n    remote_reads[i].op_meta.opcode = ST_EMPTY;\n  }\n  ///</4th stage>\n  ///////////////\n\n  struct spacetime_trace_command* trace;\n  trace_init(&trace, worker_gid);\n\n  //// <UNUSED>\n  spacetime_op_t* n_hottest_keys_in_ops_get[COALESCE_N_HOTTEST_KEYS];\n  spacetime_op_t* n_hottest_keys_in_ops_put[COALESCE_N_HOTTEST_KEYS];\n  for (int i = 0; i < COALESCE_N_HOTTEST_KEYS; ++i) {\n    n_hottest_keys_in_ops_get[i] = NULL;\n    n_hottest_keys_in_ops_put[i] = NULL;\n  }\n  ////</UNUSED>\n\n  uint8_t has_outstanding_invs = 0;\n  uint8_t has_outstanding_rem_writes = 0;\n  uint32_t trace_iter = 0;\n  uint16_t rolling_idx = 0, remote_reads_rolling_idx = 0;\n  uint16_t invs_polled = 0, acks_polled = 0, remote_writes_polled = 0;\n  uint32_t num_of_iters_serving_op[MAX_BATCH_KVS_OPS_SIZE] = {0};\n\n  uint16_t free_rem_write_slots = max_batch_size;\n  /// Spawn stats thread\n  if (worker_lid == 0)\n    if (spawn_stats_thread() != 0)\n      colored_printf(RED, \"Stats thread was not successfully spawned \\n\");\n\n  struct timespec stopwatch_for_req_latency;\n  /* -----------------------------------------------------\n ------------------------Main Loop--------------------\n     ----------------------------------------------------- */\n  while (true) {\n    if (unlikely(w_stats[worker_lid].total_loops % M_16 == 0)) {\n      // Check something periodically\n      //\t        print_total_stalls_due_to_credits(inv_ud_c, ack_ud_c,\n      // rem_writes_ud_c, rem_reads_ud_c);\n      //\t\t\tprint_total_send_recv_msgs_n_credits(inv_ud_c,\n      // 
inv_crd_ud_c, ack_ud_c,\n      // rem_writes_ud_c, rem_writes_crd_ud_c,\n      // rem_reads_ud_c, rem_read_resp_ud_c);\n      // print_ops_and_remote_write_ops(ops, remote_writes);\n    }\n\n    /// DONE\n    // 1st stage: head only initiate requests\n    // [DONE] 2nd stage: + rest nodes initiate (local) reads\n    // [DONE] 3rd stage: + rest nodes initiate (remote) writes via head [DONE]\n    // 4th stage: + rest nodes initiate remote reads when invalid    [DONE]\n    // 5th stage: + add early INV credits to pipeline more reqs      [DONE]\n    // 6th stage: + poll for remote writes even though stalled exist [DONE]\n    // 7th stage: + poll for messages instead of pkts (ie if you have\n    //              empty space buff slots < max_coalesce poll pkt\n    //              and buffer additional packets                    [DONE]\n    // 8th stage: + Do not stall writes that found Invalid on head   [DONE]\n\n    if (!CR_ENABLE_ONLY_HEAD_REQS || machine_id == head_id()) {\n      refill_ops(&trace_iter, worker_lid, trace, ops, num_of_iters_serving_op,\n                 &stopwatch_for_req_latency, n_hottest_keys_in_ops_get,\n                 n_hottest_keys_in_ops_put);\n      cr_batch_ops_to_KVS(Local_ops, (uint8_t*)ops, max_batch_size,\n                          sizeof(spacetime_op_t), NULL);\n\n      // TODO: moved\n      stop_latency_of_completed_reads(ops, worker_lid,\n                                      &stopwatch_for_req_latency);\n    }\n\n    if (update_ratio > 0) {\n      if (machine_id == head_id()) {\n        const uint16_t max_outstanding_writes =\n            (machine_num - 1) * CR_ACK_CREDITS;\n\n        if (!CR_ENABLE_EARLY_INV_CRDS ||\n            inv_ud_c->stats.send_total_msgs - ack_ud_c->stats.recv_total_msgs <=\n                max_outstanding_writes) {  /// Initiate INVs for head writes\n          wings_issue_pkts(\n              inv_ud_c, NULL, (uint8_t*)ops, max_batch_size,\n              sizeof(spacetime_op_t), &rolling_idx, 
inv_skip_or_get_sender_id,\n              inv_modify_elem_after_send, inv_copy_and_modify_elem);\n        }\n\n        ///////////////\n        ///< 3rd stage>\n        if (!CR_ENABLE_ONLY_HEAD_REQS) {\n          wings_poll_buff_and_post_recvs(\n              rem_writes_ud_c, free_rem_write_slots,\n              (uint8_t*)&remote_writes[max_batch_size - free_rem_write_slots]);\n\n          cr_batch_ops_to_KVS(Remote_writes, (uint8_t*)remote_writes,\n                              max_batch_size, sizeof(spacetime_op_t), NULL);\n\n          if (!CR_ENABLE_EARLY_INV_CRDS ||\n              inv_ud_c->stats.send_total_msgs -\n                      ack_ud_c->stats.recv_total_msgs <=\n                  max_outstanding_writes) {  /// Initiate INVs for remotes\n                                             /// writes\n            wings_issue_pkts(inv_ud_c, NULL, (uint8_t*)remote_writes,\n                             max_batch_size, sizeof(spacetime_op_t), NULL,\n                             remote_write_head_skip_or_get_sender_id,\n                             remote_write_head_modify_elem_after_send,\n                             remote_write_head_copy_and_modify_elem);\n\n            /// Issue credits for remotes writes\n            wings_issue_credits(rem_writes_crd_ud_c, NULL,\n                                (uint8_t*)remote_writes, max_batch_size,\n                                sizeof(spacetime_op_t),\n                                rem_write_crd_skip_or_get_sender_id,\n                                rem_write_crd_modify_elem_after_send);\n          }\n\n          free_rem_write_slots =\n              cr_move_stalled_writes_to_top_n_return_free_space(remote_writes);\n        }\n\n      } else if (!CR_ENABLE_ONLY_HEAD_REQS)\n        /// Initiate Remote writes\n        wings_issue_pkts(rem_writes_ud_c, NULL, (uint8_t*)ops, max_batch_size,\n                         sizeof(spacetime_op_t), &rolling_idx,\n                         remote_write_skip_or_get_sender_id,\n      
                   inv_modify_elem_after_send,\n                         remote_write_copy_and_modify_elem);\n\n      ///</3rd stage>\n      ///////////////\n\n      ///////////////\n      ///< 4th stage>\n      if (CR_ENABLE_REMOTE_READS) {\n        if (machine_id == tail_id()) {\n          /// Poll Remote reads\n          uint16_t remote_reads_polled = wings_poll_buff_and_post_recvs(\n              rem_reads_ud_c, max_batch_size, (uint8_t*)remote_reads);\n\n          /// Batch Remote reads to KVS\n          cr_batch_ops_to_KVS(Remote_reads, (uint8_t*)remote_reads,\n                              remote_reads_polled, sizeof(spacetime_op_t),\n                              NULL);\n\n          /// Issue responses of Remote reads\n          wings_issue_pkts(rem_read_resp_ud_c, NULL, (uint8_t*)remote_reads,\n                           remote_reads_polled, sizeof(spacetime_op_t), NULL,\n                           remote_read_resp_skip_or_get_sender_id,\n                           remote_read_resp_modify_elem_after_send,\n                           remote_read_resp_copy_and_modify_elem);\n\n        } else {\n          /// Initiate Remote reads\n          wings_issue_pkts(rem_reads_ud_c, NULL, (uint8_t*)ops, max_batch_size,\n                           sizeof(spacetime_op_t), &remote_reads_rolling_idx,\n                           remote_read_skip_or_get_sender_id,\n                           remote_read_modify_elem_after_send,\n                           remote_read_copy_and_modify_elem);\n\n          for (int i = 0; i < max_batch_size; i++)\n            assert(ops[i].op_meta.opcode == ST_OP_PUT ||\n                   ops[i].op_meta.opcode == ST_OP_GET);\n\n          /// Poll respsonses of Remote reads\n          uint16_t remote_read_resps_polled = wings_poll_buff_and_post_recvs(\n              rem_read_resp_ud_c, max_batch_size, (uint8_t*)remote_reads);\n          /// Complete Remote reads\n          cr_complete_local_reads(remote_reads, remote_read_resps_polled, ops);\n   
       stop_latency_of_completed_reads(ops, worker_lid,\n                                          &stopwatch_for_req_latency);\n\n          for (int i = 0; i < max_batch_size; i++)\n            assert(ops[i].op_meta.opcode == ST_OP_PUT ||\n                   ops[i].op_meta.opcode == ST_OP_GET);\n        }\n      }\n      ///</4th stage>\n      ///////////////\n\n      if (machine_id != head_id()) {\n        /// Poll for INVs\n        if (has_outstanding_invs == 0) {\n          invs_polled = wings_poll_buff_and_post_recvs(inv_ud_c, coh_ops_len,\n                                                       (uint8_t*)inv_recv_ops);\n\n          if (invs_polled > 0) {\n            /// Batch INVs to KVS\n            cr_batch_ops_to_KVS(Invs, (uint8_t*)inv_recv_ops, invs_polled,\n                                sizeof(spacetime_inv_t), ops);\n\n            if (CR_ENABLE_EARLY_INV_CRDS)\n              /// Issue credits for INVs to previous node in chain\n              wings_issue_credits(inv_crd_ud_c, NULL, (uint8_t*)inv_recv_ops,\n                                  invs_polled, sizeof(spacetime_inv_t),\n                                  inv_crd_skip_or_get_sender_id,\n                                  inv_crd_modify_elem_after_send);\n          }\n        }\n\n        if (invs_polled > 0) {\n          /// Batch INVs to KVS\n          if (machine_id != tail_id() && machine_id != head_id())\n            /// Forward INVS to next node in chain\n            has_outstanding_invs = wings_issue_pkts(\n                inv_ud_c, NULL, (uint8_t*)inv_recv_ops, invs_polled,\n                sizeof(spacetime_inv_t), NULL, inv_skip_or_fwd_to_next_node,\n                inv_fwd_modify_elem_after_send, inv_fwd_copy_and_modify_elem);\n\n          else if (machine_id == tail_id()) {\n            /// Initiate ACKS (forward to prev)\n            has_outstanding_invs = wings_issue_pkts(\n                ack_ud_c, NULL, (uint8_t*)inv_recv_ops, invs_polled,\n                sizeof(spacetime_inv_t), 
NULL, ack_skip_or_get_sender_id,\n                ack_modify_elem_after_send, ack_copy_and_modify_elem);\n            if (ENABLE_ASSERTIONS)\n              assert(ack_ud_c->stats.send_total_msgs ==\n                     inv_ud_c->stats.recv_total_msgs -\n                         inv_ud_c->num_overflow_msgs);\n          }\n        }\n      }\n\n      if (machine_id != tail_id()) {\n        /// Poll for Acks\n        acks_polled = wings_poll_buff_and_post_recvs(ack_ud_c, coh_ops_len,\n                                                     (uint8_t*)ack_recv_ops);\n\n        if (acks_polled > 0) {\n          /// Batch ACKs to KVS\n          cr_batch_ops_to_KVS(Acks, (uint8_t*)ack_recv_ops, acks_polled,\n                              sizeof(spacetime_ack_t), ops);\n\n          stop_latency_of_completed_writes(ops, worker_lid,\n                                           &stopwatch_for_req_latency);\n        }\n\n        if (machine_id != head_id()) {\n          /// FWD ACKs to previous node if not the Head\n          wings_issue_pkts(\n              ack_ud_c, NULL, (uint8_t*)ack_recv_ops, acks_polled,\n              sizeof(spacetime_ack_t), NULL, ack_fwd_skip_or_get_sender_id,\n              ack_fwd_modify_elem_after_send, ack_fwd_copy_and_modify_elem);\n          if (ENABLE_ASSERTIONS)\n            assert(ack_ud_c->stats.send_total_msgs ==\n                   ack_ud_c->stats.recv_total_msgs -\n                       ack_ud_c->num_overflow_msgs);\n\n        } else  /// empty ack_rcv_ops in head node\n          for (int i = 0; i < coh_ops_len; ++i)\n            ack_recv_ops[i].opcode = ST_EMPTY;\n      }\n    }\n    w_stats[worker_lid].total_loops++;\n  }\n\n  return NULL;\n}\n"
  },
  {
    "path": "src/hades/hades.c",
    "content": "//\n// Created by akatsarakis on 12/02/19.\n//\n\n#include \"../../include/hades/hades.h\"\n#include <getopt.h>\n\ntypedef struct {\n  hades_view_t* ctx_last_local_view;\n  uint8_t dst_id;\n} hades_view_wrapper_w_dst_id_t;\n\nint\nhades_skip_or_get_dst_id(uint8_t* req)\n{\n  return ((hades_view_wrapper_w_dst_id_t*)req)->dst_id;\n}\n\nvoid\nhades_copy_and_modify_elem(uint8_t* msg_to_send, uint8_t* triggering_req)\n{\n  hades_view_wrapper_w_dst_id_t* last_local_view =\n      (hades_view_wrapper_w_dst_id_t*)triggering_req;\n  hades_view_t* send_hbt = (hades_view_t*)(msg_to_send - 1);\n\n  *send_hbt = *last_local_view->ctx_last_local_view;\n}\n\nint\nhades_crd_skip_or_get_sender_id(uint8_t* req)\n{\n  hades_view_t* req_hbt = (hades_view_t*)req;\n  return req_hbt->node_id;  // always send crd\n}\n\nstatic inline void\nprint_send_hbt(ud_channel_t* hbeat_c, hades_ctx_t* ctx)\n{\n  colored_printf(YELLOW, \"Send view[%lu]: {node %d, epoch_id %d} \",\n                 hbeat_c->stats.send_total_msgs,\n                 ctx->intermediate_local_view.node_id,\n                 ctx->intermediate_local_view.epoch_id);\n  bv_print_enhanced(ctx->curr_g_membership);\n  printf(\"\\n\");\n}\n\nstatic inline void\nprint_recved_hbts(ud_channel_t* hbeat_c, hades_view_t* hbt_array,\n                  uint16_t no_hbts)\n{\n  for (int i = 0; i < no_hbts; ++i) {\n    colored_printf(GREEN, \"Recved view[%lu]: {node %d, epoch_id %d} \",\n                   hbeat_c->stats.recv_total_msgs, hbt_array[i].node_id,\n                   hbt_array[i].epoch_id);\n    bv_print_enhanced(hbt_array[i].view);\n    printf(\"\\n\");\n  }\n}\n\nstatic inline uint8_t\nmajority_of_nodes(hades_ctx_t* ctx)\n{\n  assert(ctx->max_num_nodes > 1);\n  return (uint8_t)(ctx->max_num_nodes == 2 ? 
2 : (ctx->max_num_nodes / 2) + 1);\n}\n\nstatic inline void\ncheck_if_majority_is_rechable(hades_ctx_t* h_ctx)\n{\n  if (bv_no_setted_bits(h_ctx->last_local_view.view) >=\n          majority_of_nodes(h_ctx) &&\n      bv_no_setted_bits(h_ctx->intermediate_local_view.view) <\n          majority_of_nodes(h_ctx)) {\n    colored_printf(RED, \"~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\\n\");\n    colored_printf(RED, \"~ [HADES WARNING]: I cannot reach a majority ! ~\\n\");\n    colored_printf(RED, \"~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\\n\");\n    colored_printf(YELLOW, \"Last membership (epoch %d): \",\n                   h_ctx->intermediate_local_view.epoch_id);\n    bv_print_enhanced(h_ctx->curr_g_membership);\n    colored_printf(YELLOW, \"My current view: \");\n    bv_print_enhanced(h_ctx->intermediate_local_view.view);\n    colored_printf(RED, \"~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\\n\");\n  }\n}\n\nstatic inline uint8_t\nskip_to_apply_fake_link_failure(uint8_t node_id)\n{\n  static uint8_t ts_is_inited = 0;\n  static uint8_t link_has_failed = 0;\n  static struct timespec ts_fake_link_failure;\n\n  if ((machine_id == FAKE_LINK_FAILURE_NODE_A &&\n       node_id == FAKE_LINK_FAILURE_NODE_B) ||\n      (!FAKE_ONE_WAY_LINK_FAILURE && node_id == FAKE_LINK_FAILURE_NODE_A &&\n       machine_id == FAKE_LINK_FAILURE_NODE_B)) {\n    if (ts_is_inited == 0) {\n      get_rdtsc_timespec(&ts_fake_link_failure);\n      ts_is_inited = 1;\n    }\n\n    if (time_elapsed_in_sec(ts_fake_link_failure) >\n            FAKE_LINK_FAILURE_AFTER_SEC &&\n        time_elapsed_in_sec(ts_fake_link_failure) <\n            STOP_FAKE_LINK_FAILURE_AFTER_SEC) {\n      if (link_has_failed == 0) {\n        colored_printf(RED, \"%sLink failure between node %d and %d\\n\",\n                       FAKE_ONE_WAY_LINK_FAILURE ? 
\"One-way \" : \"\",\n                       FAKE_LINK_FAILURE_NODE_A, FAKE_LINK_FAILURE_NODE_B);\n        link_has_failed = 1;\n      }\n      return 1;\n    }\n  }\n  return 0;\n}\n\nstatic inline uint8_t\nis_in_membership(hades_ctx_t* h_ctx, uint8_t node_id)\n{\n  return bv_bit_get(h_ctx->curr_g_membership, node_id);\n}\n\n// Skip iterations for arbitration:\nstatic inline uint8_t\nskip_arbitration(hades_ctx_t* h_ctx, uint8_t i)\n{\n  if (i == machine_id) return 1;  // 1. my local machine id\n  if (!h_ctx->recved_views_flag[i])\n    return 1;  // 2. machine ids that I have not received a view\n  //    if(!is_in_membership(h_ctx, i)) return 1;   // 3. machine ids that are\n  //    not currently in the group membership\n  if (h_ctx->remote_recved_views[i].have_ostracised_for_dst_node == 1)\n    return 1;  // 3. this node has not already ostracise someone for me\n  if (!bv_bit_get(h_ctx->remote_recved_views[i]\n                      .view,  // 4. If my node id does not exist in their view\n                  machine_id))\n    return 1;\n  return 0;\n}\n\n// In case of a link failure (either both or one way) between nodes A and B.\n// Rest of nodes would be able to detect such a failure using its received views\n// and resolve this deterministically by choosing the one with the highest node\n// id to be expelled from the group membership. 
Once a node is voted to be\n// expelled by the majority of nodes it gets removed from the membership, this\n// method is inspired by the \"ostracism\" procedure under the Athenian democracy\n// in which any citizen could be expelled from the city of Athens for ten years.\n\n// If a node has ostracised somebody for me I cannot ostracised somebody for him\nstatic inline void\nview_arbitration_via_ostracism(hades_ctx_t* h_ctx)\n{\n  for (uint8_t i = 0; i < h_ctx->max_num_nodes; ++i)\n    h_ctx->have_ostracized_for[i] = 0;\n\n  for (uint8_t i = 0; i < h_ctx->max_num_nodes; ++i) {\n    if (skip_arbitration(h_ctx, i)) continue;\n\n    for (uint8_t j = 0; j < h_ctx->max_num_nodes; ++j) {\n      if (i >= j) continue;  // for efficiency we do not need to check those\n      if (skip_arbitration(h_ctx, j)) continue;\n\n      uint8_t i_view_of_j = bv_bit_get(h_ctx->remote_recved_views[i].view, j);\n      uint8_t j_view_of_i = bv_bit_get(h_ctx->remote_recved_views[j].view, i);\n\n      if (i_view_of_j == 0 || j_view_of_i == 0) {\n        // by default always ostracise this to the Max(i, j) --> j is always > i\n        // unless it's an one way failure from the opposite side where we have\n        // to ostracise i\n        uint8_t node_to_ostracise = i_view_of_j == 1 ? i : j;\n        uint8_t node_to_ostracised_for = i_view_of_j == 1 ? 
j : i;\n\n        h_ctx->recved_views_flag[node_to_ostracise] = 0;\n        h_ctx->have_ostracized_for[node_to_ostracised_for] = 1;\n        bv_bit_reset(&h_ctx->intermediate_local_view.view, node_to_ostracise);\n\n        //                yellow_printf(\"Ostracism: between nodes %d-%d --> %d\n        //                is ostracized\\n\", i, j, node_to_ostracise); printf(\"My\n        //                view: (epoch %d)\\n\",\n        //                h_ctx->intermediate_local_view.epoch_id);\n        //                bv_print_enhanced(h_ctx->intermediate_local_view.view);\n      }\n    }\n  }\n}\n\nstatic inline uint8_t\nget_max_received_epoch_id(hades_ctx_t* h_ctx)\n{\n  uint8_t max_epoch_id = 0;\n  for (int i = 0; i < h_ctx->max_num_nodes; ++i)\n    if (h_ctx->recved_views_flag[i] == 1 &&\n        h_ctx->remote_recved_views[i].epoch_id > max_epoch_id)\n      max_epoch_id = h_ctx->remote_recved_views[i].epoch_id;\n  return max_epoch_id;\n}\n\nstatic inline void\nupdate_view_n_membership(hades_ctx_t* h_ctx)\n{\n  if (time_elapsed_in_ms(h_ctx->ts_last_view_change) >\n      h_ctx->update_local_view_every_ms) {\n    get_rdtsc_timespec(&h_ctx->ts_last_view_change);  // Reset timer\n\n    uint8_t views_aggreeing = 1;  // (always agree with my local view)\n    uint8_t same_w_local_membership = 0;\n    uint16_t max_epoch_id = h_ctx->intermediate_local_view.epoch_id;\n\n    if (ENABLE_ARBITRATION) view_arbitration_via_ostracism(h_ctx);\n\n    // if view has changed update ctx\n    if (!bv_are_equal(h_ctx->intermediate_local_view.view,\n                      h_ctx->curr_g_membership) ||\n        get_max_received_epoch_id(h_ctx) >\n            h_ctx->intermediate_local_view.epoch_id) {\n      for (int i = 0; i < h_ctx->max_num_nodes; ++i) {\n        if (i == machine_id) continue;\n        if (h_ctx->recved_views_flag[i] == 0) continue;\n\n        if (bv_are_equal(h_ctx->intermediate_local_view.view,\n                         h_ctx->remote_recved_views[i].view)) {\n        
  views_aggreeing++;\n          if (max_epoch_id < h_ctx->remote_recved_views[i].epoch_id) {\n            max_epoch_id = h_ctx->remote_recved_views[i].epoch_id;\n            same_w_local_membership =\n                h_ctx->remote_recved_views[i].same_w_local_membership;\n          }\n        }\n        h_ctx->recved_views_flag[i] = 0;  // reset the received flag\n      }\n\n      if (views_aggreeing >= majority_of_nodes(h_ctx)) {\n        h_ctx->intermediate_local_view.epoch_id =\n            (uint8_t)(max_epoch_id + (same_w_local_membership == 1 ? 0 : 1));\n        bv_copy(&h_ctx->curr_g_membership, h_ctx->intermediate_local_view.view);\n\n        //                printf(\"Max epoch id: %d, same_w_local_membership:\n        //                %d\\n\",\n        //                        max_epoch_id, same_w_local_membership);\n        colored_printf(YELLOW, \"[HADES] MEMBERSHIP CHANGE --> [epoch %d], \",\n                       h_ctx->intermediate_local_view.epoch_id);\n        bv_print(h_ctx->curr_g_membership);\n        printf(\"\\n\");\n        //                bv_print_enhanced(h_ctx->curr_g_membership);\n      }\n    }\n\n    check_if_majority_is_rechable(h_ctx);\n\n    // update last local view\n    h_ctx->last_local_view = h_ctx->intermediate_local_view;\n    h_ctx->last_local_view.same_w_local_membership =\n        bv_are_equal(h_ctx->last_local_view.view, h_ctx->curr_g_membership);\n\n    // Reset local view\n    bv_reset_all(&h_ctx->intermediate_local_view.view);\n    bv_bit_set(&h_ctx->intermediate_local_view.view, (uint8_t)machine_id);\n  }\n}\n\nstatic inline void\nissue_heartbeats(hades_wings_ctx_t* hw_ctx)\n{\n  hades_ctx_t* h_ctx = &hw_ctx->ctx;\n  hades_view_wrapper_w_dst_id_t last_local_view;\n\n  last_local_view.ctx_last_local_view = &h_ctx->last_local_view;\n\n  for (uint8_t i = 0; i < h_ctx->max_num_nodes; ++i) {\n    h_ctx->last_local_view.have_ostracised_for_dst_node =\n        h_ctx->have_ostracized_for[i];\n    if (i == machine_id) 
continue;\n    if (FAKE_LINK_FAILURE && skip_to_apply_fake_link_failure(i)) continue;\n\n    last_local_view.dst_id = i;\n    if (time_elapsed_in_us(h_ctx->ts_last_send[i]) >\n        h_ctx->send_view_every_us) {\n      // Reset a tmp timer in case the send fails due to not enough crds\n      struct timespec ts_last_send_tmp;\n      get_rdtsc_timespec(&ts_last_send_tmp);\n      uint8_t send_failed = wings_issue_pkts(\n          hw_ctx->hviews_c, NULL, (uint8_t*)&last_local_view, 1,\n          sizeof(hades_view_wrapper_w_dst_id_t), NULL, hades_skip_or_get_dst_id,\n          wings_NOP_modify_elem_after_send, hades_copy_and_modify_elem);\n      if (!send_failed) h_ctx->ts_last_send[i] = ts_last_send_tmp;\n      //                print_send_hbt(hw_ctx->hviews_c, h_ctx);\n    }\n  }\n}\n\n// static inline\nvoid\nupdate_view_and_issue_hbs(hades_wings_ctx_t* hw_ctx)\n{\n  update_view_n_membership(&hw_ctx->ctx);\n\n  issue_heartbeats(hw_ctx);\n}\n\n// static inline\nuint16_t\npoll_for_remote_views(hades_wings_ctx_t* hw_ctx)\n{\n  hades_ctx_t* h_ctx = &hw_ctx->ctx;\n\n  // Poll for membership send\n  uint16_t views_polled = wings_poll_buff_and_post_recvs(\n      hw_ctx->hviews_c, h_ctx->max_views_to_poll, (uint8_t*)h_ctx->poll_buff);\n\n  //    print_recved_hbts(hw_ctx->hviews_c, h_ctx->poll_buff, views_polled);\n\n  for (int i = 0; i < views_polled; ++i) {\n    uint8_t sender_id = h_ctx->poll_buff[i].node_id;\n    h_ctx->recved_views_flag[sender_id] = 1;\n    h_ctx->remote_recved_views[sender_id] = h_ctx->poll_buff[i];\n    bv_bit_set(&h_ctx->intermediate_local_view.view, sender_id);\n\n    // In case somebody tries to rejoin\n    if (h_ctx->last_local_view.epoch_id > 1)\n      if (h_ctx->poll_buff[i].epoch_id == 0 &&\n          hw_ctx->hviews_c->credits_per_channels[sender_id] == 0) {\n        /// Need to reset its credits and reconfigure the qps to start sending\n        /// views again Warning: currently we share qp info via memcache so if\n        /// node storing 
memcache (e.g. houston)\n        ///         fails we cannot make him re-join (prev qp info are lost)\n        printf(\"Resetting credits and reconfiguring ibv_qps for channel: %d\\n\",\n               sender_id);\n        wings_reset_credits(hw_ctx->hviews_c, sender_id);\n        wings_reconfigure_wrs_ah(hw_ctx->hviews_c, sender_id);\n      }\n  }\n\n  wings_issue_credits(hw_ctx->hviews_crd_c, NULL, (uint8_t*)h_ctx->poll_buff,\n                      views_polled, sizeof(hades_view_t),\n                      hades_crd_skip_or_get_sender_id,\n                      wings_NOP_modify_elem_after_send);\n\n  return views_polled;\n}\n\nvoid*\nhades_loop_only_thread(void* hades_wings_ctx)\n{\n  hades_wings_ctx_t* hw_ctx = hades_wings_ctx;\n\n  uint64_t no_iters = 0;\n  while (true) {\n    /// Print every X iteration (Mainly for dbging)\n    no_iters++;\n    if (no_iters % M_32 == 0) {\n      //            printf(\"My view: (epoch %d)\\n\",\n      //            hw_ctx->ctx.intermediate_local_view.epoch_id);\n      //            bv_print_enhanced(hw_ctx->ctx.intermediate_local_view.view);\n    }\n\n    /// Main loop\n    update_view_and_issue_hbs(hw_ctx);\n\n    poll_for_remote_views(hw_ctx);\n  }\n}\n\nvoid*\nhades_full_thread(void* node_id)\n{\n  //////////////////////////////////\n  /// failure detector context init\n  //////////////////////////////////\n\n  /// Wings (rdma communication) init\n  ud_channel_t* ud_c_ptrs[2];\n  ud_channel_t ud_channels[2];\n\n  for (int i = 0; i < 2; ++i)\n    ud_c_ptrs[i] = &ud_channels[i];\n\n  ud_channel_t* hviews_c = ud_c_ptrs[0];\n  ud_channel_t* hviews_crd_c = ud_c_ptrs[1];\n\n  // other Vars\n  uint8_t machine_num = 3;\n  uint16_t worker_lid = 0;\n  uint16_t max_views_to_poll = 10;\n  uint32_t send_view_every_us = 100;\n  uint32_t update_local_view_ms = 10;\n\n  uint8_t _node_id = *((uint8_t*)node_id);\n\n  hades_wings_ctx_t w_ctx;\n  hades_wings_ctx_init(&w_ctx, _node_id, machine_num, max_views_to_poll,\n                       
send_view_every_us, update_local_view_ms, hviews_c,\n                       hviews_crd_c, worker_lid);\n\n  wings_setup_channel_qps_and_recvs(ud_c_ptrs, 2, NULL, 0);\n\n  hades_loop_only_thread(&w_ctx);\n\n  return NULL;\n}\n"
  },
  {
    "path": "src/hades/test.c",
    "content": "//\n// Created by akatsarakis on 21/05/19.\n//\n\n#include <getopt.h>\n#include \"../../include/hades/hades.h\"\n\nint\nmain(int argc, char* argv[])\n{\n  machine_id = -1;\n\n  static struct option opts[] = {\n      {.name = \"machine-id\", .has_arg = 1, .val = 'm'},\n      {.name = \"dev-name\", .has_arg = 1, .val = 'd'},\n      {0}};\n\n  /* Parse and check arguments */\n  while (1) {\n    int c = getopt_long(argc, argv, \"m:d:\", opts, NULL);\n    if (c == -1) {\n      break;\n    }\n    switch (c) {\n      case 'm':\n        machine_id = atoi(optarg);\n        break;\n      case 'd':\n        memcpy(dev_name, optarg, strlen(optarg));\n        break;\n      default:\n        printf(\"Invalid argument %d\\n\", c);\n        assert(false);\n    }\n  }\n\n  hades_full_thread(&machine_id);\n}\n"
  },
  {
    "path": "src/hermes/hermesKV.c",
    "content": "//\n// Created by akatsarakis on 07/03/19.\n//\n\n#include <inline-util.h>\n#include <spacetime.h>\n\n//////////////////////////////////////////////////\n/////////////////////// HERMES KVS (SPACETIME)\n//////////////////////////////////////////////////\n\n//////////// Assertion functions\n\nstatic inline void\nhermes_assertions_begin_inv(spacetime_inv_t* inv_ptr)\n{\n  assert(inv_ptr->op_meta.ts.version % 2 == 0);\n  assert(inv_ptr->op_meta.opcode == ST_OP_INV ||\n         inv_ptr->op_meta.opcode == ST_OP_MEMBERSHIP_CHANGE);\n  assert(inv_ptr->op_meta.val_len == (ST_VALUE_SIZE >> SHIFT_BITS));\n  assert(remote_machine_num != 1 ||\n         inv_ptr->op_meta.sender == remote_machine_num - machine_id);\n  assert(remote_machine_num != 1 ||\n         inv_ptr->op_meta.ts.tie_breaker_id == remote_machine_num - machine_id);\n  //\t\t\tred_printf(\"INVs: Ops[%d]vvv hash(1st 8B):%\" PRIu64 \"\n  // version: %d, tie: %d\\n\", I,\n  //\t\t\t\t\t   ((uint64_t *) &(*op)[I].key)[0],\n  //(*op)[I].version,\n  //(*op)[I].tie_breaker_id);\n}\n\nstatic inline void\nhermes_assertions_begin_ack(spacetime_ack_t* ack_ptr)\n{\n  assert(ack_ptr->ts.version % 2 == 0);\n  assert(remote_machine_num != 1 ||\n         ack_ptr->sender == remote_machine_num - machine_id);\n  assert(ack_ptr->opcode == ST_OP_ACK || ack_ptr->opcode == ST_OP_INV_ABORT ||\n         ack_ptr->opcode == ST_OP_MEMBERSHIP_CHANGE);\n\n  /// WARNING the following assertion is incorrect for write replays\n  //\tassert(group_membership.num_of_alive_remotes != MAX_REMOTE_MACHINES ||\n  //\t       ack_ptr->opcode == ST_OP_INV_ABORT ||\n  //\t\t   ack_ptr->ts.tie_breaker_id == machine_id ||\n  //\t\t   (ENABLE_VIRTUAL_NODE_IDS && ack_ptr->ts.tie_breaker_id  %\n  // MAX_MACHINE_NUM == machine_id));\n\n  //\t\t\tyellow_printf(\"ACKS: Ops[%d]vvv hash(1st 8B):%\" PRIu64 \"\n  // version: %d, tie: %d\\n\", I,\n  //\t\t\t\t\t   ((uint64_t *) &(*op)[I].key)[0],\n  //(*op)[I].version,\n  
//(*op)[I].tie_breaker_id);\n}\n\nstatic inline void\nhermes_assertions_begin_val(spacetime_val_t* val_ptr)\n{\n  assert(val_ptr->ts.version % 2 == 0);\n  assert(val_ptr->opcode == ST_OP_VAL);\n  assert(remote_machine_num != 1 ||\n         val_ptr->sender == remote_machine_num - machine_id);\n  assert(remote_machine_num != 1 ||\n         val_ptr->ts.tie_breaker_id == remote_machine_num - machine_id);\n  //\t\t\tgreen_printf(\"VALS: Ops[%d]vvv hash(1st 8B):%\" PRIu64 \"\n  // version: %d, tie: %d\\n\", I,\n  //\t\t\t\t\t   ((uint64_t *) &(*op)[I].key)[0],\n  //(*op)[I].version,\n  //(*op)[I].tie_breaker_id);\n}\n\nstatic inline void\nhermes_assertions_end_read_write_ops(spacetime_op_t* read_write_op)\n{\n  for (int i = 0; i < max_batch_size; ++i)\n    assert(read_write_op[i].op_meta.opcode == ST_OP_GET ||\n           read_write_op[i].op_meta.state == ST_MISS ||\n           read_write_op[i].op_meta.state == ST_PUT_STALL ||\n           read_write_op[i].op_meta.state == ST_PUT_SUCCESS ||\n           read_write_op[i].op_meta.state == ST_PUT_COMPLETE ||\n           read_write_op[i].op_meta.state == ST_IN_PROGRESS_PUT ||\n           read_write_op[i].op_meta.state == ST_RMW_STALL ||\n           read_write_op[i].op_meta.state == ST_RMW_ABORT ||\n           read_write_op[i].op_meta.state == ST_RMW_SUCCESS ||\n           read_write_op[i].op_meta.state == ST_RMW_COMPLETE ||\n           read_write_op[i].op_meta.state == ST_IN_PROGRESS_RMW ||\n           read_write_op[i].op_meta.state ==\n               ST_OP_MEMBERSHIP_CHANGE ||  /// TODO check this\n           read_write_op[i].op_meta.state == ST_IN_PROGRESS_REPLAY);\n}\n\n/// Helper functions\n\n// TODO inlining this function by hand can give higher xPut ~5% on 20% write\n// rate\nstatic inline __attribute__((always_inline)) void\nhermes_lock_free_read_obj_meta(spacetime_object_meta* lock_free_read_meta,\n                               spacetime_object_meta* curr_meta)\n{\n  uint32_t debug_cntr = 0;\n  do {  // Lock free read 
of keys meta\n    if (ENABLE_ASSERTIONS) {\n      debug_cntr++;\n      if (debug_cntr == M_4) {\n        printf(\"Worker stuck on a lock-free read (for ACK)\\n\");\n        debug_cntr = 0;\n      }\n    }\n    *lock_free_read_meta = *curr_meta;\n  } while (!cctrl_timestamp_is_same_and_valid(&lock_free_read_meta->cctrl,\n                                              &curr_meta->cctrl));\n}\n\nstatic uint64_t g_seed = 0xdeadbeef;\nstatic inline void\nhermes_update_actions_n_unlock(spacetime_op_t* op_ptr, struct mica_op* kv_ptr,\n                               spacetime_object_meta* curr_meta, uint8_t idx,\n                               spacetime_group_membership curr_membership,\n                               uint8_t RMW_flag)\n{\n  if (ENABLE_ASSERTIONS) {\n    assert(RMW_flag == 0 || ENABLE_RMWs);\n    assert(idx < ST_OP_BUFFER_INDEX_EMPTY);\n  }\n\n  /// Copy value and update len\n  uint8_t* kv_value_ptr = (uint8_t*)&curr_meta[1];\n  memcpy(kv_value_ptr, op_ptr->value, ST_VALUE_SIZE);\n  kv_ptr->val_len = set_val_len(&op_ptr->op_meta);\n\n  /// update keys metadata and unlock\n  curr_meta->RMW_flag = RMW_flag;\n  curr_meta->state = WRITE_STATE;\n  curr_meta->op_buffer_index = (uint8_t)idx;\n  curr_meta->last_local_write_ts.version =\n      curr_meta->cctrl.ts.version + (!ENABLE_RMWs || RMW_flag == 1 ? 1 : 3);\n  // update group membership mask\n  bv_copy((bit_vector_t*)&curr_meta->ack_bv, curr_membership.w_ack_init);\n\n  uint8_t v_node_id =\n      (uint8_t)(!ENABLE_VIRTUAL_NODE_IDS\n                    ? 
machine_id\n                    : machine_id + machine_num * (hrd_fastrand(&g_seed) %\n                                                  VIRTUAL_NODE_IDS_PER_NODE));\n  curr_meta->last_local_write_ts.tie_breaker_id = v_node_id;\n\n  if (!ENABLE_RMWs || RMW_flag == 1)\n    cctrl_unlock_inc_version(&curr_meta->cctrl, v_node_id,\n                             (uint32_t*)&(op_ptr->op_meta.ts.version));\n  else\n    cctrl_unlock_inc_version_by_three(&curr_meta->cctrl, v_node_id,\n                                      (uint32_t*)&(op_ptr->op_meta.ts.version));\n\n  /// update op_ptr metadata\n  op_ptr->RMW_flag = RMW_flag;\n  op_ptr->op_meta.state = RMW_flag == 1 ? ST_RMW_SUCCESS : ST_PUT_SUCCESS;\n  op_ptr->op_meta.ts.tie_breaker_id = v_node_id;\n}\n\nstatic inline void\nhermes_local_state_to_op(spacetime_op_t* op_ptr,\n                         spacetime_object_meta* keys_meta)\n{\n  uint8_t* kv_value_ptr = (uint8_t*)&keys_meta[1];\n  op_ptr->RMW_flag = keys_meta->RMW_flag;\n  op_ptr->op_meta.state = ST_REPLAY_SUCCESS;\n  op_ptr->op_meta.ts.version = keys_meta->cctrl.ts.version - 1;\n  op_ptr->op_meta.ts.tie_breaker_id = keys_meta->cctrl.ts.tie_breaker_id;\n  op_ptr->op_meta.val_len = ST_VALUE_SIZE >> SHIFT_BITS;\n  memcpy(op_ptr->value, kv_value_ptr, ST_VALUE_SIZE);\n}\n\nstatic inline void\nhermes_write_replay_actions(spacetime_op_t* op_ptr, uint8_t idx,\n                            spacetime_object_meta* keys_meta,\n                            spacetime_group_membership curr_membership)\n{\n  if (ENABLE_ASSERTIONS) assert(idx < ST_OP_BUFFER_INDEX_EMPTY);\n\n  colored_printf(YELLOW, \"Write replay for i: %d\\n\", idx);\n\n  /// update keys metadata and unlock\n  keys_meta->state = REPLAY_STATE;\n  keys_meta->op_buffer_index = (uint8_t)idx;\n  keys_meta->last_local_write_ts.version = keys_meta->cctrl.ts.version - 1;\n  keys_meta->last_local_write_ts.tie_breaker_id =\n      keys_meta->cctrl.ts.tie_breaker_id;\n  // update group membership mask for replay acks\n  
bv_copy((bit_vector_t*)&keys_meta->ack_bv, curr_membership.w_ack_init);\n\n  /// update op_ptr metadata\n  hermes_local_state_to_op(op_ptr, keys_meta);\n}\n\nstatic inline void\nhermes_check_membership_n_write_replay_actions(\n    spacetime_op_t* op_ptr, uint8_t idx, spacetime_object_meta* keys_meta,\n    spacetime_group_membership curr_membership)\n{\n  uint8_t node_id = (uint8_t)(!ENABLE_VIRTUAL_NODE_IDS\n                                  ? keys_meta->last_writer_id\n                                  : keys_meta->last_writer_id % machine_num);\n\n  if (node_is_in_membership(curr_membership, node_id))\n    op_ptr->op_meta.state = ST_GET_STALL;\n\n  else if (keys_meta->op_buffer_index == ST_OP_BUFFER_INDEX_EMPTY)\n    /// stall replay: until all acks from last write arrive\n    /// on multiple threads we can't complete writes / replays on VAL\n    hermes_write_replay_actions(op_ptr, idx, keys_meta, curr_membership);\n}\n\nstatic inline void\nhermes_marshal_write_coalesce_optimization(spacetime_op_t* op_ptr,\n                                           uint16_t curr_ts_version)\n{\n  if (ENABLE_ASSERTIONS) assert(op_ptr->op_meta.opcode == ST_OP_PUT);\n\n  if (ENABLE_WRITE_COALESCE_TO_THE_SAME_KEY_IN_SAME_NODE &&\n      op_ptr->op_meta.ts.version == 0) {\n    // if its the first time we stall on this read store the timestamp\n    op_ptr->op_meta.ts.version = curr_ts_version;\n    op_ptr->op_meta.state = ST_IN_PROGRESS_PUT;\n  }\n}\n\nstatic inline void\nhermes_complete_coalesced_write(spacetime_op_t* op_ptr, uint16_t curr_ts)\n{\n  if (ENABLE_ASSERTIONS) assert(op_ptr->op_meta.opcode == ST_OP_PUT);\n\n  if (ENABLE_WRITE_COALESCE_TO_THE_SAME_KEY_IN_SAME_NODE &&\n      op_ptr->op_meta.state == ST_PUT_STALL)\n    if (op_ptr->op_meta.ts.version > 0 &&\n        op_ptr->op_meta.ts.version + 1 < curr_ts) {\n      // if the timestamp we saw initially has smaller than 2 versions it means\n      // that the local write we coalesced with is completed\n      op_ptr->op_meta.state 
= ST_PUT_COMPLETE;\n    }\n}\n\nstatic inline void\nhermes_complete_hot_read_optimization(spacetime_op_t* op_ptr, timestamp_t ts)\n{\n  if (ENABLE_READ_COMPLETE_AFTER_VAL_RECV_OF_HOT_REQS &&\n      op_ptr->op_meta.state == ST_GET_STALL) {\n    if (op_ptr->op_meta.ts.version == 0 &&\n        op_ptr->op_meta.ts.tie_breaker_id == 0) {\n      // if its the first time we stall on this read store the timestamp\n      op_ptr->op_meta.ts.version = ts.version;\n      op_ptr->op_meta.ts.tie_breaker_id = ts.tie_breaker_id;\n\n    } else if (op_ptr->op_meta.ts.version + 1 < ts.version) {\n      // if the timestamp we saw initially has smaller than 2 versions complete\n      // the read;\n      // TODO we also need to get the value here\n      op_ptr->op_meta.state = ST_GET_COMPLETE;\n    }\n  }\n}\n\nstatic inline void\nhermes_read_actions(spacetime_op_t* op_ptr, struct mica_op* kv_ptr,\n                    uint8_t* kv_value_ptr)\n{\n  memcpy(op_ptr->value, kv_value_ptr, ST_VALUE_SIZE);\n  op_ptr->op_meta.state = ST_GET_COMPLETE;\n  op_ptr->op_meta.val_len = get_val_len(kv_ptr);\n}\n\n//////////// Exec op functions\nstatic inline void\nhermes_exec_read(spacetime_op_t* op_ptr, struct mica_op* kv_ptr, uint8_t idx,\n                 spacetime_group_membership curr_membership)\n{\n  if (ENABLE_ASSERTIONS) assert(op_ptr->op_meta.opcode == ST_OP_GET);\n\n  timestamp_t curr_ts;\n  spacetime_object_meta prev_meta;\n  spacetime_object_meta* keys_meta = (spacetime_object_meta*)kv_ptr->value;\n  uint8_t* kv_value_ptr = (uint8_t*)&keys_meta[1];\n\n  // Lock free reads through versioning (successful when version is even)\n  uint8_t was_locked_read = 0;\n  op_ptr->op_meta.state = ST_EMPTY;\n  do {\n    prev_meta = *keys_meta;\n    curr_ts = keys_meta->cctrl.ts;\n    // switch template with all states\n    switch (keys_meta->state) {\n      case VALID_STATE:\n        hermes_read_actions(op_ptr, kv_ptr, kv_value_ptr);\n        break;\n\n      case INVALID_WRITE_STATE:\n      case 
WRITE_STATE:\n      case REPLAY_STATE:\n        op_ptr->op_meta.state = ST_GET_STALL;\n        break;\n\n      default:\n        was_locked_read = 1;\n        cctrl_lock(&keys_meta->cctrl);\n        curr_ts = keys_meta->cctrl.ts;\n        curr_ts.version -= 1;  // WARNING: when locking we do version++\n\n        switch (keys_meta->state) {\n          case VALID_STATE:\n            hermes_read_actions(op_ptr, kv_ptr, kv_value_ptr);\n            break;\n\n          case INVALID_WRITE_STATE:\n          case WRITE_STATE:\n          case REPLAY_STATE:\n            op_ptr->op_meta.state = ST_GET_STALL;\n            break;\n\n          case INVALID_STATE:\n            hermes_check_membership_n_write_replay_actions(\n                op_ptr, idx, keys_meta, curr_membership);\n            break;\n\n          default:\n            assert(0);\n        }\n        cctrl_unlock_dec_version(&keys_meta->cctrl);\n        break;\n    }\n  } while (\n      !cctrl_timestamp_is_same_and_valid(&prev_meta.cctrl, &keys_meta->cctrl) &&\n      was_locked_read == 0);\n\n  hermes_complete_hot_read_optimization(op_ptr, curr_ts);\n}\n\nstatic inline void\nhermes_exec_write(spacetime_op_t* op_ptr, struct mica_op* kv_ptr, uint8_t idx,\n                  spacetime_group_membership curr_membership)\n{\n  if (ENABLE_ASSERTIONS) {\n    assert(op_ptr->op_meta.opcode == ST_OP_PUT);\n    assert(op_ptr->op_meta.val_len == (ST_VALUE_SIZE >> SHIFT_BITS));\n  }\n\n  spacetime_object_meta* keys_meta = (spacetime_object_meta*)kv_ptr->value;\n\n  op_ptr->op_meta.state = ST_EMPTY;\n  cctrl_lock(&keys_meta->cctrl);\n  uint16_t curr_version = (uint16_t)(keys_meta->cctrl.ts.version - 1);\n  switch (keys_meta->state) {\n    case VALID_STATE:\n    case INVALID_STATE:\n      if (keys_meta->op_buffer_index != ST_OP_BUFFER_INDEX_EMPTY) {\n        /// stall write: until all acks from last write arrive\n        /// on multiple threads we can't complete writes / replays on VAL\n        
cctrl_unlock_dec_version(&keys_meta->cctrl);\n        hermes_marshal_write_coalesce_optimization(op_ptr, curr_version);\n\n      } else\n        hermes_update_actions_n_unlock(op_ptr, kv_ptr, keys_meta, idx,\n                                       curr_membership, 0);\n      break;\n\n    case INVALID_WRITE_STATE:\n    case WRITE_STATE:\n      hermes_marshal_write_coalesce_optimization(op_ptr, curr_version);\n    case REPLAY_STATE:\n      cctrl_unlock_dec_version(&keys_meta->cctrl);\n      break;\n    default:\n      assert(0);\n  }\n\n  // Fill this deterministic stuff after releasing the lock\n  if (op_ptr->op_meta.state != ST_PUT_SUCCESS)\n    op_ptr->op_meta.state = ST_PUT_STALL;\n\n  hermes_complete_coalesced_write(op_ptr, curr_version);\n}\n\nstatic inline void\nhermes_exec_rmw(spacetime_op_t* op_ptr, struct mica_op* kv_ptr, uint8_t idx,\n                spacetime_group_membership curr_membership)\n{\n  spacetime_object_meta* keys_meta = (spacetime_object_meta*)kv_ptr->value;\n\n  if (ENABLE_ASSERTIONS) {\n    assert(op_ptr->op_meta.opcode == ST_OP_RMW);\n    assert(op_ptr->op_meta.state == ST_NEW ||\n           op_ptr->op_meta.state == ST_RMW_STALL ||\n           op_ptr->op_meta.state == ST_IN_PROGRESS_RMW);\n    assert(op_ptr->op_meta.val_len == (ST_VALUE_SIZE >> SHIFT_BITS));\n  }\n\n  if (op_ptr->op_meta.state == ST_IN_PROGRESS_RMW) {\n    spacetime_object_meta* curr_meta = (spacetime_object_meta*)kv_ptr->value;\n    spacetime_object_meta lock_free_meta;\n    hermes_lock_free_read_obj_meta(&lock_free_meta, curr_meta);\n    if (timestamp_is_smaller(op_ptr->op_meta.ts.version,\n                             op_ptr->op_meta.ts.tie_breaker_id,\n                             lock_free_meta.cctrl.ts.version,\n                             lock_free_meta.cctrl.ts.tie_breaker_id)) {\n      // Abort RMW --> we saw higher TS before gathering all of its acks\n      op_ptr->op_meta.state = ST_RMW_ABORT;\n      cctrl_lock(&keys_meta->cctrl);\n      if 
(timestamp_is_equal(\n              op_ptr->op_meta.ts.version, op_ptr->op_meta.ts.tie_breaker_id,\n              lock_free_meta.last_local_write_ts.version,\n              lock_free_meta.last_local_write_ts.tie_breaker_id)) {\n        if (ENABLE_ASSERTIONS) assert(idx == curr_meta->op_buffer_index);\n        curr_meta->op_buffer_index = ST_OP_BUFFER_INDEX_EMPTY;\n      }\n      cctrl_unlock_dec_version(&keys_meta->cctrl);\n    }\n  } else {\n    op_ptr->op_meta.state = ST_EMPTY;\n\n    cctrl_lock(&keys_meta->cctrl);\n\n    switch (keys_meta->state) {\n      case VALID_STATE:\n        if (keys_meta->op_buffer_index != ST_OP_BUFFER_INDEX_EMPTY)\n          /// stall write: until all acks from last write arrive\n          /// on multiple threads we can't complete writes / replays on VAL\n          cctrl_unlock_dec_version(&keys_meta->cctrl);\n        else\n          hermes_update_actions_n_unlock(op_ptr, kv_ptr, keys_meta, idx,\n                                         curr_membership, 1);\n        break;\n\n      case INVALID_STATE:\n        hermes_check_membership_n_write_replay_actions(op_ptr, idx, keys_meta,\n                                                       curr_membership);\n        // Warning: Do not break\n      case INVALID_WRITE_STATE:\n      case WRITE_STATE:\n      case REPLAY_STATE:\n        cctrl_unlock_dec_version(&keys_meta->cctrl);\n        break;\n      default:\n        assert(0);\n        break;\n    }\n\n    // Fill this deterministic stuff after releasing the lock\n    if (op_ptr->op_meta.state != ST_RMW_SUCCESS &&\n        op_ptr->op_meta.state != ST_REPLAY_SUCCESS)\n      op_ptr->op_meta.state = ST_RMW_STALL;\n  }\n}\n\nstatic inline void\nhermes_exec_check_update_completion(spacetime_op_t* op_ptr,\n                                    struct mica_op* kv_ptr, uint8_t idx,\n                                    spacetime_group_membership curr_membership)\n{\n  spacetime_object_meta lock_free_read_meta;\n  spacetime_object_meta* curr_meta = 
(spacetime_object_meta*)kv_ptr->value;\n  hermes_lock_free_read_obj_meta(&lock_free_read_meta, curr_meta);\n\n  if (ENABLE_ASSERTIONS) {\n    assert(op_ptr->op_meta.opcode == ST_OP_PUT ||\n           op_ptr->op_meta.opcode == ST_OP_RMW ||\n           op_ptr->op_meta.state == ST_IN_PROGRESS_REPLAY);\n\n    assert(!timestamp_is_smaller(lock_free_read_meta.cctrl.ts.version,\n                                 lock_free_read_meta.cctrl.ts.tie_breaker_id,\n                                 op_ptr->op_meta.ts.version,\n                                 op_ptr->op_meta.ts.tie_breaker_id));\n  }\n\n  if (is_last_ack(lock_free_read_meta.ack_bv,\n                  curr_membership)) {  // if last local write completed\n    cctrl_lock(&curr_meta->cctrl);\n    if (is_last_ack(curr_meta->ack_bv, curr_membership)) {\n      if (ENABLE_ASSERTIONS) assert(curr_meta->op_buffer_index == idx);\n      curr_meta->op_buffer_index =\n          ST_OP_BUFFER_INDEX_EMPTY;  // reset the write buff index\n      switch (curr_meta->state) {\n        case INVALID_WRITE_STATE:\n          curr_meta->state = INVALID_STATE;\n          /// Warning break omitted intentionally\n        case VALID_STATE:\n        case INVALID_STATE:\n          op_ptr->op_meta.state = op_ptr->op_meta.opcode == ST_OP_PUT\n                                      ? ST_PUT_COMPLETE\n                                      : ST_RMW_COMPLETE;\n          break;\n        case WRITE_STATE:\n        case REPLAY_STATE:\n          op_ptr->op_meta.ts.version =\n              curr_meta->cctrl.ts.version -\n              1;  // -1 because of seqlock does version + 1\n          op_ptr->op_meta.ts.tie_breaker_id =\n              curr_meta->cctrl.ts.tie_breaker_id;\n          if (curr_meta->state == WRITE_STATE) {\n            op_ptr->op_meta.state = op_ptr->op_meta.opcode == ST_OP_PUT\n                                        ? 
ST_PUT_COMPLETE_SEND_VALS\n                                        : ST_RMW_COMPLETE_SEND_VALS;\n          } else {\n            if (ENABLE_ASSERTIONS)\n              assert(op_ptr->op_meta.state == ST_IN_PROGRESS_REPLAY);\n            op_ptr->op_meta.state = DISABLE_VALS_FOR_DEBUGGING == 1\n                                        ? ST_GET_COMPLETE\n                                        : ST_REPLAY_COMPLETE_SEND_VALS;\n          }\n          curr_meta->state = VALID_STATE;\n          break;\n        default:\n          assert(0);\n      }\n    }\n    cctrl_unlock_dec_version(&curr_meta->cctrl);\n  }\n}\n\n//////////// Exec protocol action functions\nstatic inline void\nhermes_exec_inv(spacetime_inv_t* inv_ptr, struct mica_op* kv_ptr,\n                spacetime_op_t* read_write_op)\n{\n  if (ENABLE_ASSERTIONS)\n    assert(inv_ptr->op_meta.opcode == ST_OP_INV ||\n           inv_ptr->op_meta.opcode == ST_OP_INV_ABORT);\n\n  spacetime_object_meta* curr_meta = (spacetime_object_meta*)kv_ptr->value;\n  uint8_t* kv_value_ptr = (uint8_t*)&curr_meta[1];\n  spacetime_object_meta lock_free_meta;\n  hermes_lock_free_read_obj_meta(&lock_free_meta, curr_meta);\n\n  // proceed iff remote.TS >= local.TS || inv is for an RMW to respond with an\n  // INV-abort\n  if (!timestamp_is_smaller(inv_ptr->op_meta.ts.version,\n                            inv_ptr->op_meta.ts.tie_breaker_id,\n                            lock_free_meta.cctrl.ts.version,\n                            lock_free_meta.cctrl.ts.tie_breaker_id) ||\n      (ENABLE_RMWs && inv_ptr->RMW_flag == 1)) {\n    // Lock and check again if inv TS > local timestamp\n    cctrl_lock(&curr_meta->cctrl);\n    /// Warning: use curr_meta->ts.version - 1 bellow since seqlock increases\n    /// curr_meta->ts.version by 1\n    if (timestamp_is_smaller(\n            curr_meta->cctrl.ts.version - 1, curr_meta->cctrl.ts.tie_breaker_id,\n            inv_ptr->op_meta.ts.version, inv_ptr->op_meta.ts.tie_breaker_id)) {\n      
//\t\t\tprintf(\"Received an invalidation with >=\n      // timestamp\\n\");\n      /// Update state\n      switch (curr_meta->state) {\n        case VALID_STATE:\n          curr_meta->state = INVALID_STATE;\n        case INVALID_STATE:\n        case INVALID_WRITE_STATE:\n          break;\n\n        case WRITE_STATE:\n        case REPLAY_STATE:\n          curr_meta->state = ENABLE_RMWs && curr_meta->RMW_flag == 1\n                                 ? INVALID_STATE\n                                 : INVALID_WRITE_STATE;\n          break;\n          //\t\t\t\tcase REPLAY_STATE:\n          //\t\t\t\t\tcurr_meta->state =\n          // INVALID_WRITE_STATE;\n          // curr_meta->state = INVALID_STATE;\n          //\t\t\t\t\t//recover the read\n          //\t\t\t\t\tif(ENABLE_ASSERTIONS){\n          //\t\t\t\t\t\tassert(curr_meta->op_buffer_index\n          //!= ST_OP_BUFFER_INDEX_EMPTY);\n          //\t\t\t\t\t\tassert(read_write_op[curr_meta->op_buffer_index].state\n          //== ST_IN_PROGRESS_REPLAY);\n          // assert(((uint64_t\n          //*) &read_write_op[curr_meta->op_buffer_index].key)[0] == ((uint64_t\n          //*)\n          //&(*op)[I].key)[0]);\n          //\t\t\t\t\t}\n          //\t\t\t\t\tread_write_op[curr_meta->op_buffer_index].state\n          //= ST_NEW; curr_meta->op_buffer_index = ST_OP_BUFFER_INDEX_EMPTY;\n          // break;\n        default:\n          assert(0);\n      }\n\n      if (ENABLE_ASSERTIONS)\n        assert(inv_ptr->op_meta.val_len == (ST_VALUE_SIZE >> SHIFT_BITS));\n\n      /// Update Value, TS, RMW_flag and last_writer_id\n      kv_ptr->val_len = KVS_VALUE_SIZE;\n      curr_meta->RMW_flag = inv_ptr->RMW_flag;\n      curr_meta->last_writer_id = inv_ptr->op_meta.sender;\n      memcpy(kv_value_ptr, inv_ptr->value, ST_VALUE_SIZE);\n\n      cctrl_unlock_custom_version(&curr_meta->cctrl,\n                                  inv_ptr->op_meta.ts.tie_breaker_id,\n                                  inv_ptr->op_meta.ts.version);\n\n    } 
else if (timestamp_is_equal(curr_meta->cctrl.ts.version - 1,\n                                  curr_meta->cctrl.ts.tie_breaker_id,\n                                  inv_ptr->op_meta.ts.version,\n                                  inv_ptr->op_meta.ts.tie_breaker_id)) {\n      if (curr_meta->state == WRITE_STATE)\n        inv_ptr->op_meta.opcode = ST_INV_OUT_OF_GROUP;\n\n      curr_meta->last_writer_id = inv_ptr->op_meta.sender;\n      cctrl_unlock_custom_version(&curr_meta->cctrl,\n                                  inv_ptr->op_meta.ts.tie_breaker_id,\n                                  inv_ptr->op_meta.ts.version);\n\n    } else {  // TS is Smaller\n      /// Respond with an inv-abort if its an RMW\n      if (ENABLE_RMWs && inv_ptr->RMW_flag == 1) {\n        uint8_t sender_id = inv_ptr->op_meta.sender;\n        hermes_local_state_to_op(inv_ptr, curr_meta);\n        inv_ptr->op_meta.sender = sender_id;\n        inv_ptr->op_meta.opcode = ST_OP_INV_ABORT;\n        colored_printf(RED, \"Sending OP_INV_ABORT\\n\");\n      }\n      cctrl_unlock_dec_version(&curr_meta->cctrl);\n    }\n  }\n\n  if (inv_ptr->op_meta.opcode != ST_OP_INV_ABORT &&\n      inv_ptr->op_meta.opcode != ST_INV_OUT_OF_GROUP)\n    inv_ptr->op_meta.opcode = ST_INV_SUCCESS;\n\n  if (ENABLE_ASSERTIONS)\n    assert(inv_ptr->op_meta.opcode == ST_OP_INV_ABORT ||\n           inv_ptr->op_meta.opcode == ST_INV_SUCCESS ||\n           inv_ptr->op_meta.opcode == ST_INV_OUT_OF_GROUP);\n}\n\nstatic inline void\nhermes_exec_ack(spacetime_ack_t* ack_ptr, struct mica_op* kv_ptr,\n                spacetime_group_membership curr_membership,\n                spacetime_op_t* read_write_op)\n{\n  int op_buff_indx = ST_OP_BUFFER_INDEX_EMPTY;\n  spacetime_object_meta lock_free_read_meta;\n  spacetime_object_meta* curr_meta = (spacetime_object_meta*)kv_ptr->value;\n  hermes_lock_free_read_obj_meta(&lock_free_read_meta, curr_meta);\n\n  if (ENABLE_ASSERTIONS)\n    
assert(!timestamp_is_smaller(lock_free_read_meta.cctrl.ts.version,\n                                 lock_free_read_meta.cctrl.ts.tie_breaker_id,\n                                 ack_ptr->ts.version,\n                                 ack_ptr->ts.tie_breaker_id));\n\n  if (timestamp_is_equal(\n          ack_ptr->ts.version, ack_ptr->ts.tie_breaker_id,\n          lock_free_read_meta.last_local_write_ts.version,\n          lock_free_read_meta.last_local_write_ts.tie_breaker_id)) {\n    /// Lock and check again if ack TS == last local write\n    cctrl_lock(&curr_meta->cctrl);\n    if (curr_meta->op_buffer_index != ST_OP_BUFFER_INDEX_EMPTY &&\n        timestamp_is_equal(ack_ptr->ts.version, ack_ptr->ts.tie_breaker_id,\n                           curr_meta->last_local_write_ts.version,\n                           curr_meta->last_local_write_ts.tie_breaker_id)) {\n      bv_bit_set((bit_vector_t*)&curr_meta->ack_bv, ack_ptr->sender);\n      if (is_last_ack(curr_meta->ack_bv,\n                      curr_membership)) {  // if last local write completed\n        op_buff_indx = curr_meta->op_buffer_index;\n        switch (curr_meta->state) {\n          case VALID_STATE:\n          case INVALID_STATE:\n            ack_ptr->opcode = ST_LAST_ACK_NO_BCAST_SUCCESS;\n            curr_meta->op_buffer_index =\n                ST_OP_BUFFER_INDEX_EMPTY;  // reset the write buff index\n            break;\n          case INVALID_WRITE_STATE:\n            curr_meta->state = INVALID_STATE;\n            ack_ptr->opcode = ST_LAST_ACK_NO_BCAST_SUCCESS;\n            curr_meta->op_buffer_index =\n                ST_OP_BUFFER_INDEX_EMPTY;  // reset the write buff index\n            break;\n          case WRITE_STATE:\n          case REPLAY_STATE:\n            curr_meta->state = VALID_STATE;\n            ack_ptr->opcode = ST_LAST_ACK_SUCCESS;\n            curr_meta->op_buffer_index =\n                ST_OP_BUFFER_INDEX_EMPTY;  // reset the write buff index\n            break;\n          
default:\n            assert(0);\n        }\n      }\n    }\n    cctrl_unlock_dec_version(&curr_meta->cctrl);\n  }\n\n  if (ack_ptr->opcode == ST_LAST_ACK_SUCCESS ||\n      ack_ptr->opcode == ST_LAST_ACK_NO_BCAST_SUCCESS) {\n    /// completed read / write --> remove it from the ops buffer\n    if (ENABLE_ASSERTIONS) {\n      assert(op_buff_indx != ST_OP_BUFFER_INDEX_EMPTY);\n      assert(read_write_op[op_buff_indx].op_meta.state == ST_IN_PROGRESS_PUT ||\n             read_write_op[op_buff_indx].op_meta.state == ST_IN_PROGRESS_RMW ||\n             read_write_op[op_buff_indx].op_meta.state ==\n                 ST_OP_MEMBERSHIP_CHANGE ||\n             read_write_op[op_buff_indx].op_meta.state ==\n                 ST_IN_PROGRESS_REPLAY);\n      assert(((uint64_t*)&read_write_op[op_buff_indx].op_meta.key)[0] ==\n             ((uint64_t*)&ack_ptr->key)[0]);\n    }\n    switch (read_write_op[op_buff_indx].op_meta.opcode) {\n      case ST_OP_GET:\n        read_write_op[op_buff_indx].op_meta.state = ST_NEW;\n        break;\n      case ST_OP_PUT:\n        read_write_op[op_buff_indx].op_meta.state = ST_PUT_COMPLETE;\n        break;\n      case ST_OP_RMW:\n        read_write_op[op_buff_indx].op_meta.state = ST_RMW_COMPLETE;\n        // TODO ad an OP to differentiate between RMW-replay and RMW complete\n        break;\n      default:\n        assert(0);\n    }\n  }\n\n  if (ack_ptr->opcode != ST_LAST_ACK_SUCCESS) ack_ptr->opcode = ST_ACK_SUCCESS;\n}\n\nstatic inline void\nhermes_exec_val(spacetime_val_t* val_ptr, struct mica_op* kv_ptr)\n{\n  spacetime_object_meta lock_free_read_meta;\n  spacetime_object_meta* curr_meta = (spacetime_object_meta*)kv_ptr->value;\n  hermes_lock_free_read_obj_meta(&lock_free_read_meta, curr_meta);\n\n  /// lock and proceed iff remote.TS == local.TS\n  if (timestamp_is_equal(lock_free_read_meta.cctrl.ts.version,\n                         lock_free_read_meta.cctrl.ts.tie_breaker_id,\n                         val_ptr->ts.version, 
val_ptr->ts.tie_breaker_id)) {\n    /// Lock and check again if still TS == local timestamp\n    cctrl_lock(&curr_meta->cctrl);\n    /// Warning: use op.version + 1 bellow since optik_lock() increases\n    /// curr_meta->version by 1\n    if (timestamp_is_equal(curr_meta->cctrl.ts.version - 1,\n                           curr_meta->cctrl.ts.tie_breaker_id,\n                           val_ptr->ts.version, val_ptr->ts.tie_breaker_id)) {\n      if (ENABLE_ASSERTIONS)\n        assert(curr_meta->state !=\n               WRITE_STATE);  /// WARNING: this should not happen w/o this node\n                              /// removed from the group\n      curr_meta->state = VALID_STATE;\n    }\n    cctrl_unlock_dec_version(&curr_meta->cctrl);\n  }\n  val_ptr->opcode = ST_VAL_SUCCESS;\n}\n\n//////////// Skip functions\nstatic inline uint8_t\nhermes_skip_op(spacetime_op_t* op_ptr)\n{\n  return (uint8_t)((op_ptr->op_meta.state == ST_PUT_SUCCESS ||\n                    op_ptr->op_meta.state == ST_RMW_SUCCESS ||\n                    op_ptr->op_meta.state == ST_REPLAY_SUCCESS ||\n                    op_ptr->op_meta.state == ST_IN_PROGRESS_PUT ||\n                    //                       op_ptr->op_meta.state ==\n                    //                       ST_IN_PROGRESS_RMW ||\n                    op_ptr->op_meta.state == ST_IN_PROGRESS_REPLAY ||\n                    op_ptr->op_meta.state == ST_OP_MEMBERSHIP_CHANGE ||\n                    op_ptr->op_meta.state == ST_PUT_COMPLETE_SEND_VALS)\n                       ? 1\n                       : 0);\n}\n\nstatic inline uint8_t\nhermes_skip_op_after_membship_change(spacetime_op_t* op_ptr)\n{\n  return (uint8_t)((op_ptr->op_meta.state == ST_IN_PROGRESS_PUT ||\n                    op_ptr->op_meta.state == ST_IN_PROGRESS_RMW ||\n                    op_ptr->op_meta.state == ST_IN_PROGRESS_REPLAY)\n                       ? 
0\n                       : 1);\n}\n\n// Returns 1 (skip KVS processing) for a membership-change INV, after storing\n// the suspected node id taken from value[0] into *node_suspected; returns 0\n// for every other INV.\nstatic inline uint8_t\nhermes_skip_inv(spacetime_inv_t* inv_ptr, int* node_suspected)\n{\n  if (inv_ptr->op_meta.opcode == ST_OP_MEMBERSHIP_CHANGE) {\n    // TODO we need to do this only on the first skip\n    *node_suspected = inv_ptr->value[0];\n    printf(\"RECEIVED NODE SUSPICION: %d\\n\", *node_suspected);\n    return 1;\n  }\n  return 0;\n}\n\n// Returns 1 when the ack's state is ST_OP_MEMBERSHIP_CHANGE, otherwise 0.\nstatic inline uint8_t\nhermes_skip_ack(spacetime_ack_t* ack_ptr)\n{\n  return (uint8_t)((ack_ptr->state == ST_OP_MEMBERSHIP_CHANGE) ? 1 : 0);\n}\n\n//////////// Dispatcher functions\n\n// Decides per batch element whether it bypasses the KVS lookup / exec path:\n// returns 1 to skip the element and 0 to process it. The decision is delegated\n// to the skip helper of the given batch type; VALs are never skipped.\nstatic inline uint8_t\nhermes_skip_dispatcher(enum hermes_batch_type_t type, void* ptr,\n                       int* node_suspected)\n{\n  switch (type) {\n    case local_ops:\n      return hermes_skip_op(ptr);\n    case local_ops_after_membership_change:\n      return hermes_skip_op_after_membship_change(ptr);\n    case invs:\n      return hermes_skip_inv(ptr, node_suspected);\n    case acks:\n      return hermes_skip_ack(ptr);\n    case vals:\n      return 0;\n    default:\n      assert(0);\n      // Reached only when asserts are compiled out: skip elements of an\n      // unknown batch type instead of falling off a non-void function.\n      return 1;\n  }\n}\n\nstatic inline void\nhermes_assertions_begin_dispatcher(enum hermes_batch_type_t type, void* ptr)\n{\n  if (ENABLE_ASSERTIONS) switch (type) {\n      case local_ops:\n      case local_ops_after_membership_change:\n        break;\n      case invs:\n        hermes_assertions_begin_inv(ptr);\n        break;\n      case acks:\n        if (ENABLE_RMWs == 0)\n          hermes_assertions_begin_ack(ptr);\n        else {\n          spacetime_ack_t* ack_ptr = ptr;\n          if (ack_ptr->opcode == ST_OP_ACK)\n            hermes_assertions_begin_ack(ptr);\n          else if (ack_ptr->opcode == ST_OP_INV_ABORT) {\n            printf(\"RECVED: inv abort\\n\");\n            hermes_assertions_begin_inv(ptr);\n          } else {\n            printf(\"RECVED: %s\\n\", code_to_str(ack_ptr->opcode));\n            assert(0);\n          }\n        }\n        break;\n      case vals:\n        
hermes_assertions_begin_val(ptr);\n        break;\n      default:\n        assert(0);\n    }\n}\n\n// Debug tracing: when ENABLE_BATCH_OP_PRINTS is set, prints one line per INV /\n// ACK / VAL batch (worker thread id and op count) for threads whose id is\n// below MAX_THREADS_TO_PRINT. Local-op batches print nothing.\nstatic inline void\nhermes_print_dispatcher(enum hermes_batch_type_t type, int op_num,\n                        uint8_t thread_id)\n{\n  if (ENABLE_BATCH_OP_PRINTS) switch (type) {\n      case local_ops:\n      case local_ops_after_membership_change:\n        break;\n      case invs:\n        // The W%d tag consumes thread_id so the second %d reports op_num,\n        // matching the ACK and VAL messages below.\n        if (ENABLE_INV_PRINTS && thread_id < MAX_THREADS_TO_PRINT)\n          colored_printf(RED, \"[W%d] Batch INVs (op num: %d)!\\n\", thread_id,\n                         op_num);\n        break;\n      case acks:\n        if (ENABLE_ACK_PRINTS && thread_id < MAX_THREADS_TO_PRINT)\n          colored_printf(RED, \"[W%d] Batch ACKs (op num: %d)!\\n\", thread_id,\n                         op_num);\n        break;\n      case vals:\n        if (ENABLE_VAL_PRINTS && thread_id < MAX_THREADS_TO_PRINT)\n          colored_printf(RED, \"[W%d] Batch VALs (op num: %d)!\\n\", thread_id,\n                         op_num);\n        break;\n      default:\n        assert(0);\n    }\n}\n\n// Post-batch assertions, active only when ENABLE_ASSERTIONS is set: ack\n// batches run hermes_assertions_end_read_write_ops on read_write_ops; every\n// other known batch type has nothing to verify.\nstatic inline void\nhermes_assertions_end_dispatcher(enum hermes_batch_type_t type,\n                                 spacetime_op_t* read_write_ops)\n{\n  if (ENABLE_ASSERTIONS) switch (type) {\n      case local_ops:\n      case local_ops_after_membership_change:\n      case invs:\n        break;\n      case acks:\n        hermes_assertions_end_read_write_ops(read_write_ops);\n        break;\n      case vals:\n        break;\n      default:\n        assert(0);\n    }\n}\n\nstatic inline void\nhermes_exec_dispatcher(enum hermes_batch_type_t type, void* op_ptr,\n                       struct mica_op* kv_ptr,\n                       spacetime_group_membership curr_membership, uint8_t idx,\n                       spacetime_op_t* read_write_op)\n{\n  switch (type) {\n    case local_ops:\n      if (((spacetime_op_t*)op_ptr)->op_meta.opcode == ST_OP_GET)\n        hermes_exec_read(op_ptr, kv_ptr, idx, curr_membership);\n      
else if (((spacetime_op_t*)op_ptr)->op_meta.opcode == ST_OP_PUT)\n        hermes_exec_write(op_ptr, kv_ptr, idx, curr_membership);\n      else if (ENABLE_RMWs &&\n               ((spacetime_op_t*)op_ptr)->op_meta.opcode == ST_OP_RMW)\n        hermes_exec_rmw(op_ptr, kv_ptr, idx, curr_membership);\n      else {\n        printf(\"Ops[%d]: %s\\n\", idx,\n               code_to_str(((spacetime_op_t*)op_ptr)->op_meta.opcode));\n        assert(0);\n      }\n      break;\n    case local_ops_after_membership_change:\n      if (((spacetime_op_t*)op_ptr)->op_meta.opcode == ST_OP_PUT ||\n          ((spacetime_op_t*)op_ptr)->op_meta.opcode == ST_OP_RMW ||\n          ((spacetime_op_t*)op_ptr)->op_meta.state == ST_IN_PROGRESS_REPLAY) {\n        hermes_exec_check_update_completion(op_ptr, kv_ptr, idx,\n                                            curr_membership);\n      } else\n        assert(0);\n      break;\n    case invs:\n      hermes_exec_inv(op_ptr, kv_ptr, read_write_op);\n      break;\n    case acks:\n      if (ENABLE_RMWs == 0)\n        hermes_exec_ack(op_ptr, kv_ptr, curr_membership, read_write_op);\n      else {\n        spacetime_ack_t* ack_ptr = op_ptr;\n        if (ack_ptr->opcode == ST_OP_ACK)\n          hermes_exec_ack(op_ptr, kv_ptr, curr_membership, read_write_op);\n        else if (ack_ptr->opcode == ST_OP_INV_ABORT) {\n          /// TODO RMW debugging\n          printf(\"RECVED: inv abort\\n\");\n          hermes_exec_inv(op_ptr, kv_ptr, read_write_op);\n          ack_ptr->opcode = ST_ACK_SUCCESS;\n        } else\n          assert(0);\n      }\n      break;\n    case vals:\n      hermes_exec_val(op_ptr, kv_ptr);\n      break;\n    default:\n      assert(0);\n  }\n}\n\n//////////////////////////////////////////////\n//////////// Main HermesKV function\n//////////////////////////////////////////////\n\nvoid\nhermes_batch_ops_to_KVS(enum hermes_batch_type_t type, uint8_t* op_array,\n                        int op_num, uint16_t sizeof_op_elem,\n                   
     spacetime_group_membership curr_membership,\n                        int* node_suspected, spacetime_op_t* read_write_ops,\n                        uint8_t thread_id)\n{\n#if SPACETIME_DEBUG == 1\n  // assert(kv.hash_table != NULL);\n  assert(op_array != NULL);\n  assert(op_num > 0 && op_num <= CACHE_BATCH_SIZE);\n  assert(resp != NULL);\n#endif\n\n#if SPACETIME_DEBUG == 2\n  for (I = 0; I < op_num; I++)\n    mica_print_op(&(*op_array)[I]);\n#endif\n  int key_in_store[HERMES_MAX_BATCH_SIZE];  // Is this key in the datastore?\n  unsigned int tag[HERMES_MAX_BATCH_SIZE];\n  uint64_t bkt[HERMES_MAX_BATCH_SIZE];\n  struct mica_bkt* bkt_ptr[HERMES_MAX_BATCH_SIZE];\n  struct mica_op* kv_ptr[HERMES_MAX_BATCH_SIZE];  // Ptr to KV item in log\n\n  if (ENABLE_ASSERTIONS) {\n    assert(op_num <= HERMES_MAX_BATCH_SIZE);\n    assert(read_write_ops != NULL || type != acks);\n    assert(node_suspected != NULL || type != invs);\n  }\n\n  hermes_print_dispatcher(type, op_num, thread_id);\n  // We first lookup the key in the datastore.\n  // The first two @I loops work for both GETs and PUTs.\n  for (int I = 0; I < op_num; I++) {\n    spacetime_op_meta_t* op_ptr =\n        (spacetime_op_meta_t*)&op_array[sizeof_op_elem * I];\n    hermes_assertions_begin_dispatcher(type, op_ptr);\n    if (hermes_skip_dispatcher(type, op_ptr, node_suspected)) continue;\n\n    bkt[I] = op_ptr->key.bkt & kv.hash_table.bkt_mask;\n    bkt_ptr[I] = &kv.hash_table.ht_index[bkt[I]];\n    __builtin_prefetch(bkt_ptr[I], 0, 0);\n    tag[I] = op_ptr->key.tag;\n\n    key_in_store[I] = 0;\n    kv_ptr[I] = NULL;\n  }\n\n  for (int I = 0; I < op_num; I++) {\n    spacetime_op_meta_t* op_ptr =\n        (spacetime_op_meta_t*)&op_array[sizeof_op_elem * I];\n    if (hermes_skip_dispatcher(type, op_ptr, node_suspected)) continue;\n    for (int j = 0; j < 8; j++) {\n      if (bkt_ptr[I]->slots[j].in_use == 1 &&\n          bkt_ptr[I]->slots[j].tag == tag[I]) {\n        uint64_t log_offset =\n            
bkt_ptr[I]->slots[j].offset & kv.hash_table.log_mask;\n        // We can interpret the log entry as mica_op, even though it\n        // may not contain the full MICA_MAX_VALUE value.\n        kv_ptr[I] = (struct mica_op*)&kv.hash_table.ht_log[log_offset];\n\n        // Small values (1--64 bytes) can span 2 cache lines\n        __builtin_prefetch(kv_ptr[I], 0, 0);\n        __builtin_prefetch((uint8_t*)kv_ptr[I] + 64, 0, 0);\n\n        // Detect if the head has wrapped around for this index entry\n        if (kv.hash_table.log_head - bkt_ptr[I]->slots[j].offset >=\n            kv.hash_table.log_cap)\n          kv_ptr[I] = NULL;  // If so, we mark it \"not found\"\n\n        break;\n      }\n    }\n  }\n\n  for (int I = 0; I < op_num; I++) {\n    spacetime_op_meta_t* op_ptr =\n        (spacetime_op_meta_t*)&op_array[sizeof_op_elem * I];\n    if (hermes_skip_dispatcher(type, op_ptr, node_suspected)) continue;\n    if (kv_ptr[I] != NULL) {\n      // We had a tag match earlier. Now compare log entry.\n      long long* key_ptr_log = (long long*)kv_ptr[I];\n      long long* key_ptr_req = (long long*)&op_ptr->key;\n\n      if (key_ptr_log[1] == key_ptr_req[0]) {  // Key Found 8 Byte keys\n        key_in_store[I] = 1;\n        hermes_exec_dispatcher(type, op_ptr, kv_ptr[I], curr_membership,\n                               (uint8_t)I, read_write_ops);\n      }\n    }\n\n    if (key_in_store[I] ==\n        0)  // KVS miss --> We get here if either tag or log key match failed\n      op_ptr->state = ST_MISS;\n  }\n\n  hermes_assertions_end_dispatcher(type, read_write_ops);\n}\n"
  },
  {
    "path": "src/hermes/hermes_worker.c",
    "content": "#include <spacetime.h>\n#include <time.h>\n#include \"../../include/utils/concur_ctrl.h\"\n#include \"inline-util.h\"\n#include \"util.h\"\n\n///\n#include \"../../include/hades/hades.h\"\n#include \"../../include/wings/wings.h\"\n///\n\nint\ninv_skip_or_get_sender_id(uint8_t* req)\n{\n  spacetime_op_t* op_req = (spacetime_op_t*)req;\n\n  if (ENABLE_ASSERTIONS) {\n    assert(is_response_code(op_req->op_meta.state) ||\n           is_bucket_state_code(op_req->op_meta.state));\n    assert(is_input_code(op_req->op_meta.opcode));\n  }\n\n  if (op_req->op_meta.state != ST_PUT_SUCCESS &&\n      op_req->op_meta.state != ST_RMW_SUCCESS &&\n      op_req->op_meta.state != ST_REPLAY_SUCCESS &&\n      op_req->op_meta.state != ST_OP_MEMBERSHIP_CHANGE)\n    return -1;\n  return 0;  // since inv is a bcast we can return any int other than -1\n}\n\nvoid\ninv_modify_elem_after_send(uint8_t* req)\n{\n  spacetime_op_t* op_req = (spacetime_op_t*)req;\n  switch (op_req->op_meta.state) {\n    case ST_PUT_SUCCESS:\n      op_req->op_meta.state = ST_IN_PROGRESS_PUT;\n      break;\n    case ST_RMW_SUCCESS:\n      op_req->op_meta.state = ST_IN_PROGRESS_RMW;\n      break;\n    case ST_REPLAY_SUCCESS:\n      op_req->op_meta.state = ST_IN_PROGRESS_REPLAY;\n      break;\n    case ST_OP_MEMBERSHIP_CHANGE:\n      op_req->op_meta.state = ST_OP_MEMBERSHIP_COMPLETE;\n      break;\n    default:\n      assert(0);\n  }\n}\n\nvoid\ninv_copy_and_modify_elem(uint8_t* msg_to_send, uint8_t* triggering_req)\n{\n  spacetime_op_t* op = (spacetime_op_t*)triggering_req;\n  spacetime_inv_t* inv_to_send = (spacetime_inv_t*)msg_to_send;\n\n  // Copy op to inv, set sender and opcode\n  memcpy(inv_to_send, op, sizeof(spacetime_inv_t));\n  inv_to_send->op_meta.sender = (uint8_t)machine_id;\n  inv_to_send->op_meta.opcode = ST_OP_INV;\n  //\t//TODO change to include membership change\n  //\tinv_to_send->op_meta.opcode = (uint8_t) (op->op_meta.state ==\n  // ST_OP_MEMBERSHIP_CHANGE ?\n  // 
ST_OP_MEMBERSHIP_CHANGE : ST_OP_INV);\n}\n\nint\nack_skip_or_get_sender_id(uint8_t* req)\n{\n  spacetime_inv_t* inv_req = (spacetime_inv_t*)req;\n\n  if (ENABLE_ASSERTIONS)\n    assert(inv_req->op_meta.opcode == ST_INV_SUCCESS ||\n           inv_req->op_meta.opcode == ST_OP_INV_ABORT ||\n           inv_req->op_meta.opcode == ST_EMPTY);\n\n  uint8_t is_small_msg = inv_req->op_meta.opcode == ST_INV_SUCCESS ? 1 : 0;\n\n  return inv_req->op_meta.opcode == ST_EMPTY\n             ? -1\n             : wings_set_sender_id_n_msg_type(inv_req->op_meta.sender,\n                                              is_small_msg);\n}\n\nvoid\nack_modify_elem_after_send(uint8_t* req)\n{\n  spacetime_inv_t* inv_req = (spacetime_inv_t*)req;\n\n  // empty inv buffer\n  if (inv_req->op_meta.opcode == ST_INV_SUCCESS ||\n      inv_req->op_meta.opcode == ST_OP_INV_ABORT ||\n      inv_req->op_meta.opcode == ST_OP_MEMBERSHIP_CHANGE)\n    inv_req->op_meta.opcode = ST_EMPTY;\n  else\n    assert(0);\n}\n\nvoid\nack_copy_and_modify_elem(uint8_t* msg_to_send, uint8_t* triggering_req)\n{\n  spacetime_inv_t* inv_req = (spacetime_inv_t*)triggering_req;\n  spacetime_ack_t* ack_to_send = (spacetime_ack_t*)msg_to_send;\n  spacetime_inv_t* inv_to_send = (spacetime_inv_t*)msg_to_send;\n  switch (inv_req->op_meta.opcode) {\n    case ST_INV_SUCCESS:\n      memcpy(ack_to_send, triggering_req,\n             sizeof(spacetime_ack_t));  // copy req to next_req_ptr\n      ack_to_send->sender = (uint8_t)machine_id;\n      ack_to_send->opcode = ST_OP_ACK;\n      break;\n    case ST_OP_INV_ABORT:\n      memcpy(inv_to_send, triggering_req, sizeof(spacetime_inv_t));\n      inv_to_send->op_meta.sender = (uint8_t)machine_id;\n      inv_to_send->op_meta.opcode = ST_OP_INV_ABORT;\n      break;\n    default:\n      assert(0);\n  }\n}\n\nint\nval_skip_or_get_sender_id(uint8_t* req)\n{\n  spacetime_ack_t* ack_req = (spacetime_ack_t*)req;\n  if (ack_req->opcode == ST_ACK_SUCCESS ||\n      ack_req->opcode == 
ST_OP_MEMBERSHIP_CHANGE) {\n    ack_req->opcode = ST_EMPTY;\n    return -1;\n  } else if (ack_req->opcode == ST_EMPTY)\n    return -1;\n\n  if (ENABLE_ASSERTIONS) assert(ack_req->opcode == ST_LAST_ACK_SUCCESS);\n\n  return ack_req->sender;\n}\n\nvoid\nval_modify_elem_after_send(uint8_t* req)\n{\n  spacetime_ack_t* ack_req = (spacetime_ack_t*)req;\n\n  if (ENABLE_ASSERTIONS) assert(ack_req->opcode == ST_LAST_ACK_SUCCESS);\n\n  ack_req->opcode = ST_EMPTY;\n}\n\nvoid\nval_copy_and_modify_elem(uint8_t* msg_to_send, uint8_t* triggering_req)\n{\n  spacetime_val_t* val_to_send = (spacetime_val_t*)msg_to_send;\n\n  memcpy(val_to_send, triggering_req,\n         sizeof(spacetime_val_t));  // copy req to next_req_ptr\n  val_to_send->opcode = ST_OP_VAL;\n  val_to_send->sender = (uint8_t)machine_id;\n}\n\nint\nmemb_change_skip_or_get_sender_id(uint8_t* req)\n{\n  spacetime_op_t* op_req = (spacetime_op_t*)req;\n  if (op_req->op_meta.state != ST_PUT_COMPLETE_SEND_VALS &&\n      op_req->op_meta.state != ST_RMW_COMPLETE_SEND_VALS &&\n      op_req->op_meta.state != ST_REPLAY_COMPLETE_SEND_VALS) {\n    return -1;\n  }\n  return 1;  // it is bcast so just return something greater than zero\n}\n\nvoid\nmemb_change_modify_elem_after_send(uint8_t* req)\n{\n  spacetime_op_t* op_req = (spacetime_op_t*)req;\n  switch (op_req->op_meta.state) {\n    case ST_PUT_COMPLETE_SEND_VALS:\n      op_req->op_meta.state = ST_PUT_COMPLETE;\n      break;\n    case ST_RMW_COMPLETE_SEND_VALS:\n      op_req->op_meta.state = ST_RMW_COMPLETE;\n      break;\n    case ST_REPLAY_COMPLETE_SEND_VALS:\n      op_req->op_meta.state = ST_NEW;  // ST_REPLAY_COMPLETE;\n      break;\n    default:\n      assert(0);\n  }\n}\n\nvoid\nmemb_change_copy_and_modify_elem(uint8_t* msg_to_send, uint8_t* triggering_req)\n{\n  spacetime_op_t* op_req = (spacetime_op_t*)triggering_req;\n  spacetime_val_t* val_to_send = (spacetime_val_t*)msg_to_send;\n\n  val_to_send->opcode = ST_OP_VAL;\n  val_to_send->sender = (uint8_t)machine_id;\n  
val_to_send->ts = op_req->op_meta.ts;\n}\n\nint\nrem_write_crd_skip_or_get_sender_id(uint8_t* req)\n{\n  spacetime_val_t* val_ptr = (spacetime_val_t*)req;\n\n  if (ENABLE_ASSERTIONS)\n    assert(val_ptr->opcode == ST_VAL_SUCCESS || val_ptr->opcode == ST_EMPTY);\n\n  return val_ptr->opcode == ST_EMPTY ? -1 : val_ptr->sender;\n}\n\nvoid\nrem_write_crd_modify_elem_after_send(uint8_t* req)\n{\n  spacetime_val_t* val_req = (spacetime_val_t*)req;\n\n  // empty inv buffer\n  if (val_req->opcode == ST_VAL_SUCCESS)\n    val_req->opcode = ST_EMPTY;\n  else\n    assert(0);\n}\n\nvoid\nprint_total_send_recv_msgs(ud_channel_t* inv_ud_c, ud_channel_t* ack_ud_c,\n                           ud_channel_t* val_ud_c, ud_channel_t* crd_ud_c)\n{\n  colored_printf(\n      GREEN, \"Total Send: invs %d, acks %d, vals %d, crds %d\\n\",\n      inv_ud_c->stats.send_total_msgs, ack_ud_c->stats.send_total_msgs,\n      val_ud_c->stats.send_total_msgs, crd_ud_c->stats.send_total_msgs);\n  colored_printf(\n      GREEN, \"Total Recv: invs %d, acks %d, vals %d, crds %d\\n\",\n      inv_ud_c->stats.recv_total_msgs, ack_ud_c->stats.recv_total_msgs,\n      val_ud_c->stats.recv_total_msgs, crd_ud_c->stats.recv_total_msgs);\n}\n\nvoid\nspin_until_all_nodes_are_in_membership(\n    spacetime_group_membership* last_group_membership,\n    hades_wings_ctx_t* hw_ctx, uint16_t worker_lid)\n{\n  bit_vector_t* membership_ptr =\n      (bit_vector_t*)&last_group_membership->g_membership;\n  bv_reset_all(membership_ptr);\n  while (bv_no_setted_bits(*membership_ptr) < machine_num) {\n    if (worker_lid == WORKER_WITH_FAILURE_DETECTOR) {\n      update_view_and_issue_hbs(hw_ctx);\n      if (!bv_are_equal(*membership_ptr, hw_ctx->ctx.curr_g_membership))\n        group_membership_update(hw_ctx->ctx);\n      poll_for_remote_views(hw_ctx);\n    }\n    *last_group_membership = group_membership;\n  }\n}\n\nstatic inline void\nfailure_detection_n_membership(ud_channel_t** ud_channel_ptrs,\n                               
bit_vector_t* last_membership,\n                               hades_wings_ctx_t* hw_ctx, uint16_t worker_lid)\n{\n  if (worker_lid == WORKER_WITH_FAILURE_DETECTOR) {\n    update_view_and_issue_hbs(hw_ctx);\n\n    ///< TODO>: We need to fix recovery (RDMA side of wings)!! the following is\n    ///< not fully correct\n    /// Additionally, this handles only WORKER_WITH_FAILURE_DETECTOR thread\n    /// instead of every thread\n    if (!bv_are_equal(hw_ctx->ctx.last_local_view.view,\n                      hw_ctx->ctx.intermediate_local_view.view)) {\n      for (int j = 0; j < 8; ++j)\n        if (bv_bit_get(hw_ctx->ctx.last_local_view.view, j) == 0 &&\n            bv_bit_get(hw_ctx->ctx.intermediate_local_view.view, j) == 1) {\n          printf(\"W[%d]: updates %d endpoint channels\\n\", worker_lid, j);\n          for (int i = 0; i < TOTAL_WORKER_UD_QPs; ++i) {\n            wings_reset_credits(ud_channel_ptrs[i], j);\n            wings_reconfigure_wrs_ah(ud_channel_ptrs[i], j);\n          }\n        }\n    }\n    //</TODO>\n\n    if (!bv_are_equal(*last_membership, hw_ctx->ctx.curr_g_membership)) {\n      group_membership_update(hw_ctx->ctx);\n    }\n\n    poll_for_remote_views(hw_ctx);\n  }\n}\n\nvoid*\nrun_worker(void* arg)\n{\n  assert(is_CR == 0);\n\n  struct thread_params params = *(struct thread_params*)arg;\n  uint16_t worker_lid = (uint16_t)params.id;  // Local ID of this worker thread\n  uint16_t worker_gid =\n      (uint16_t)(machine_id * num_workers +\n                 params.id);  // Global ID of this worker thread\n\n  /* --------------------------------------------------------\n  ------------------- RDMA WINGS DECLARATIONS---------------\n  ---------------------------------------------------------*/\n  ud_channel_t ud_channels[TOTAL_WORKER_N_FAILURE_DETECTION_UD_QPs];\n  ud_channel_t* ud_channel_ptrs[TOTAL_WORKER_N_FAILURE_DETECTION_UD_QPs];\n  ud_channel_t* inv_ud_c = &ud_channels[INV_UD_QP_ID];\n  ud_channel_t* ack_ud_c = &ud_channels[ACK_UD_QP_ID];\n  
ud_channel_t* val_ud_c = &ud_channels[VAL_UD_QP_ID];\n  ud_channel_t* crd_ud_c = &ud_channels[CRD_UD_QP_ID];\n\n  for (int i = 0; i < TOTAL_WORKER_N_FAILURE_DETECTION_UD_QPs; ++i)\n    ud_channel_ptrs[i] = &ud_channels[i];\n\n  const uint8_t is_bcast = 1;\n  const uint8_t stats_on = 1;\n  const uint8_t prints_on = 1;\n  const uint8_t is_hdr_only = 0;\n  const uint8_t expl_crd_ctrl = 0;\n  const uint8_t disable_crd_ctrl = 0;\n\n  char inv_qp_name[200], ack_qp_name[200], val_qp_name[200];\n  sprintf(inv_qp_name, \"%s%d\", \"\\033[31mINV\\033[0m\", worker_lid);\n  sprintf(ack_qp_name, \"%s%d\", \"\\033[33mACK\\033[0m\", worker_lid);\n  sprintf(val_qp_name, \"%s%d\", \"\\033[1m\\033[32mVAL\\033[0m\", worker_lid);\n\n  // WARNING: We use the ack channel to send/recv both acks and rmw-invs if RMWs\n  // are enabled\n  uint16_t ack_size =\n      ENABLE_RMWs ? sizeof(spacetime_inv_t) : sizeof(spacetime_ack_t);\n\n  uint8_t inv_inlining =\n      (DISABLE_INLINING == 0 &&\n       max_coalesce * sizeof(spacetime_inv_t) < WINGS_MAX_SUPPORTED_INLINING)\n          ? 1\n          : 0;\n  uint8_t ack_inlining =\n      (DISABLE_INLINING == 0 &&\n       max_coalesce * ack_size < WINGS_MAX_SUPPORTED_INLINING)\n          ? 1\n          : 0;\n  uint8_t val_inlining =\n      (DISABLE_INLINING == 0 &&\n       max_coalesce * sizeof(spacetime_val_t) < WINGS_MAX_SUPPORTED_INLINING)\n          ? 
1\n          : 0;\n\n  wings_ud_channel_init(inv_ud_c, inv_qp_name, REQ, (uint8_t)max_coalesce,\n                        sizeof(spacetime_inv_t), 0, inv_inlining, is_hdr_only,\n                        is_bcast, disable_crd_ctrl, expl_crd_ctrl, ack_ud_c,\n                        (uint8_t)credits_num, machine_num, (uint8_t)machine_id,\n                        stats_on, prints_on);\n  wings_ud_channel_init(ack_ud_c, ack_qp_name, RESP, (uint8_t)max_coalesce,\n                        ack_size, sizeof(spacetime_ack_t), ack_inlining,\n                        is_hdr_only, 0, disable_crd_ctrl, expl_crd_ctrl,\n                        inv_ud_c, (uint8_t)credits_num, machine_num,\n                        (uint8_t)machine_id, stats_on, prints_on);\n  wings_ud_channel_init(val_ud_c, val_qp_name, REQ, (uint8_t)max_coalesce,\n                        sizeof(spacetime_val_t), 0, val_inlining, is_hdr_only,\n                        is_bcast, disable_crd_ctrl, 1, crd_ud_c,\n                        (uint8_t)credits_num, machine_num, (uint8_t)machine_id,\n                        stats_on, prints_on);\n\n  ///< HADES> Failure Detector Init\n  hades_wings_ctx_t hw_ctx;\n  uint16_t total_ud_qps = TOTAL_WORKER_UD_QPs;\n  if (ENABLE_HADES_FAILURE_DETECTION &&\n      worker_lid == WORKER_WITH_FAILURE_DETECTOR) {\n    total_ud_qps = TOTAL_WORKER_N_FAILURE_DETECTION_UD_QPs;\n    ud_channel_t* hviews_c = &ud_channels[TOTAL_WORKER_UD_QPs];\n    ud_channel_t* hviews_crd_c = &ud_channels[TOTAL_WORKER_UD_QPs + 1];\n\n    const uint16_t max_views_to_poll = 10;\n    const uint32_t send_view_every_us = 100;\n    const uint32_t update_local_view_ms = 10;\n\n    hades_wings_ctx_init(&hw_ctx, machine_id, machine_num, max_views_to_poll,\n                         send_view_every_us, update_local_view_ms, hviews_c,\n                         hviews_crd_c, worker_lid);\n  }\n  ///</HADES>\n\n  wings_setup_channel_qps_and_recvs(ud_channel_ptrs, total_ud_qps,\n                                    
g_share_qs_barrier, worker_lid);\n\n  uint16_t ops_len =\n      (uint16_t)(credits_num * remote_machine_num *\n                 max_coalesce);  // credits * remote_machines * max_req_coalesce\n  assert(ops_len >= inv_ud_c->recv_pkt_buff_len);\n  assert(ops_len >= ack_ud_c->recv_pkt_buff_len);\n  assert(ops_len >= val_ud_c->recv_pkt_buff_len);\n\n  /* -------------------------------------------------------\n  ------------------- OTHER DECLARATIONS--------------------\n  ---------------------------------------------------------*/\n  // Intermediate buffs where reqs are copied from incoming_* buffs in order to\n  // get passed to the KVS\n  spacetime_op_t* ops;\n  spacetime_inv_t* inv_recv_ops;\n  spacetime_ack_t*\n      ack_recv_ops;  // WARNING!! This can be spacetime_ack_t / spacetime_inv_t\n                     // * depends if RMWs are disabled or not\n  spacetime_val_t* val_recv_ops;\n\n  setup_kvs_buffs(&ops, &inv_recv_ops, &ack_recv_ops, &val_recv_ops);\n\n  struct spacetime_trace_command* trace;\n  trace_init(&trace, worker_gid);\n\n  ////\n  spacetime_op_t* n_hottest_keys_in_ops_get[COALESCE_N_HOTTEST_KEYS];\n  spacetime_op_t* n_hottest_keys_in_ops_put[COALESCE_N_HOTTEST_KEYS];\n  for (int i = 0; i < COALESCE_N_HOTTEST_KEYS; ++i) {\n    n_hottest_keys_in_ops_get[i] = NULL;\n    n_hottest_keys_in_ops_put[i] = NULL;\n  }\n  ////\n\n  int node_suspected = -1;\n  uint32_t trace_iter = 0;\n  uint16_t rolling_inv_index = 0;\n  uint16_t invs_polled = 0, acks_polled = 0, vals_polled = 0;\n  uint8_t has_outstanding_vals = 0, has_remaining_vals_from_memb_change = 0;\n\n  uint32_t* num_of_iters_serving_op = malloc(max_batch_size * sizeof(uint32_t));\n  for (int i = 0; i < max_batch_size; ++i)\n    num_of_iters_serving_op[i] = 0;\n\n  /// Spawn stats thread\n  if (worker_lid == 0) {\n    if (spawn_stats_thread() != 0)\n      colored_printf(RED, \"Stats thread was not successfully spawned \\n\");\n  }\n\n  struct timespec stopwatch_for_req_latency;\n\n  // Membership 
init\n  bit_vector_t* membership_ptr =\n      ENABLE_HADES_FAILURE_DETECTION\n          ? (bit_vector_t*)&group_membership.g_membership\n          : NULL;\n  if (ENABLE_HADES_FAILURE_DETECTION) {\n    spin_until_all_nodes_are_in_membership(&group_membership, &hw_ctx,\n                                           worker_lid);\n    printf(\"~~~~~~~~~ Starting while ! ~~~~~~~~~\\n\");\n  }\n\n  /* -----------------------------------------------------\n ------------------------Main Loop--------------------\n     ----------------------------------------------------- */\n\n  struct timespec stopwatch_for_fd_warmup;\n  get_rdtsc_timespec(&stopwatch_for_fd_warmup);\n  uint8_t fd_warmup_time_has_passed = 0;\n\n  while (true) {\n    // Check something periodically (e.g., stats)\n    if (unlikely(w_stats[worker_lid].total_loops % M_16 == 0)) {\n      //\t\t\tprint_total_send_recv_msgs_n_credits(&inv_ud_c,\n      //&ack_ud_c, &val_ud_c, &crd_ud_c);\n    }\n\n    if (!ENABLE_HADES_FAILURE_DETECTION || fd_warmup_time_has_passed == 1) {\n      node_suspected =\n          refill_ops(&trace_iter, worker_lid, trace, ops,\n                     num_of_iters_serving_op, &stopwatch_for_req_latency,\n                     n_hottest_keys_in_ops_get, n_hottest_keys_in_ops_put);\n\n      hermes_batch_ops_to_KVS(local_ops, (uint8_t*)ops, max_batch_size,\n                              sizeof(spacetime_op_t), group_membership, NULL,\n                              NULL, (uint8_t)worker_lid);\n\n      stop_latency_of_completed_reads(ops, worker_lid,\n                                      &stopwatch_for_req_latency);\n\n      if (update_ratio > 0) {\n        ///~~~~~~~~~~~~~~~~~~~~~~INVS~~~~~~~~~~~~~~~~~~~~~~~~~~~\n        wings_issue_pkts(inv_ud_c, membership_ptr, (uint8_t*)ops,\n                         (uint16_t)max_batch_size, sizeof(spacetime_op_t),\n                         &rolling_inv_index, inv_skip_or_get_sender_id,\n                         inv_modify_elem_after_send, 
inv_copy_and_modify_elem);\n\n        /// Poll for INVs\n        invs_polled = wings_poll_buff_and_post_recvs(inv_ud_c, ops_len,\n                                                     (uint8_t*)inv_recv_ops);\n\n        if (invs_polled > 0) {\n          hermes_batch_ops_to_KVS(invs, (uint8_t*)inv_recv_ops, invs_polled,\n                                  sizeof(spacetime_inv_t), group_membership,\n                                  &node_suspected, ops, (uint8_t)worker_lid);\n\n          ///~~~~~~~~~~~~~~~~~~~~~~ACKS~~~~~~~~~~~~~~~~~~~~~~~~~~~\n          wings_issue_pkts(\n              ack_ud_c, membership_ptr, (uint8_t*)inv_recv_ops, invs_polled,\n              sizeof(spacetime_inv_t), NULL, ack_skip_or_get_sender_id,\n              ack_modify_elem_after_send, ack_copy_and_modify_elem);\n\n          if (ENABLE_ASSERTIONS)\n            assert(inv_ud_c->stats.recv_total_msgs ==\n                   ack_ud_c->stats.send_total_msgs);\n        }\n\n        if (has_outstanding_vals == 0 &&\n            has_remaining_vals_from_memb_change == 0) {\n          /// Poll for Acks\n          acks_polled = wings_poll_buff_and_post_recvs(ack_ud_c, ops_len,\n                                                       (uint8_t*)ack_recv_ops);\n\n          if (acks_polled > 0) {\n            hermes_batch_ops_to_KVS(acks, (uint8_t*)ack_recv_ops, acks_polled,\n                                    ack_size, group_membership, NULL, ops,\n                                    (uint8_t)worker_lid);\n\n            stop_latency_of_completed_writes(ops, worker_lid,\n                                             &stopwatch_for_req_latency);\n          }\n        }\n\n        if (!DISABLE_VALS_FOR_DEBUGGING) {\n          ///~~~~~~~~~~~~~~~~~~~~~~ VALs ~~~~~~~~~~~~~~~~~~~~~~~~~~~\n          if (has_remaining_vals_from_memb_change > 0)\n            has_remaining_vals_from_memb_change = wings_issue_pkts(\n                val_ud_c, membership_ptr, (uint8_t*)ops, max_batch_size,\n                
sizeof(spacetime_op_t), NULL, memb_change_skip_or_get_sender_id,\n                memb_change_modify_elem_after_send,\n                memb_change_copy_and_modify_elem);\n          else\n            has_outstanding_vals = wings_issue_pkts(\n                val_ud_c, membership_ptr, (uint8_t*)ack_recv_ops,\n                ack_ud_c->recv_pkt_buff_len, ack_size, NULL,\n                val_skip_or_get_sender_id, val_modify_elem_after_send,\n                val_copy_and_modify_elem);\n\n          /// Poll for Vals\n          vals_polled = wings_poll_buff_and_post_recvs(val_ud_c, ops_len,\n                                                       (uint8_t*)val_recv_ops);\n\n          if (vals_polled > 0) {\n            hermes_batch_ops_to_KVS(vals, (uint8_t*)val_recv_ops, vals_polled,\n                                    sizeof(spacetime_val_t), group_membership,\n                                    NULL, NULL, (uint8_t)worker_lid);\n\n            ///~~~~~~~~~~~~~~~~~~~~~~CREDITS~~~~~~~~~~~~~~~~~~~~~~~~~~~\n            wings_issue_credits(\n                crd_ud_c, membership_ptr, (uint8_t*)val_recv_ops, ops_len,\n                sizeof(spacetime_val_t), rem_write_crd_skip_or_get_sender_id,\n                rem_write_crd_modify_elem_after_send);\n          }\n        }\n      }\n    } else if (ENABLE_HADES_FAILURE_DETECTION &&\n               time_elapsed_in_sec(stopwatch_for_fd_warmup) > 2) {\n      fd_warmup_time_has_passed = 1;\n      printf(\"~~~~~~~~~ Starting execution! 
~~~~~~~~~\\n\");\n    }\n\n    // Failure Detection and Membership\n    if (ENABLE_HADES_FAILURE_DETECTION) {\n      failure_detection_n_membership(ud_channel_ptrs, membership_ptr, &hw_ctx,\n                                     worker_lid);\n\n      if (group_membership_has_changed(&group_membership, worker_lid)) {\n        /// Complete inprogress updates/replays waiting for ACKS only from\n        /// failed nodes\n        hermes_batch_ops_to_KVS(local_ops_after_membership_change,\n                                (uint8_t*)ops, max_batch_size,\n                                sizeof(spacetime_op_t), group_membership, NULL,\n                                NULL, (uint8_t)worker_lid);\n\n        stop_latency_of_completed_writes(ops, worker_lid,\n                                         &stopwatch_for_req_latency);\n\n        if (!DISABLE_VALS_FOR_DEBUGGING)\n          /// Bcast VAL msgs for those completed update/replays\n          has_remaining_vals_from_memb_change = wings_issue_pkts(\n              val_ud_c, membership_ptr, (uint8_t*)ops, max_batch_size,\n              sizeof(spacetime_op_t), NULL, memb_change_skip_or_get_sender_id,\n              memb_change_modify_elem_after_send,\n              memb_change_copy_and_modify_elem);\n      }\n    }\n    w_stats[worker_lid].total_loops++;\n  }\n}\n"
  },
  {
    "path": "src/hermes/main.c",
    "content": "#define _GNU_SOURCE\n#include <getopt.h>\n#include <infiniband/verbs.h>\n#include <malloc.h>\n#include <pthread.h>\n#include <stdio.h>\n#include \"../../include/utils/bit_vector.h\"\n#include \"../../include/utils/concur_ctrl.h\"\n#include \"../../include/wings/wings_api.h\"\n#include \"config.h\"\n#include \"hrd.h\"\n#include \"spacetime.h\"\n#include \"util.h\"\n\n// Global vars\nstruct latency_counters latency_count;\nvolatile struct worker_stats w_stats[MAX_WORKERS_PER_MACHINE];\n\ndbit_vector_t* g_share_qs_barrier;\nspacetime_group_membership group_membership;\n\n// Global config vars\nuint8_t is_CR;\nint num_workers;\nint update_ratio;\nint rmw_ratio;\nint credits_num;\nint max_coalesce;\nint max_batch_size;  // for batches to KVS\n\nint machine_num;\nint remote_machine_num;\nint worker_measuring_latency;\n\n// This is required only when Hades failure detection is disabled\nvoid\ngroup_membership_init(void)\n{\n  group_membership.num_of_alive_remotes = remote_machine_num;\n  seqlock_init(&group_membership.lock);\n  bv_init((bit_vector_t*)&group_membership.g_membership);\n\n  for (uint8_t i = 0; i < machine_num; ++i)\n    bv_bit_set((bit_vector_t*)&group_membership.g_membership, i);\n\n  bv_copy((bit_vector_t*)&group_membership.w_ack_init,\n          group_membership.g_membership);\n  bv_reverse((bit_vector_t*)&group_membership.w_ack_init);\n  bv_bit_set((bit_vector_t*)&group_membership.w_ack_init, (uint8_t)machine_id);\n}\n\nint\nmain(int argc, char* argv[])\n{\n  int i, c;\n  is_roce = -1;\n  machine_id = -1;\n\n  // config vars\n  is_CR = 1;\n  num_workers = -1;\n  update_ratio = -1;\n  rmw_ratio = -1;\n  credits_num = -1;\n  max_coalesce = -1;\n  max_batch_size = -1;\n  remote_IP = (char*)malloc(16 * sizeof(char));\n\n  machine_num = -1;\n  remote_machine_num = -1;\n  worker_measuring_latency = -1;\n\n  //\tgreen_printf(\"UD size: %d ibv_grh + crd size: %d \\n\",\n  // sizeof(ud_req_crd_t), sizeof(struct ibv_grh) + 
sizeof(spacetime_crd_t));\n  //\tstatic_assert(sizeof(ud_req_crd_t) == sizeof(struct ibv_grh) +\n  // sizeof(spacetime_crd_t), \"\"); ///CRD --> 48 Bytes instead of 43\n\n  struct thread_params* param_arr;\n  pthread_t* thread_arr;\n\n  static struct option opts[] = {\n      {.name = \"machine-id\", .has_arg = 1, .val = 'm'},\n      {.name = \"lat-worker\", .has_arg = 1, .val = 'l'},\n      {.name = \"is-roce\", .has_arg = 1, .val = 'r'},\n      {.name = \"rmw-ratio\", .has_arg = 1, .val = 'R'},\n      {.name = \"dev-name\", .has_arg = 1, .val = 'd'},\n      {.name = \"write-ratio\", .has_arg = 1, .val = 'w'},\n      {.name = \"num-workers\", .has_arg = 1, .val = 'W'},\n      {.name = \"num-machines\", .has_arg = 1, .val = 'M'},\n      {.name = \"credits\", .has_arg = 1, .val = 'c'},\n      {.name = \"max-coalesce\", .has_arg = 1, .val = 'C'},\n      {.name = \"max-batch-size\", .has_arg = 1, .val = 'b'},\n      {.name = \"hermes\", .has_arg = 0, .val = 'H'},\n      {0}};\n\n  /* Parse and check arguments */\n  while (1) {\n    c = getopt_long(argc, argv, \"m:r:l:R:d:w:c:C:W:M:b:H\", opts, NULL);\n    if (c == -1) break;\n\n    switch (c) {\n      case 'm':\n        machine_id = atoi(optarg);\n        break;\n      case 'r':\n        is_roce = atoi(optarg);\n        break;\n      case 'l':\n        worker_measuring_latency = atoi(optarg);\n        break;\n      case 'd':\n        memcpy(dev_name, optarg, strlen(optarg));\n        break;\n      // Config vars\n      case 'w':\n        update_ratio = atoi(optarg);\n        break;\n      case 'R':\n        rmw_ratio = atoi(optarg);\n        break;\n      case 'W':\n        num_workers = atoi(optarg);\n        break;\n      case 'c':\n        credits_num = atoi(optarg);\n        break;\n      case 'C':\n        max_coalesce = atoi(optarg);\n        break;\n      case 'b':\n        max_batch_size = atoi(optarg);\n        break;\n      case 'H':\n        is_CR = 0;\n        break;\n      case 'M':\n        machine_num = 
atoi(optarg);\n        remote_machine_num = machine_num - 1;\n        break;\n      default:\n        printf(\"Invalid argument %d\\n\", c);\n        assert(false);\n    }\n  }\n\n  // If arguments not passed use the default values from header file\n  if (update_ratio == -1) update_ratio = DEFAULT_UPDATE_RATIO;\n  if (rmw_ratio == -1) rmw_ratio = ENABLE_RMWs ? DEFAULT_RMW_RATIO : 0;\n  if (num_workers == -1) num_workers = DEFAULT_WORKERS_PER_MACHINE;\n  if (max_coalesce == -1) max_coalesce = MAX_REQ_COALESCE;\n  if (max_batch_size == -1) max_batch_size = MAX_BATCH_KVS_OPS_SIZE;\n  if (credits_num == -1)\n    credits_num = is_CR ? MAX_CREDITS_PER_REMOTE_WORKER_CR\n                        : MAX_CREDITS_PER_REMOTE_WORKER;\n  if (worker_measuring_latency == -1 && DEFAULT_MEASURE_LATENCY)\n    worker_measuring_latency = DEFAULT_WORKER_MEASURING_LATENCY;\n  if (machine_num == -1) {\n    machine_num = MAX_MACHINE_NUM;\n    remote_machine_num = MAX_REMOTE_MACHINES;\n  }\n\n  assert(ENABLE_RMWs || rmw_ratio == 0);\n  assert(rmw_ratio != 0 || ENABLE_RMWs == 0);\n  // WARNING: Some structs are statically allocated using\n  // MAX_WORKERS_PER_MACHINE / MAX_BATCH_KVS_OPS_SIZE\n  assert(num_workers <= MAX_WORKERS_PER_MACHINE);\n  assert(max_batch_size <= MAX_BATCH_KVS_OPS_SIZE);\n  assert(machine_num > 1 && machine_num <= MAX_MACHINE_NUM);\n  assert(worker_measuring_latency == -1 ||\n         worker_measuring_latency < num_workers);\n\n  assert(!ENABLE_VIRTUAL_NODE_IDS || VIRTUAL_NODE_IDS_PER_NODE > machine_num);\n  assert(!ENABLE_VIRTUAL_NODE_IDS ||\n         machine_num * VIRTUAL_NODE_IDS_PER_NODE < 255);\n\n  if (num_workers > 1)\n    dbv_init(&g_share_qs_barrier, (uint8_t)num_workers);\n  else\n    g_share_qs_barrier = NULL;\n\n  printf(\n      \"update rate: %d (RMW rate %d) | workers %d | batch size %d| CREDITS %d \"\n      \"| coalesce %d |\\n\",\n      update_ratio, rmw_ratio, num_workers, max_batch_size, credits_num,\n      max_coalesce);\n\n  thread_arr = 
malloc(num_workers * sizeof(pthread_t));\n  param_arr = malloc(num_workers * sizeof(struct thread_params));\n\n  pthread_attr_t attr;\n  cpu_set_t cpus_w;\n\n  group_membership_init();\n  init_stats((void*)w_stats);\n  spacetime_init(machine_id);\n\n  pthread_attr_init(&attr);\n  int w_core, init_core = SOCKET_TO_START_SPAWNING_THREADS;\n  for (i = 0; i < num_workers; i++) {\n    if (USE_ALL_SOCKETS && ENABLE_HYPERTHREADING)\n      w_core = init_core + i;\n    else\n      w_core = 2 * i + init_core;\n\n    assert(w_core < TOTAL_HW_CORES);\n    assert(ENABLE_HYPERTHREADING ||\n           w_core < TOTAL_NUMBER_OF_SOCKETS * TOTAL_CORES_PER_SOCKET);\n\n    param_arr[i].id = i;\n\n    CPU_ZERO(&cpus_w);\n    CPU_SET(w_core, &cpus_w);\n    pthread_attr_setaffinity_np(&attr, sizeof(cpu_set_t), &cpus_w);\n    pthread_create(&thread_arr[i], &attr, run_worker, &param_arr[i]);\n  }\n  colored_printf(YELLOW, \"Sizes: {Op: %d, Object Meta %d, Value %d},\\n\",\n                 sizeof(spacetime_op_t), sizeof(spacetime_object_meta),\n                 ST_VALUE_SIZE);\n  colored_printf(YELLOW, \"Coherence msg Sizes: {Inv: %d, Ack: %d, Val: %d}\\n\",\n                 sizeof(spacetime_inv_t), sizeof(spacetime_ack_t),\n                 sizeof(spacetime_val_t));\n  colored_printf(\n      YELLOW, \"Max Coalesce Packet Sizes: {Inv: %d, Ack: %d, Val: %d}\\n\",\n      sizeof(wings_ud_send_pkt_t) + max_coalesce * sizeof(spacetime_inv_t),\n      sizeof(wings_ud_send_pkt_t) + max_coalesce * sizeof(spacetime_ack_t),\n      sizeof(wings_ud_send_pkt_t) + max_coalesce * sizeof(spacetime_val_t));\n\n  for (i = 0; i < num_workers; i++)\n    pthread_join(thread_arr[i], NULL);\n\n  return 0;\n}\n\n//////////////////////////////////////////////////////////////////////////////////\n/// Static asserts to ensure only correct configs\n//////////////////////////////////////////////////////////////////////////////////\n\nstatic_assert(MICA_MAX_VALUE >= ST_VALUE_SIZE, \"\");\nstatic_assert(MAX_MACHINE_NUM 
<= 8,\n              \"\");  // TODO haven't test bit vectors with more than 8 nodes\nstatic_assert(MAX_MACHINE_NUM <= GROUP_MEMBERSHIP_ARRAY_SIZE * 8,\n              \"\");  // bit vector for acks / group membership\nstatic_assert(MAX_MACHINE_NUM <= 255, \"\");\n\nstatic_assert(KV_SOCKET < TOTAL_NUMBER_OF_SOCKETS &&\n                  SOCKET_TO_START_SPAWNING_THREADS < TOTAL_NUMBER_OF_SOCKETS,\n              \"\");\n\nstatic_assert((ENABLE_HYPERTHREADING == 1 && USE_ALL_SOCKETS == 1) ||\n                  MAX_WORKERS_PER_MACHINE <= TOTAL_CORES_PER_SOCKET,\n              \"\");\nstatic_assert(MAX_WORKERS_PER_MACHINE <= TOTAL_HW_CORES, \"\");\n\n/// Assertions for failures\nstatic_assert(FAKE_FAILURE == 0 || NODE_TO_FAIL < MAX_MACHINE_NUM, \"\");\nstatic_assert(FAKE_FAILURE == 0 ||\n                  ROUNDS_BEFORE_FAILURE < PRINT_NUM_STATS_BEFORE_EXITING,\n              \"\");\nstatic_assert(FAKE_FAILURE == 0 ||\n                  WORKER_WITH_FAILURE_DETECTOR < MAX_WORKERS_PER_MACHINE,\n              \"\");\n\nstatic_assert(MAX_MACHINE_NUM < TIE_BREAKER_ID_EMPTY, \"\");\nstatic_assert(MAX_MACHINE_NUM < LAST_WRITER_ID_EMPTY, \"\");\nstatic_assert(MAX_BATCH_KVS_OPS_SIZE < ST_OP_BUFFER_INDEX_EMPTY,\n              \"\");  /// 1B write_buffer_index and 255 is used as \"empty\" value\n\n/// Make sure that assigned numbers to States are monotonically increasing with\n/// the following order\nstatic_assert(VALID_STATE < INVALID_STATE, \"\");\nstatic_assert(INVALID_STATE < INVALID_WRITE_STATE, \"\");\nstatic_assert(INVALID_WRITE_STATE < WRITE_STATE, \"\");\nstatic_assert(WRITE_STATE < REPLAY_STATE, \"\");\n\nstatic_assert(ENABLE_RMWs == 0 || ENABLE_RMWs == 1, \"\");\n"
  },
  {
    "path": "src/hermes/spacetime.c",
    "content": "//\n// Created by akatsarakis on 04/05/18.\n//\n#include <config.h>\n#include <inline-util.h>\n#include <spacetime.h>\n#include <util.h>\n#include \"../../include/utils/concur_ctrl.h\"\n\n/*\n * Initialize the spacetime using a Mica instances and adding the timestamps\n * and locks to the keys of mica-herd-herd structure\n */\n\nstruct spacetime_kv kv;\n\nvoid\nspacetime_object_meta_init(spacetime_object_meta* ol)\n{\n  cctrl_init(&ol->cctrl);\n  ol->state = VALID_STATE;\n  ol->last_writer_id = LAST_WRITER_ID_EMPTY;\n  ol->op_buffer_index = ST_OP_BUFFER_INDEX_EMPTY;\n}\n\nvoid\nspacetime_init(int instance_id)\n{\n  // TODO may add kvs stats\n  mica_init(&kv.hash_table, instance_id, KV_SOCKET, SPACETIME_NUM_BKTS,\n            SPACETIME_LOG_CAP);\n  spacetime_populate_fixed_len(&kv, SPACETIME_NUM_KEYS, KVS_VALUE_SIZE);\n}\n\nvoid\nspacetime_populate_fixed_len(struct spacetime_kv* _kv, int n, int val_len)\n{\n  assert(n > 0);\n  assert(val_len > 0 && val_len <= KVS_VALUE_SIZE);\n\n  /* This is needed for the eviction message below to make sense */\n  assert(_kv->hash_table.num_insert_op == 0 &&\n         _kv->hash_table.num_index_evictions == 0);\n\n  struct mica_op op;\n  struct mica_resp resp;\n  unsigned long long* op_key = (unsigned long long*)&op.key;\n  spacetime_object_meta initial_meta;\n  spacetime_object_meta_init(&initial_meta);\n\n  /* Generate the keys to insert */\n  uint128* key_arr = mica_gen_keys(n);\n  op.val_len = (uint8_t)(val_len >> SHIFT_BITS);\n  op.opcode = ST_OP_PUT;\n  spacetime_object_meta* value_ptr = (spacetime_object_meta*)op.value;\n  memcpy((void*)value_ptr, (void*)&initial_meta, sizeof(spacetime_object_meta));\n  for (int i = n - 1; i >= 0; i--) {\n    op_key[0] = key_arr[i].first;\n    op_key[1] = key_arr[i].second;\n    /// printf(\"Key Metadata: Lock(%u), State(%u), Counter(%u:%u)\\n\",\n    /// op.key.meta.lock,\n    /// op.key.meta.state, op.key.meta.version, op.key.meta.cid);\n    uint8_t val = (uint8_t)('a' + (i 
% 20));\n\n    memset((void*)&value_ptr[1], val, ST_VALUE_SIZE);\n    mica_insert_one(&_kv->hash_table, &op, &resp);\n  }\n\n  assert(_kv->hash_table.num_insert_op == n);\n  colored_printf(YELLOW,\n                 \"Spacetime: Populated instance %d with %d keys, length = %d. \"\n                 \"Index eviction fraction = %.4f.\\n\",\n                 _kv->hash_table.instance_id, n, val_len,\n                 (double)_kv->hash_table.num_index_evictions /\n                     _kv->hash_table.num_insert_op);\n}\n"
  },
  {
    "path": "src/hermes/stats.c",
    "content": "#include \"util.h\"\n\nstatic inline void\nxput_file_name(char* filename)\n{\n  char* path = \"./results/xput/per-node\";\n\n  sprintf(filename, \"%s/%s_xPut_m_%d_wr_%.1f_rmw_%.1f_wk_%d_b_%d_c_%d%s-%d.txt\",\n          path, is_CR == 1 ? \"CR\" : \"Hermes\", machine_num, update_ratio / 10.0,\n          rmw_ratio / 10.0, num_workers, max_batch_size, credits_num,\n          FEED_FROM_TRACE == 1 ? \"_a_0.99\" : \"_uni\", machine_id);\n}\n\n// assuming microsecond latency\nvoid\ndump_xput_stats(double xput_in_miops)\n{\n  static uint8_t no_func_calls = 0;  /// WARNING this is not thread safe.\n\n  assert(no_func_calls < 250);\n\n  FILE* xput_stats_fd;\n  char filename[128];\n  xput_file_name(filename);\n\n  const char* open_mode = no_func_calls == 0 ? \"w\" : \"a\";\n  xput_stats_fd = fopen(filename, open_mode);\n\n  fprintf(xput_stats_fd, \"node%d_miops-%d: %.2f\\n\", machine_id, no_func_calls,\n          xput_in_miops);\n\n  fclose(xput_stats_fd);\n  no_func_calls++;\n\n  //    printf(\"xPut stats saved at %s\\n\", filename);\n}\n\n// assuming microsecond latency\nvoid\ndump_latency_stats(void)\n{\n  FILE* latency_stats_fd;\n  char filename[128];\n  char* path = \"./results/latency\";\n\n  sprintf(filename, \"%s/%s_latency_m_%d_w_%d_b_%d_wr_%d_rmw_%d_c_%d%s.csv\",\n          path, is_CR == 1 ? \"CR\" : \"Hermes\", machine_num, num_workers,\n          max_batch_size, update_ratio, rmw_ratio, credits_num,\n          FEED_FROM_TRACE == 1 ? 
\"_a_0.99\" : \"\");\n\n  latency_stats_fd = fopen(filename, \"w\");\n  fprintf(latency_stats_fd, \"#---------------- Read Reqs --------------\\n\");\n  for (int i = 0; i < LATENCY_BUCKETS; ++i)\n    fprintf(latency_stats_fd, \"reads: %d, %d\\n\", i * LATENCY_PRECISION,\n            latency_count.read_reqs[i]);\n  fprintf(latency_stats_fd, \"reads: -1, %d\\n\",\n          latency_count.read_reqs[LATENCY_BUCKETS]);  // print outliers\n  fprintf(latency_stats_fd, \"reads-hl: %d\\n\",\n          latency_count.max_read_latency);  // print max read latency\n\n  fprintf(latency_stats_fd, \"#---------------- Write Reqs ---------------\\n\");\n  for (int i = 0; i < LATENCY_BUCKETS; ++i)\n    fprintf(latency_stats_fd, \"writes: %d, %d\\n\", i * LATENCY_PRECISION,\n            latency_count.write_reqs[i]);\n  fprintf(latency_stats_fd, \"writes: -1, %d\\n\",\n          latency_count.write_reqs[LATENCY_BUCKETS]);  // print outliers\n  fprintf(latency_stats_fd, \"writes-hl: %d\\n\",\n          latency_count.max_write_latency);  // print max write latency\n\n  fclose(latency_stats_fd);\n\n  printf(\"Latency stats saved at %s\\n\", filename);\n}\n\nstatic inline double\nsafe_division(double a, double b)\n{\n  return b == 0 ? 
0 : a / b;\n}\n\nvoid*\nprint_stats_thread(void* no_arg)\n{\n  uint16_t i, print_count = 0;\n  long long all_worker_xput = 0;\n  long long all_worker_wrs = 0;\n  long long all_worker_rmws = 0;\n  long long all_worker_aborted_rmws = 0;\n  double total_throughput = 0;\n  double total_rd_throughput = 0;\n  double total_rmw_aborts = 0;\n\n  double total_wr_throughput = 0;\n  double total_rmw_throughput = 0;\n  //    int sleep_time = 20;\n  struct worker_stats curr_w_stats[MAX_WORKERS_PER_MACHINE],\n      prev_w_stats[MAX_WORKERS_PER_MACHINE];\n  struct stats all_stats;\n  sleep(4);\n  memcpy(prev_w_stats, (void*)w_stats,\n         MAX_WORKERS_PER_MACHINE * (sizeof(struct worker_stats)));\n  struct timespec start, end;\n  clock_gettime(CLOCK_REALTIME, &start);\n  while (true) {\n    usleep(PRINT_STATS_EVERY_MSECS * 1000);\n    clock_gettime(CLOCK_REALTIME, &end);\n    double seconds = (end.tv_sec - start.tv_sec) +\n                     (double)(end.tv_nsec - start.tv_nsec) / 1000000000;\n    start = end;\n    memcpy(curr_w_stats, (void*)w_stats,\n           MAX_WORKERS_PER_MACHINE * (sizeof(struct worker_stats)));\n    all_worker_xput = 0;\n    all_worker_wrs = 0;\n    all_worker_rmws = 0;\n    all_worker_aborted_rmws = 0;\n    print_count++;\n    if (FAKE_FAILURE == 1 && machine_id == NODE_TO_FAIL &&\n        print_count == ROUNDS_BEFORE_FAILURE) {\n      colored_printf(RED, \"---------------------------------------\\n\");\n      colored_printf(RED, \"------------  NODE FAILED  ------------\\n\");\n      colored_printf(RED, \"---------------------------------------\\n\");\n      exit(0);\n    }\n    if (EXIT_ON_STATS_PRINT == 1 &&\n        print_count == PRINT_NUM_STATS_BEFORE_EXITING) {\n      if (worker_measuring_latency != -1 && machine_id == 0)\n        dump_latency_stats();\n      if (DUMP_XPUT_STATS_TO_FILE) {\n        char filename[128];\n        xput_file_name(filename);\n        printf(\"xPut stats (of this node) saved at %s\\n\", filename);\n      }\n      
printf(\"---------------------------------------\\n\");\n      printf(\"------------ RUN FINISHED -------------\\n\");\n      printf(\"---------------------------------------\\n\");\n      exit(0);\n    }\n    seconds *= MILLION;  // compute only MIOPS\n    for (i = 0; i < num_workers; i++) {\n      all_worker_xput += curr_w_stats[i].completed_ops_per_worker -\n                         prev_w_stats[i].completed_ops_per_worker;\n      all_worker_wrs += curr_w_stats[i].completed_wrs_per_worker -\n                        prev_w_stats[i].completed_wrs_per_worker;\n      all_worker_rmws += curr_w_stats[i].completed_rmws_per_worker -\n                         prev_w_stats[i].completed_rmws_per_worker;\n      all_worker_aborted_rmws += curr_w_stats[i].aborted_rmws_per_worker -\n                                 prev_w_stats[i].aborted_rmws_per_worker;\n      all_stats.xput_per_worker[i] =\n          (curr_w_stats[i].completed_ops_per_worker -\n           prev_w_stats[i].completed_ops_per_worker) /\n          seconds;\n      all_stats.rmw_xput_per_worker[i] =\n          (curr_w_stats[i].completed_rmws_per_worker -\n           prev_w_stats[i].completed_rmws_per_worker) /\n          seconds;\n      all_stats.rmw_abort_rate_per_worker[i] =\n          safe_division((curr_w_stats[i].aborted_rmws_per_worker -\n                         prev_w_stats[i].aborted_rmws_per_worker),\n                        (curr_w_stats[i].completed_rmws_per_worker -\n                         prev_w_stats[i].completed_rmws_per_worker));\n    }\n\n    memcpy(prev_w_stats, curr_w_stats,\n           MAX_WORKERS_PER_MACHINE * (sizeof(struct worker_stats)));\n    total_throughput = all_worker_xput / seconds;\n    total_wr_throughput = all_worker_wrs / seconds;\n    total_rmw_throughput = all_worker_rmws / seconds;\n    total_rmw_aborts = safe_division(all_worker_aborted_rmws, all_worker_rmws);\n    total_rd_throughput =\n        total_throughput - total_wr_throughput - total_rmw_throughput;\n    
printf(\"---------------PRINT %d time elapsed %.2f---------------\\n\",\n           print_count, seconds / MILLION);\n    colored_printf(GREEN,\n                   \"NODE MReqs/s: %.2f \\n(Rd|Wr|RMW: %.2f|%.2f|%.2f) | RMW \"\n                   \"aborts: %.2f%%)\\n\",\n                   total_throughput, total_rd_throughput, total_wr_throughput,\n                   total_rmw_throughput, 100 * total_rmw_aborts);\n    if (PRINT_WORKER_STATS) {\n      for (i = 0; i < num_workers; i++) {\n        //            yellow_printf(\"W%d: %.2f MIOPS-Batch %.2f(%.2f) -H %.2f -W\n        //            %llu -E %.2f -AC %.2f \\n\", i,\n        //            all_stats.xput_per_worker[i],\n        //            all_stats.batch_size_per_worker[i],\n        //                          all_stats.stalled_time_per_worker[i],\n        //                          trace_ratio, curr_w_stats[i].wasted_loops,\n        //                          all_stats.empty_reqs_per_worker[i],\n        //                          all_stats.average_coalescing_per_worker[i]);\n        all_stats.issued_invs_avg_coalesing[i] =\n            w_stats[i].issued_invs_per_worker /\n            (double)w_stats[i].issued_packet_invs_per_worker;\n        all_stats.issued_acks_avg_coalesing[i] =\n            w_stats[i].issued_acks_per_worker /\n            (double)w_stats[i].issued_packet_acks_per_worker;\n        all_stats.issued_vals_avg_coalesing[i] =\n            w_stats[i].issued_vals_per_worker /\n            (double)w_stats[i].issued_packet_vals_per_worker;\n        all_stats.issued_crds_avg_coalesing[i] =\n            w_stats[i].issued_crds_per_worker /\n            (double)w_stats[i].issued_packet_crds_per_worker;\n\n        all_stats.received_invs_avg_coalesing[i] =\n            w_stats[i].received_invs_per_worker /\n            (double)w_stats[i].received_packet_invs_per_worker;\n        all_stats.received_acks_avg_coalesing[i] =\n            w_stats[i].received_acks_per_worker /\n            
(double)w_stats[i].received_packet_acks_per_worker;\n        all_stats.received_vals_avg_coalesing[i] =\n            w_stats[i].received_vals_per_worker /\n            (double)w_stats[i].received_packet_vals_per_worker;\n        all_stats.received_crds_avg_coalesing[i] =\n            w_stats[i].received_crds_per_worker /\n            (double)w_stats[i].received_packet_crds_per_worker;\n\n        all_stats.percentage_of_wasted_loops[i] =\n            w_stats[i].wasted_loops / (double)w_stats[i].total_loops * 100;\n        all_stats.completed_reqs_per_loop[i] =\n            curr_w_stats[i].completed_ops_per_worker /\n            (double)w_stats[i].total_loops;\n        colored_printf(CYAN, \"W%d: \", i);\n        colored_printf(YELLOW,\n                       \"%.2f MIOPS, Coalescing{Inv: %.2f, Ack: %.2f, Val: \"\n                       \"%.2f, Crd: %.2f}\\n\",\n                       all_stats.xput_per_worker[i],\n                       all_stats.issued_invs_avg_coalesing[i],\n                       all_stats.issued_acks_avg_coalesing[i],\n                       all_stats.issued_vals_avg_coalesing[i],\n                       all_stats.issued_crds_avg_coalesing[i]);\n        colored_printf(YELLOW,\n                       \"\\t wasted_loops: %.2f%%, reqs per loop: %.2f, total \"\n                       \"reqs %d, reqs missed: %d\\n\",\n                       all_stats.percentage_of_wasted_loops[i],\n                       all_stats.completed_reqs_per_loop[i],\n                       curr_w_stats[i].completed_ops_per_worker,\n                       curr_w_stats[i].reqs_missed_in_kvs);\n      }\n      colored_printf(GREEN, \"NODE MReqs/s: %.2f \\n\", total_throughput);\n      printf(\"---------------------------------------\\n\");\n    }\n\n    if (DUMP_XPUT_STATS_TO_FILE) dump_xput_stats(total_throughput);\n  }\n}\n"
  },
  {
    "path": "src/hermes/util.c",
    "content": "//\n// Created by akatsarakis on 15/03/18.\n//\n#define _GNU_SOURCE\n\n#include \"util.h\"\n#include \"hrd.h\"\n#include \"inline-util.h\"\n#include \"spacetime.h\"\n\nint\nspawn_stats_thread(void)\n{\n  pthread_t* thread_arr = malloc(sizeof(pthread_t));\n  pthread_attr_t attr;\n  cpu_set_t cpus_stats;\n  pthread_attr_init(&attr);\n  CPU_ZERO(&cpus_stats);\n\n  if (DEFAULT_THREAD_OF_STAT_THREAD != -1) {\n    CPU_SET(DEFAULT_THREAD_OF_STAT_THREAD, &cpus_stats);\n  } else {\n    if (MAX_WORKERS_PER_MACHINE > 17)\n      CPU_SET(39, &cpus_stats);\n    else\n      CPU_SET(2 * MAX_WORKERS_PER_MACHINE + 2, &cpus_stats);\n  }\n\n  pthread_attr_setaffinity_np(&attr, sizeof(cpu_set_t), &cpus_stats);\n  return pthread_create(&thread_arr[0], &attr, print_stats_thread, NULL);\n}\n\nuint8_t\nis_state_code(uint8_t code)\n{\n  switch (code) {\n    // Object States\n    case VALID_STATE:\n    case WRITE_STATE:\n    case REPLAY_STATE:\n    case INVALID_STATE:\n    case INVALID_WRITE_STATE:\n      return 1;\n    default:\n      return 0;\n  }\n}\n\nuint8_t\nis_input_code(uint8_t code)\n{\n  switch (code) {\n      // Input opcodes\n    case ST_OP_GET:\n    case ST_OP_PUT:\n    case ST_OP_RMW:\n    case ST_OP_INV:\n    case ST_OP_ACK:\n    case ST_OP_VAL:\n    case ST_OP_CRD:\n    case ST_OP_MEMBERSHIP_CHANGE:\n    case ST_OP_MEMBERSHIP_COMPLETE:\n      return 1;\n    default:\n      return 0;\n  }\n}\n\nuint8_t\nis_response_code(uint8_t code)\n{\n  switch (code) {\n    case ST_GET_COMPLETE:\n    case ST_PUT_SUCCESS:\n    case ST_PUT_COMPLETE:\n    case ST_REPLAY_SUCCESS:\n    case ST_REPLAY_COMPLETE:\n    case ST_INV_SUCCESS:\n    case ST_ACK_SUCCESS:\n    case ST_VAL_SUCCESS:\n    case ST_LAST_ACK_SUCCESS:\n    case ST_LAST_ACK_NO_BCAST_SUCCESS:\n    case ST_MISS:\n    case ST_GET_STALL:\n    case ST_PUT_STALL:\n    case ST_PUT_COMPLETE_SEND_VALS:\n    case ST_INV_OUT_OF_GROUP:\n    // RMW\n    case ST_RMW_ABORT:\n    case ST_RMW_STALL:\n    case ST_RMW_SUCCESS:\n    
case ST_RMW_COMPLETE:\n      return 1;\n    default:\n      return 0;\n  }\n}\n\nuint8_t\nis_bucket_state_code(uint8_t code)\n{\n  switch (code) {\n    case ST_NEW:\n    case ST_EMPTY:\n    case ST_COMPLETE:\n    case ST_IN_PROGRESS_GET:\n    case ST_IN_PROGRESS_PUT:\n    case ST_IN_PROGRESS_RMW:\n    case ST_IN_PROGRESS_REPLAY:\n      return 1;\n    default:\n      return 0;\n  }\n}\n\nchar*\ncode_to_str(uint8_t code)\n{\n  switch (code) {\n    // Object States\n    case VALID_STATE:\n      return \"VALID_STATE\";\n    case INVALID_STATE:\n      return \"INVALID_STATE\";\n    case INVALID_WRITE_STATE:\n      return \"INVALID_WRITE_STATE\";\n    case WRITE_STATE:\n      return \"WRITE_STATE\";\n    case REPLAY_STATE:\n      return \"REPLAY_STATE\";\n    // Input opcodes\n    case ST_OP_GET:\n      return \"ST_OP_GET\";\n    case ST_OP_PUT:\n      return \"ST_OP_PUT\";\n    case ST_OP_RMW:\n      return \"ST_OP_RMW\";\n    case ST_OP_INV:\n      return \"ST_OP_INV\";\n    case ST_OP_INV_ABORT:\n      return \"ST_OP_INV_ABORT\";\n    case ST_OP_ACK:\n      return \"ST_OP_ACK\";\n    case ST_OP_VAL:\n      return \"ST_OP_VAL\";\n    case ST_OP_CRD:\n      return \"ST_OP_CRD\";\n    case ST_OP_MEMBERSHIP_CHANGE:\n      return \"ST_OP_MEMBERSHIP_CHANGE\";\n    case ST_OP_MEMBERSHIP_COMPLETE:\n      return \"ST_OP_MEMBERSHIP_COMPLETE\";\n    // Response opcodes\n    case ST_GET_COMPLETE:\n      return \"ST_GET_COMPLETE\";\n    case ST_PUT_SUCCESS:\n      return \"ST_PUT_SUCCESS\";\n    case ST_PUT_COMPLETE:\n      return \"ST_PUT_COMPLETE\";\n    case ST_RMW_SUCCESS:\n      return \"ST_RMW_SUCCESS\";\n    case ST_RMW_COMPLETE:\n      return \"ST_RMW_COMPLETE\";\n    case ST_REPLAY_SUCCESS:\n      return \"ST_REPLAY_SUCCESS\";\n    case ST_REPLAY_COMPLETE:\n      return \"ST_REPLAY_COMPLETE\";\n    case ST_INV_SUCCESS:\n      return \"ST_INV_SUCCESS\";\n    case ST_ACK_SUCCESS:\n      return \"ST_ACK_SUCCESS\";\n    case ST_VAL_SUCCESS:\n      return \"ST_VAL_SUCCESS\";\n 
   case ST_LAST_ACK_SUCCESS:\n      return \"ST_LAST_ACK_SUCCESS\";\n    case ST_LAST_ACK_NO_BCAST_SUCCESS:\n      return \"ST_LAST_ACK_NO_BCAST_SUCCESS\";\n    case ST_MISS:\n      return \"\\033[31mST_MISS\\033[0m\";\n    case ST_GET_STALL:\n      return \"ST_GET_STALL\";\n    case ST_PUT_STALL:\n      return \"ST_PUT_STALL\";\n    case ST_RMW_STALL:\n      return \"ST_RMW_STALL\";\n    case ST_RMW_ABORT:\n      return \"ST_RMW_ABORT\";\n    case ST_PUT_COMPLETE_SEND_VALS:\n      return \"ST_PUT_COMPLETE_SEND_VALS\";\n    case ST_RMW_COMPLETE_SEND_VALS:\n      return \"ST_RMW_COMPLETE_SEND_VALS\";\n    case ST_REPLAY_COMPLETE_SEND_VALS:\n      return \"ST_REPLAY_COMPLETE_SEND_VALS\";\n    case ST_INV_OUT_OF_GROUP:\n      return \"ST_INV_OUT_OF_GROUP\";\n    case ST_SEND_CRD:\n      return \"ST_SEND_CRD\";\n    // Ops bucket states\n    case ST_EMPTY:\n      return \"ST_EMPTY\";\n    case ST_NEW:\n      return \"ST_NEW\";\n    case ST_IN_PROGRESS_PUT:\n      return \"ST_IN_PROGRESS_PUT\";\n    case ST_IN_PROGRESS_RMW:\n      return \"ST_IN_PROGRESS_RMW\";\n    case ST_IN_PROGRESS_REPLAY:\n      return \"ST_IN_PROGRESS_REPLAY\";\n    case ST_COMPLETE:\n      return \"ST_COMPLETE\";\n    // Buffer Types\n    case ST_INV_BUFF:\n      return \"ST_INV_BUFF\";\n    case ST_ACK_BUFF:\n      return \"ST_ACK_BUFF\";\n    case ST_VAL_BUFF:\n      return \"ST_VAL_BUFF\";\n    case ST_CRD_BUFF:\n      return \"ST_CRD_BUFF\";\n    case NOP:\n      return \"NOP\";\n    // Failure related\n    case ST_OP_HEARTBEAT:\n      return \"ST_OP_HEARTBEAT\";\n    case ST_OP_SUSPICION:\n      return \"ST_OP_SUSPICION\";\n    default: {\n      printf(\"Wrong code (%d)\\n\", code);\n      assert(0);\n    }\n  }\n}\n\n// Creates a trace with a uniform distribution without a backing file\nvoid\ncreate_uni_trace(struct spacetime_trace_command** cmds, int worker_gid)\n{\n  srand(time(NULL) + worker_gid * 7);\n  *cmds =\n      malloc((NUM_OF_REP_REQS + 1) * sizeof(struct 
spacetime_trace_command));\n  int rmws = 0;\n\n  uint32_t i, writes = 0;\n  // generate the requests one by one and insert them into cmds.\n  for (i = 0; i < NUM_OF_REP_REQS; i++) {\n    // Decide whether this request is going to be a read or a write\n    (*cmds)[i].opcode =\n        (uint8_t)(update_ratio == 1000 || ((rand() % 1000 < update_ratio))\n                      ? ST_OP_PUT\n                      : ST_OP_GET);\n\n    if (ENABLE_RMWs && (*cmds)[i].opcode == ST_OP_PUT)\n      (*cmds)[i].opcode =\n          (uint8_t)(rmw_ratio == 1000 || ((rand() % 1000 < rmw_ratio))\n                        ? ST_OP_RMW\n                        : ST_OP_PUT);\n\n    if ((*cmds)[i].opcode == ST_OP_RMW) rmws++;\n    if ((*cmds)[i].opcode == ST_OP_PUT) writes++;\n\n    //--- KEY ID----------\n    uint32 key_id = KEY_NUM != 0 ? (uint32)rand() % KEY_NUM\n                                 : (uint32)rand() % SPACETIME_NUM_KEYS;\n    if (USE_A_SINGLE_KEY == 1) key_id = 0;\n    uint128 key_hash = CityHash128((char*)&(key_id), 4);\n    //        memcpy(&(*cmds)[i].key_hash, &key_hash, 16); // this is for 16B\n    //        keys\n    memcpy(&(*cmds)[i].key_hash, &((uint64_t*)&key_hash)[1], 8);\n    (*cmds)[i].key_id =\n        (uint8_t)(key_id < 255 ? 
key_id : ST_KEY_ID_255_OR_HIGHER);\n  }\n\n  if (worker_gid % num_workers == 0)\n    printf(\n        \"Update Ratio: %.2f%% (Writes|RMWs: %.2f%%|%.2f%%)\\n\"\n        \"Trace w_size %d \\n\",\n        (double)((writes + rmws) * 100) / NUM_OF_REP_REQS,\n        (double)(writes * 100) / NUM_OF_REP_REQS,\n        (double)(rmws * 100) / NUM_OF_REP_REQS, NUM_OF_REP_REQS);\n  (*cmds)[NUM_OF_REP_REQS].opcode = NOP;\n  // printf(\"CLient %d Trace w_size: %d, debug counter %d hot keys %d, cold keys\n  // %d \\n\",l_id, cmd_count, debug_cnt,\n  //         t_stats[l_id].hot_keys_per_trace, t_stats[l_id].cold_keys_per_trace\n  //         );\n}\n\n// Parse a trace, use this only for skewed workloads as uniform trace can be\n// created (see create_uni_trace)\nint\nparse_trace(char* path, struct spacetime_trace_command** cmds, int worker_gid)\n{\n  FILE* fp;\n  ssize_t read;\n  size_t len = 0;\n  char* ptr;\n  char* word;\n  char* saveptr;\n  char* line = NULL;\n  int rmws = 0;\n  int writes = 0;\n  int cmd_count = 0;\n  uint32_t hottest_key_counter = 0;\n  uint32_t ten_hottest_keys_counter = 0;\n  uint32_t twenty_hottest_keys_counter = 0;\n\n  fp = fopen(path, \"r\");\n  if (fp == NULL) {\n    printf(\"ERROR: Cannot open file: %s\\n\", path);\n    exit(EXIT_FAILURE);\n  }\n\n  while ((read = getline(&line, &len, fp)) != -1)\n    cmd_count++;\n\n  //    printf(\"File %s has %d lines \\n\", path, cmd_count);\n\n  fclose(fp);\n  if (line) free(line);\n\n  len = 0;\n  line = NULL;\n\n  fp = fopen(path, \"r\");\n  if (fp == NULL) {\n    printf(\"ERROR: Cannot open file: %s\\n\", path);\n    exit(EXIT_FAILURE);\n  }\n\n  (*cmds) = malloc((cmd_count + 1) * sizeof(struct spacetime_trace_command));\n\n  // Initialize random with a seed based on local time and a worker / machine id\n  srand((unsigned int)(time(NULL) + worker_gid * 7));\n\n  int debug_cnt = 0;\n  // parse file line by line and insert trace to cmd.\n  for (int i = 0; i < cmd_count; i++) {\n    if ((read = getline(&line, 
&len, fp)) == -1) {\n      printf(\"ERROR: Problem while reading the trace!\\n\");\n      exit(1);\n    }\n    int word_count = 0;\n    assert(word_count == 0);\n    word = strtok_r(line, \" \", &saveptr);\n\n    // Before reading the request decide if it's going to be a read or a write\n    (*cmds)[i].opcode =\n        (uint8_t)(update_ratio == 1000 || ((rand() % 1000 < update_ratio))\n                      ? ST_OP_PUT\n                      : ST_OP_GET);\n\n    if (ENABLE_RMWs && (*cmds)[i].opcode == ST_OP_PUT)\n      (*cmds)[i].opcode =\n          (uint8_t)(rmw_ratio == 1000 || ((rand() % 1000 < rmw_ratio))\n                        ? ST_OP_RMW\n                        : ST_OP_PUT);\n\n    if ((*cmds)[i].opcode == ST_OP_PUT) writes++;\n    if ((*cmds)[i].opcode == ST_OP_RMW) rmws++;\n\n    while (word != NULL) {\n      if (word[strlen(word) - 1] == '\\n') word[strlen(word) - 1] = 0;\n\n      if (word_count == 0) {\n        uint32_t key_id = (uint32_t)strtoul(word, &ptr, 10);\n        if (key_id == 0) hottest_key_counter++;\n        if (key_id < 10) ten_hottest_keys_counter++;\n        if (key_id < 20) twenty_hottest_keys_counter++;\n        uint128 key_hash = CityHash128((char*)&(key_id), 4);\n        //              memcpy(&(*cmds)[i].key_hash, &key_hash, 16); // this is\n        //              for 16B keys\n        memcpy(&(*cmds)[i].key_hash, &((uint64_t*)&key_hash)[1],\n               8);  // this is for 8B keys\n        (*cmds)[i].key_id =\n            (uint8_t)(key_id < 255 ? 
key_id : ST_KEY_ID_255_OR_HIGHER);\n        debug_cnt++;\n      }\n\n      word_count++;\n      word = strtok_r(NULL, \" \", &saveptr);\n      if (word == NULL && word_count != 1) {\n        printf(\"Client %d Error: Reached word %d in line %d : %s \\n\",\n               worker_gid, word_count, i, line);\n        assert(0);\n      }\n    }\n  }\n\n  if (worker_gid % num_workers == 0) {\n    printf(\n        \"Trace size: %d | Hottest key (10 | 20 keys): %.2f%% (%.2f | %.2f \"\n        \"%%)\\n\",\n        cmd_count, (100 * hottest_key_counter / (double)cmd_count),\n        (100 * ten_hottest_keys_counter / (double)cmd_count),\n        (100 * twenty_hottest_keys_counter / (double)cmd_count));\n    printf(\"Update Ratio: %.2f%% (Writes|RMWs: %.2f%%|%.2f%%)\\n\",\n           (double)((writes + rmws) * 100) / cmd_count,\n           (double)(writes * 100) / cmd_count,\n           (double)(rmws * 100) / cmd_count);\n  }\n  (*cmds)[cmd_count].opcode = NOP;\n  // printf(\"Thread %d Trace w_size: %d, debug counter %d hot keys %d, cold keys\n  // %d \\n\",l_id, cmd_count, debug_cnt,\n  //         t_stats[l_id].hot_keys_per_trace, t_stats[l_id].cold_keys_per_trace\n  //         );\n  assert(cmd_count == debug_cnt);\n  fclose(fp);\n  if (line) free(line);\n  return cmd_count;\n}\n\nvoid\ntrace_init(struct spacetime_trace_command** trace, uint16_t worker_gid)\n{\n  // create the trace path\n  if (FEED_FROM_TRACE == 1) {\n    char local_client_id[6];\n    char machine_num[4];\n    // get / create path for the trace\n    sprintf(local_client_id, \"%d\", worker_gid % num_workers);\n    sprintf(machine_num, \"%d\", machine_id);\n    char path[2048];\n    char cwd[1024];\n    char* was_successful = getcwd(cwd, sizeof(cwd));\n\n    if (!was_successful) {\n      printf(\"ERROR: getcwd failed!\\n\");\n      exit(EXIT_FAILURE);\n    }\n\n    double zipf_exponent = ZIPF_EXPONENT_OF_TRACE / 100.0;\n\n    snprintf(path, sizeof(path), \"%s%s%04d%s%.2f%s\", cwd,\n             
\"/../../traces/current-splitted-traces/t_\", worker_gid, \"_a_\",\n             zipf_exponent, \".txt\");\n\n    // initialize the command array from the trace file\n    parse_trace(path, trace, worker_gid);\n  } else\n    create_uni_trace(trace, worker_gid);\n}\n\n// set up the OPS buffers\nvoid\nsetup_kvs_buffs(spacetime_op_t** ops, spacetime_inv_t** inv_recv_ops,\n                spacetime_ack_t** ack_recv_ops, spacetime_val_t** val_recv_ops)\n{\n  *ops = memalign(4096, MAX_BATCH_KVS_OPS_SIZE * (sizeof(spacetime_op_t)));\n  memset(*ops, 0, MAX_BATCH_KVS_OPS_SIZE * (sizeof(spacetime_op_t)));\n  assert(ops != NULL);\n\n  // Dirty way to support ACKs that might be as big as INVs\n  uint16_t ack_size =\n      ENABLE_RMWs ? sizeof(spacetime_inv_t) : sizeof(spacetime_ack_t);\n  spacetime_inv_t** rmw_ack_r_ops = (spacetime_inv_t**)ack_recv_ops;\n  /// Network ops\n  /// TODO should we memalign aswell?\n\n  uint32_t no_ops =\n      (uint32_t)(credits_num * MAX_REMOTE_MACHINES *\n                 max_coalesce);  // credits * remote_machines * max_req_coalesce\n  //    uint32_t no_ops = (uint32_t) (credits_num * remote_machine_num *\n  //    max_coalesce); //credits * remote_machines * max_req_coalesce\n  *inv_recv_ops = (spacetime_inv_t*)malloc(no_ops * sizeof(spacetime_inv_t));\n  *ack_recv_ops = (spacetime_ack_t*)malloc(no_ops * ack_size);\n  *val_recv_ops = (spacetime_val_t*)malloc(\n      no_ops *\n      sizeof(spacetime_val_t)); /* Batch of incoming broadcasts for the Cache*/\n  assert(*inv_recv_ops != NULL && *ack_recv_ops != NULL &&\n         *val_recv_ops != NULL);\n\n  memset(*inv_recv_ops, 0, no_ops * sizeof(spacetime_inv_t));\n  memset(*ack_recv_ops, 0, no_ops * sizeof(spacetime_ack_t));\n  memset(*val_recv_ops, 0, no_ops * sizeof(spacetime_val_t));\n\n  for (int i = 0; i < no_ops; ++i) {\n    (*val_recv_ops)[i].opcode = ST_EMPTY;\n    (*inv_recv_ops)[i].op_meta.opcode = ST_EMPTY;\n    if (ENABLE_RMWs == 0)\n      (*ack_recv_ops)[i].opcode = ST_EMPTY;\n    
else\n      (*rmw_ack_r_ops)[i].op_meta.opcode = ST_EMPTY;\n  }\n\n  for (int i = 0; i < MAX_BATCH_KVS_OPS_SIZE; ++i) {\n    (*ops)[i].op_meta.opcode = ST_EMPTY;\n    (*ops)[i].op_meta.state = ST_EMPTY;\n  }\n}\n"
  },
  {
    "path": "src/mica-herd/city.c",
    "content": "// city.c - cityhash-c\n// CityHash on C\n// Copyright (c) 2011-2012, Alexander Nusov\n//\n// - original copyright notice -\n// Copyright (c) 2011 Google, Inc.\n//\n// Permission is hereby granted, free of charge, to any person obtaining a copy\n// of this software and associated documentation files (the \"Software\"), to deal\n// in the Software without restriction, including without limitation the rights\n// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n// copies of the Software, and to permit persons to whom the Software is\n// furnished to do so, subject to the following conditions:\n//\n// The above copyright notice and this permission notice shall be included in\n// all copies or substantial portions of the Software.\n//\n// THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN\n// THE SOFTWARE.\n//\n// CityHash, by Geoff Pike and Jyrki Alakuijala\n//\n// This file provides CityHash64() and related functions.\n//\n// It's probably possible to create even faster hash functions by\n// writing a program that systematically explores some of the space of\n// possible hash functions, by using SIMD instructions, or by\n// compromising on hash quality.\n\n#include \"city.h\"\n#include <string.h>\n\nstatic uint64\nUNALIGNED_LOAD64(const char* p)\n{\n  uint64 result;\n  memcpy(&result, p, sizeof(result));\n  return result;\n}\n\nstatic uint32\nUNALIGNED_LOAD32(const char* p)\n{\n  uint32 result;\n  memcpy(&result, p, sizeof(result));\n  return result;\n}\n\n#if !defined(WORDS_BIGENDIAN)\n\n#define 
uint32_in_expected_order(x) (x)\n#define uint64_in_expected_order(x) (x)\n\n#else\n\n#ifdef _MSC_VER\n#include <stdlib.h>\n#define bswap_32(x) _byteswap_ulong(x)\n#define bswap_64(x) _byteswap_uint64(x)\n\n#elif defined(__APPLE__)\n// Mac OS X / Darwin features\n#include <libkern/OSByteOrder.h>\n#define bswap_32(x) OSSwapInt32(x)\n#define bswap_64(x) OSSwapInt64(x)\n\n#else\n#include <byteswap.h>\n#endif\n\n#define uint32_in_expected_order(x) (bswap_32(x))\n#define uint64_in_expected_order(x) (bswap_64(x))\n\n#endif  // WORDS_BIGENDIAN\n\n#if !defined(LIKELY)\n#if HAVE_BUILTIN_EXPECT\n#define LIKELY(x) (__builtin_expect(!!(x), 1))\n#else\n#define LIKELY(x) (x)\n#endif\n#endif\n\nstatic uint64\nFetch64(const char* p)\n{\n  return uint64_in_expected_order(UNALIGNED_LOAD64(p));\n}\n\nstatic uint32\nFetch32(const char* p)\n{\n  return uint32_in_expected_order(UNALIGNED_LOAD32(p));\n}\n\n// Some primes between 2^63 and 2^64 for various uses.\nstatic const uint64 k0 = 0xc3a5c85c97cb3127ULL;\nstatic const uint64 k1 = 0xb492b66fbe98f273ULL;\nstatic const uint64 k2 = 0x9ae16a3b2f90404fULL;\nstatic const uint64 k3 = 0xc949d7c7509e6557ULL;\n\n// Hash 128 input bits down to 64 bits of output.\n// This is intended to be a reasonably good hash function.\nstatic inline uint64\nHash128to64(const uint128 x)\n{\n  // Murmur-inspired hashing.\n  const uint64 kMul = 0x9ddfea08eb382d69ULL;\n  uint64 a = (Uint128Low64(x) ^ Uint128High64(x)) * kMul;\n  a ^= (a >> 47);\n  uint64 b = (Uint128High64(x) ^ a) * kMul;\n  b ^= (b >> 47);\n  b *= kMul;\n  return b;\n}\n\n// Bitwise right rotate.  Normally this will compile to a single\n// instruction, especially if the shift is a manifest constant.\nstatic uint64\nRotate(uint64 val, int shift)\n{\n  // Avoid shifting by 64: doing so yields an undefined result.\n  return shift == 0 ? 
val : ((val >> shift) | (val << (64 - shift)));\n}\n\n// Equivalent to Rotate(), but requires the second arg to be non-zero.\n// On x86-64, and probably others, it's possible for this to compile\n// to a single instruction if both args are already in registers.\nstatic uint64\nRotateByAtLeast1(uint64 val, int shift)\n{\n  return (val >> shift) | (val << (64 - shift));\n}\n\nstatic uint64\nShiftMix(uint64 val)\n{\n  return val ^ (val >> 47);\n}\n\nstatic uint64\nHashLen16(uint64 u, uint64 v)\n{\n  uint128 result;\n  result.first = u;\n  result.second = v;\n  return Hash128to64(result);\n}\n\nstatic uint64\nHashLen0to16(const char* s, size_t len)\n{\n  if (len > 8) {\n    uint64 a = Fetch64(s);\n    uint64 b = Fetch64(s + len - 8);\n    return HashLen16(a, RotateByAtLeast1(b + len, (int)len)) ^ b;\n  }\n  if (len >= 4) {\n    uint64 a = Fetch32(s);\n    return HashLen16(len + (a << 3), Fetch32(s + len - 4));\n  }\n  if (len > 0) {\n    uint8 a = (uint8)s[0];\n    uint8 b = (uint8)s[len >> 1];\n    uint8 c = (uint8)s[len - 1];\n    uint32 y = (uint32)(a) + ((uint32)(b) << 8);\n    uint32 z = (uint32)len + ((uint32)(c) << 2);\n    return ShiftMix(y * k2 ^ z * k3) * k2;\n  }\n  return k2;\n}\n\n// This probably works well for 16-byte strings as well, but it may be overkill\n// in that case.\nstatic uint64\nHashLen17to32(const char* s, size_t len)\n{\n  uint64 a = Fetch64(s) * k1;\n  uint64 b = Fetch64(s + 8);\n  uint64 c = Fetch64(s + len - 8) * k2;\n  uint64 d = Fetch64(s + len - 16) * k0;\n  return HashLen16(Rotate(a - b, 43) + Rotate(c, 30) + d,\n                   a + Rotate(b ^ k3, 20) - c + len);\n}\n\n// Return a 16-byte hash for 48 bytes.  
Quick and dirty.\n// Callers do best to use \"random-looking\" values for a and b.\n// static pair<uint64, uint64> WeakHashLen32WithSeeds(\nuint128\nWeakHashLen32WithSeeds6(uint64 w, uint64 x, uint64 y, uint64 z, uint64 a,\n                        uint64 b)\n{\n  a += w;\n  b = Rotate(b + a + z, 21);\n  uint64 c = a;\n  a += x;\n  a += y;\n  b += Rotate(a, 44);\n\n  uint128 result;\n  result.first = (uint64)(a + z);\n  result.second = (uint64)(b + c);\n  return result;\n}\n\n// Return a 16-byte hash for s[0] ... s[31], a, and b.  Quick and dirty.\n// static pair<uint64, uint64> WeakHashLen32WithSeeds(\nuint128\nWeakHashLen32WithSeeds(const char* s, uint64 a, uint64 b)\n{\n  return WeakHashLen32WithSeeds6(Fetch64(s), Fetch64(s + 8), Fetch64(s + 16),\n                                 Fetch64(s + 24), a, b);\n}\n\n// Return an 8-byte hash for 33 to 64 bytes.\nstatic uint64\nHashLen33to64(const char* s, size_t len)\n{\n  uint64 z = Fetch64(s + 24);\n  uint64 a = Fetch64(s) + (len + Fetch64(s + len - 16)) * k0;\n  uint64 b = Rotate(a + z, 52);\n  uint64 c = Rotate(a, 37);\n  a += Fetch64(s + 8);\n  c += Rotate(a, 7);\n  a += Fetch64(s + 16);\n  uint64 vf = a + z;\n  uint64 vs = b + Rotate(a, 31) + c;\n  a = Fetch64(s + 16) + Fetch64(s + len - 32);\n  z = Fetch64(s + len - 8);\n  b = Rotate(a + z, 52);\n  c = Rotate(a, 37);\n  a += Fetch64(s + len - 24);\n  c += Rotate(a, 7);\n  a += Fetch64(s + len - 16);\n  uint64 wf = a + z;\n  uint64 ws = b + Rotate(a, 31) + c;\n  uint64 r = ShiftMix((vf + ws) * k2 + (wf + vs) * k0);\n  return ShiftMix(r * k0 + vs) * k2;\n}\n\nuint64\nCityHash64(const char* s, size_t len)\n{\n  if (len <= 32) {\n    if (len <= 16) {\n      return HashLen0to16(s, len);\n    } else {\n      return HashLen17to32(s, len);\n    }\n  } else if (len <= 64) {\n    return HashLen33to64(s, len);\n  }\n\n  // For strings over 64 bytes we hash the end first, and then as we\n  // loop we keep 56 bytes of state: v, w, x, y, and z.\n  uint64 x = Fetch64(s + len - 
40);\n  uint64 y = Fetch64(s + len - 16) + Fetch64(s + len - 56);\n  uint64 z = HashLen16(Fetch64(s + len - 48) + len, Fetch64(s + len - 24));\n  uint64 temp;\n  uint128 v = WeakHashLen32WithSeeds(s + len - 64, len, z);\n  uint128 w = WeakHashLen32WithSeeds(s + len - 32, y + k1, x);\n  x = x * k1 + Fetch64(s);\n\n  // Decrease len to the nearest multiple of 64, and operate on 64-byte chunks.\n  len = (len - 1) & ~(size_t)(63);\n  do {\n    x = Rotate(x + y + v.first + Fetch64(s + 8), 37) * k1;\n    y = Rotate(y + v.second + Fetch64(s + 48), 42) * k1;\n    x ^= w.second;\n    y += v.first + Fetch64(s + 40);\n    z = Rotate(z + w.first, 33) * k1;\n    v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first);\n    w = WeakHashLen32WithSeeds(s + 32, z + w.second, y + Fetch64(s + 16));\n    temp = z;\n    z = x;\n    x = temp;\n    s += 64;\n    len -= 64;\n  } while (len != 0);\n  return HashLen16(HashLen16(v.first, w.first) + ShiftMix(y) * k1 + z,\n                   HashLen16(v.second, w.second) + x);\n}\n\nuint64\nCityHash64WithSeed(const char* s, size_t len, uint64 seed)\n{\n  return CityHash64WithSeeds(s, len, k2, seed);\n}\n\nuint64\nCityHash64WithSeeds(const char* s, size_t len, uint64 seed0, uint64 seed1)\n{\n  return HashLen16(CityHash64(s, len) - seed0, seed1);\n}\n\n// A subroutine for CityHash128().  Returns a decent 128-bit hash for strings\n// of any length representable in signed long.  Based on City and Murmur.\nstatic uint128\nCityMurmur(const char* s, size_t len, uint128 seed)\n{\n  uint64 a = Uint128Low64(seed);\n  uint64 b = Uint128High64(seed);\n  uint64 c = 0;\n  uint64 d = 0;\n  signed long l = (signed long)(len - 16);\n  if (l <= 0) {  // len <= 16\n    a = ShiftMix(a * k1) * k1;\n    c = b * k1 + HashLen0to16(s, len);\n    d = ShiftMix(a + (len >= 8 ? 
Fetch64(s) : c));\n  } else {  // len > 16\n    c = HashLen16(Fetch64(s + len - 8) + k1, a);\n    d = HashLen16(b + len, c + Fetch64(s + len - 16));\n    a += d;\n    do {\n      a ^= ShiftMix(Fetch64(s) * k1) * k1;\n      a *= k1;\n      b ^= a;\n      c ^= ShiftMix(Fetch64(s + 8) * k1) * k1;\n      c *= k1;\n      d ^= c;\n      s += 16;\n      l -= 16;\n    } while (l > 0);\n  }\n  a = HashLen16(a, c);\n  b = HashLen16(d, b);\n\n  uint128 result;\n  result.first = (uint64)(a ^ b);\n  result.second = (uint64)(HashLen16(b, a));\n  return result;\n}\n\nuint128\nCityHash128WithSeed(const char* s, size_t len, uint128 seed)\n{\n  if (len < 128) {\n    return CityMurmur(s, len, seed);\n  }\n\n  // We expect len >= 128 to be the common case.  Keep 56 bytes of state:\n  // v, w, x, y, and z.\n  uint128 v, w;\n  uint64 x = Uint128Low64(seed);\n  uint64 y = Uint128High64(seed);\n  uint64 z = len * k1;\n  uint64 temp;\n  v.first = Rotate(y ^ k1, 49) * k1 + Fetch64(s);\n  v.second = Rotate(v.first, 42) * k1 + Fetch64(s + 8);\n  w.first = Rotate(y + z, 35) * k1 + x;\n  w.second = Rotate(x + Fetch64(s + 88), 53) * k1;\n\n  // This is the same inner loop as CityHash64(), manually unrolled.\n  do {\n    x = Rotate(x + y + v.first + Fetch64(s + 8), 37) * k1;\n    y = Rotate(y + v.second + Fetch64(s + 48), 42) * k1;\n    x ^= w.second;\n    y += v.first + Fetch64(s + 40);\n    z = Rotate(z + w.first, 33) * k1;\n    v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first);\n    w = WeakHashLen32WithSeeds(s + 32, z + w.second, y + Fetch64(s + 16));\n    temp = z;\n    z = x;\n    x = temp;\n    s += 64;\n    x = Rotate(x + y + v.first + Fetch64(s + 8), 37) * k1;\n    y = Rotate(y + v.second + Fetch64(s + 48), 42) * k1;\n    x ^= w.second;\n    y += v.first + Fetch64(s + 40);\n    z = Rotate(z + w.first, 33) * k1;\n    v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first);\n    w = WeakHashLen32WithSeeds(s + 32, z + w.second, y + Fetch64(s + 16));\n    temp = z;\n    z = x;\n    
x = temp;\n    s += 64;\n    len -= 128;\n  } while (LIKELY(len >= 128));\n  x += Rotate(v.first + z, 49) * k0;\n  z += Rotate(w.first, 37) * k0;\n  // If 0 < len < 128, hash up to 4 chunks of 32 bytes each from the end of s.\n  size_t tail_done;\n  for (tail_done = 0; tail_done < len;) {\n    tail_done += 32;\n    y = Rotate(x + y, 42) * k0 + v.second;\n    w.first += Fetch64(s + len - tail_done + 16);\n    x = x * k0 + w.first;\n    z += w.second + Fetch64(s + len - tail_done);\n    w.second += v.first;\n    v = WeakHashLen32WithSeeds(s + len - tail_done, v.first + z, v.second);\n  }\n  // At this point our 56 bytes of state should contain more than\n  // enough information for a strong 128-bit hash.  We use two\n  // different 56-byte-to-8-byte hashes to get a 16-byte final result.\n  x = HashLen16(x, v.first);\n  y = HashLen16(y + z, w.first);\n\n  uint128 result;\n  result.first = (uint64)(HashLen16(x + v.second, w.second) + y);\n  result.second = (uint64)HashLen16(x + w.second, y + v.second);\n  return result;\n}\n\nuint128\nCityHash128(const char* s, size_t len)\n{\n  uint128 r;\n  if (len >= 16) {\n    r.first = (uint64)(Fetch64(s) ^ k3);\n    r.second = (uint64)(Fetch64(s + 8));\n\n    return CityHash128WithSeed(s + 16, len - 16, r);\n\n  } else if (len >= 8) {\n    r.first = (uint64)(Fetch64(s) ^ (len * k0));\n    r.second = (uint64)(Fetch64(s + len - 8) ^ k1);\n\n    return CityHash128WithSeed(NULL, 0, r);\n  } else {\n    r.first = (uint64)k0;\n    r.second = (uint64)k1;\n    return CityHash128WithSeed(s, len, r);\n  }\n}\n\n#ifdef __SSE4_2__\n#include <nmmintrin.h>\n#include \"citycrc.h\"\n\n// Requires len >= 240.\nstatic void\nCityHashCrc256Long(const char* s, size_t len, uint32 seed, uint64* result)\n{\n  uint64 a = Fetch64(s + 56) + k0;\n  uint64 b = Fetch64(s + 96) + k0;\n  uint64 c = result[0] = HashLen16(b, len);\n  uint64 d = result[1] = Fetch64(s + 120) * k0 + len;\n  uint64 e = Fetch64(s + 184) + seed;\n  uint64 f = seed;\n  uint64 g = 0;\n  
uint64 h = 0;\n  uint64 i = 0;\n  uint64 j = 0;\n  uint64 t = c + d;\n\n  // 240 bytes of input per iter.\n  size_t iters = len / 240;\n  len -= iters * 240;\n  do {\n#define CHUNK(multiplier, z)                              \\\n  {                                                       \\\n    uint64 old_a = a;                                     \\\n    a = Rotate(b, 41 ^ z) * multiplier + Fetch64(s);      \\\n    b = Rotate(c, 27 ^ z) * multiplier + Fetch64(s + 8);  \\\n    c = Rotate(d, 41 ^ z) * multiplier + Fetch64(s + 16); \\\n    d = Rotate(e, 33 ^ z) * multiplier + Fetch64(s + 24); \\\n    e = Rotate(t, 25 ^ z) * multiplier + Fetch64(s + 32); \\\n    t = old_a;                                            \\\n  }                                                       \\\n  f = _mm_crc32_u64(f, a);                                \\\n  g = _mm_crc32_u64(g, b);                                \\\n  h = _mm_crc32_u64(h, c);                                \\\n  i = _mm_crc32_u64(i, d);                                \\\n  j = _mm_crc32_u64(j, e);                                \\\n  s += 40\n\n    CHUNK(1, 1);\n    CHUNK(k0, 0);\n    CHUNK(1, 1);\n    CHUNK(k0, 0);\n    CHUNK(1, 1);\n    CHUNK(k0, 0);\n  } while (--iters > 0);\n\n  while (len >= 40) {\n    CHUNK(k0, 0);\n    len -= 40;\n  }\n  if (len > 0) {\n    s = s + len - 40;\n    CHUNK(k0, 0);\n  }\n  j += i << 32;\n  a = HashLen16(a, j);\n  h += g << 32;\n  b += h;\n  c = HashLen16(c, f) + i;\n  d = HashLen16(d, e + result[0]);\n  j += e;\n  i += HashLen16(h, t);\n  e = HashLen16(a, d) + j;\n  f = HashLen16(b, c) + a;\n  g = HashLen16(j, i) + c;\n  result[0] = e + f + g + h;\n  a = ShiftMix((a + g) * k0) * k0 + b;\n  result[1] += a + result[0];\n  a = ShiftMix(a * k0) * k0 + c;\n  result[2] = a + result[1];\n  a = ShiftMix((a + e) * k0) * k0;\n  result[3] = a + result[2];\n}\n\n// Requires len < 240.\nstatic void\nCityHashCrc256Short(const char* s, size_t len, uint64* result)\n{\n  char buf[240];\n  
memcpy(buf, s, len);\n  memset(buf + len, 0, 240 - len);\n  CityHashCrc256Long(buf, 240, ~(uint32)(len), result);\n}\n\nvoid\nCityHashCrc256(const char* s, size_t len, uint64* result)\n{\n  if (LIKELY(len >= 240)) {\n    CityHashCrc256Long(s, len, 0, result);\n  } else {\n    CityHashCrc256Short(s, len, result);\n  }\n}\n\nuint128\nCityHashCrc128WithSeed(const char* s, size_t len, uint128 seed)\n{\n  if (len <= 900) {\n    return CityHash128WithSeed(s, len, seed);\n  } else {\n    uint64 result[4];\n    CityHashCrc256(s, len, result);\n    uint64 u = Uint128High64(seed) + result[0];\n    uint64 v = Uint128Low64(seed) + result[1];\n    uint128 crc;\n    crc.first = (uint64)(HashLen16(u, v + result[2]));\n    crc.second = (uint64)(HashLen16(Rotate(v, 32), u * k0 + result[3]));\n    return crc;\n  }\n}\n\nuint128\nCityHashCrc128(const char* s, size_t len)\n{\n  if (len <= 900) {\n    return CityHash128(s, len);\n  } else {\n    uint64 result[4];\n    CityHashCrc256(s, len, result);\n    uint128 crc;\n    crc.first = (uint64)result[2];\n    crc.second = (uint64)result[3];\n    return crc;\n  }\n}\n\n#endif\n"
  },
  {
    "path": "src/mica-herd/herd.c",
    "content": "#include \"hrd.h\"\n\n/* Every thread creates a TCP connection to the registry only once. */\n__thread memcached_st* memc = NULL;\n\n/*\n * Finds the port with rank `port_index` (0-based) in the list of ENABLED ports.\n * Fills its device id and device-local port id (1-based) into the supplied\n * control block.\n */\n\nchar dev_name[50];\nstruct ibv_device*\nhrd_resolve_port_index(struct hrd_ud_ctrl_blk* cb, int port_index)\n{\n  struct ibv_device** dev_list;\n  int num_devices = 0;\n\n  assert(port_index >= 0);\n\n  cb->device_id = -1;\n  cb->dev_port_id = -1;\n\n  dev_list = ibv_get_device_list(&num_devices);\n  CPE(!dev_list, \"HRD: Failed to get IB devices list\", 0);\n\n  int ports_to_discover = port_index;\n  int dev_i;\n\n  for (dev_i = 0; dev_i < num_devices; dev_i++) {\n    if (strcmp(dev_list[dev_i]->name, dev_name) != 0) continue;\n\n    struct ibv_context* ctx = ibv_open_device(dev_list[dev_i]);\n    CPE(!ctx, \"HRD: Couldn't open device\", 0);\n\n    struct ibv_device_attr device_attr;\n    memset(&device_attr, 0, sizeof(device_attr));\n    if (ibv_query_device(ctx, &device_attr)) {\n      printf(\"HRD: Could not query device: %d\\n\", dev_i);\n      exit(-1);\n    }\n\n    uint8_t port_i;\n    for (port_i = 1; port_i <= device_attr.phys_port_cnt; port_i++) {\n      /* Count this port only if it is enabled */\n      struct ibv_port_attr port_attr;\n      if (ibv_query_port(ctx, port_i, &port_attr) != 0) {\n        printf(\"HRD: Could not query port %d of device %d\\n\", port_i, dev_i);\n        exit(-1);\n      }\n\n      if (port_attr.phys_state != IBV_PORT_ACTIVE &&\n          port_attr.phys_state != IBV_PORT_ACTIVE_DEFER) {\n#ifndef __cplusplus\n        printf(\"HRD: Ignoring port %d of device %d. State is %s\\n\", port_i,\n               dev_i, ibv_port_state_str(port_attr.phys_state));\n#else\n        printf(\"HRD: Ignoring port %d of device %d. 
State is %s\\n\", port_i,\n               dev_i, ibv_port_state_str((ibv_port_state)port_attr.phys_state));\n#endif\n        continue;\n      }\n\n      if (ports_to_discover == 0) {\n        // printf(\"HRD: port index %d resolved to device %d, port %d\\n\",\n        // \tport_index, dev_i, port_i);\n\n        /* Fill the device ID and device-local port ID */\n        cb->device_id = dev_i;\n        cb->dev_port_id = port_i;\n\n        if (ibv_close_device(ctx)) {\n          fprintf(stderr, \"HRD: Couldn't release context\\n\");\n          assert(false);\n        }\n\n        return dev_list[cb->device_id];\n      }\n\n      ports_to_discover--;\n    }\n\n    if (ibv_close_device(ctx)) {\n      fprintf(stderr, \"HRD: Couldn't release context\\n\");\n      assert(false);\n    }\n  }\n\n  /* If we come here, port resolution failed */\n  assert(cb->device_id == -1 && cb->dev_port_id == -1);\n  fprintf(stderr, \"HRD: Invalid port index %d. Exiting.\\n\", port_index);\n  exit(-1);\n}\n\n/* Allocate SHM with @shm_key, and save the shmid into @shm_id_ret */\nvoid*\nhrd_malloc_socket(int shm_key, uint64_t size, int socket_id)\n{\n  int shmid;\n  int shm_flag =\n      IPC_CREAT | IPC_EXCL | 0666 | (USE_HUGE_PAGES == 1 ? 
SHM_HUGETLB : 0);\n\n  shmid = shmget(shm_key, size, shm_flag);\n\n  if (shmid == -1) {\n    switch (errno) {\n      case EACCES:\n        colored_printf(RED,\n                       \"HRD: SHM malloc error: Insufficient permissions.\"\n                       \" (SHM key = %d)\\n\",\n                       shm_key);\n        break;\n      case EEXIST:\n        colored_printf(RED,\n                       \"HRD: SHM malloc error: Already exists.\"\n                       \" (SHM key = %d)\\n\",\n                       shm_key);\n        break;\n      case EINVAL:\n        colored_printf(RED,\n                       \"HRD: SHM malloc error: SHMMAX/SHMIN mismatch.\"\n                       \" (SHM key = %d, size = %lu)\\n\",\n                       shm_key, size);\n        break;\n      case ENOMEM:\n        colored_printf(RED,\n                       \"HRD: SHM malloc error: Insufficient memory.\"\n                       \" (SHM key = %d, size = %lu)\\n\",\n                       shm_key, size);\n        break;\n      case ENOENT:\n        colored_printf(RED,\n                       \"HRD: SHM malloc error: No segment exists for the given \"\n                       \"key, and IPC_CREAT was not specified.\"\n                       \" (SHM key = %d, size = %lu)\\n\",\n                       shm_key, size);\n        break;\n      case ENOSPC:\n        colored_printf(\n            RED,\n            \"HRD: SHM malloc error: All possible shared memory IDs have been \"\n            \"taken or the limit of shared memory is exceeded.\"\n            \" (SHM key = %d, size = %lu)\\n\",\n            shm_key, size);\n        break;\n      case EPERM:\n        colored_printf(RED,\n                       \"HRD: SHM malloc error: The SHM_HUGETLB flag was \"\n                       \"specified, but the caller was not privileged\"\n                       \" (SHM key = %d, size = %lu)\\n\",\n                       shm_key, size);\n        break;\n      case ENFILE:\n        
colored_printf(RED,\n                       \"HRD: SHM malloc error: The system-wide limit on the \"\n                       \"total number of open files has been reached.\"\n                       \" (SHM key = %d, size = %lu)\\n\",\n                       shm_key, size);\n        break;\n      default:\n        colored_printf(RED, \"HRD: SHM malloc error: A wild SHM error: %s.\\n\",\n                       strerror(errno));\n        break;\n    }\n    assert(false);\n  }\n\n  void* buf = shmat(shmid, NULL, 0);\n  if (buf == NULL) {\n    printf(\"HRD: SHM malloc error: shmat() failed for key %d\\n\", shm_key);\n    exit(-1);\n  }\n\n  /* Bind the buffer to this socket */\n  const unsigned long nodemask = (1 << socket_id);\n  int ret = mbind(buf, size, MPOL_BIND, &nodemask, 32, 0);\n  if (ret != 0) {\n    printf(\"HRD: SHM malloc error. mbind() failed for key %d\\n\", shm_key);\n    exit(-1);\n  }\n\n  // vasilis- try to take advantage of TLB coalescing, if it is there\n  if (LEVERAGE_TLB_COALESCING) {\n    uint64_t page_no = CEILING(size, HUGE_PAGE_SIZE) - 1;\n    for (uint64_t i = 0; i < page_no; i++) {\n      uint8_t* buf_ptr = ((uint8_t*)buf) + (i * HUGE_PAGE_SIZE);\n      memset(buf_ptr, 0, 1);\n    }\n  }\n\n  return buf;\n}\n\n/* Free shm @shm_key and @shm_buf. Return 0 on success, else -1. */\nint\nhrd_free(int shm_key, void* shm_buf)\n{\n  int ret;\n  int shmid = shmget(shm_key, 0, 0);\n  if (shmid == -1) {\n    switch (errno) {\n      case EACCES:\n        printf(\n            \"HRD: SHM free error: Insufficient permissions.\"\n            \" (SHM key = %d)\\n\",\n            shm_key);\n        break;\n      case ENOENT:\n        printf(\"HRD: SHM free error: No such SHM key. 
(SHM key = %d)\\n\",\n               shm_key);\n        break;\n      default:\n        printf(\"HRD: SHM free error: A wild SHM error: %s\\n\", strerror(errno));\n        break;\n    }\n    return -1;\n  }\n\n  ret = shmctl(shmid, IPC_RMID, NULL);\n  if (ret != 0) {\n    printf(\"HRD: SHM free error: shmctl() failed for key %d\\n\", shm_key);\n    exit(-1);\n  }\n\n  ret = shmdt(shm_buf);\n  if (ret != 0) {\n    printf(\"HRD: SHM free error: shmdt() failed for key %d\\n\", shm_key);\n    exit(-1);\n  }\n\n  return 0;\n}\n\n/* Get the LID of a port on the device specified by @ctx */\nuint16_t\nhrd_get_local_lid(struct ibv_context* ctx, int dev_port_id)\n{\n  assert(ctx != NULL && dev_port_id >= 1);\n\n  struct ibv_port_attr attr;\n  if (ibv_query_port(ctx, dev_port_id, &attr)) {\n    printf(\"HRD: ibv_query_port on port %d of device %s failed! Exiting.\\n\",\n           dev_port_id, ibv_get_device_name(ctx->device));\n    assert(false);\n  }\n\n  return attr.lid;\n}\n\n/* Return the environment variable @name if it is set. Exit if not. 
*/\nchar*\nhrd_getenv(const char* name)\n{\n  char* env = getenv(name);\n  if (env == NULL) {\n    fprintf(stderr, \"Environment variable %s not set\\n\", name);\n    assert(false);\n  }\n\n  return env;\n}\n\nmemcached_st*\nhrd_create_memc()\n{\n  memcached_server_st* servers = NULL;\n  memcached_st* memc = memcached_create(NULL);\n  memcached_return rc;\n  memc = memcached_create(NULL);\n\n  char* registry_ip = hrd_getenv(\"HRD_REGISTRY_IP\");\n  //\tprintf(\"Appending server with IP: %s \\n\", registry_ip);\n  servers = memcached_server_list_append(servers, registry_ip,\n                                         MEMCACHED_DEFAULT_PORT, &rc);\n  // Pushes an array of memcached_server_st into the memcached_st structure.\n  // These servers will be placed at the end.\n  rc = memcached_server_push(memc, servers);\n  CPE(rc != MEMCACHED_SUCCESS, \"Couldn't add memcached server.\\n\", -1);\n\n  return memc;\n}\n\n/*\n * Insert key -> value mapping into memcached running at HRD_REGISTRY_IP.\n */\nvoid\nhrd_publish(const char* key, void* value, int len)\n{\n  assert(key != NULL && value != NULL && len > 0);\n  memcached_return rc;\n\n  if (memc == NULL) {\n    memc = hrd_create_memc();\n  }\n\n  rc = memcached_set(memc, key, strlen(key), (const char*)value, len, (time_t)0,\n                     (uint32_t)0);\n  if (rc != MEMCACHED_SUCCESS) {\n    char* registry_ip = hrd_getenv(\"HRD_REGISTRY_IP\");\n    fprintf(stderr,\n            \"\\tHRD: Failed to publish key %s to memcached. Error %s. \"\n            \"Reg IP = %s\\n\",\n            key, memcached_strerror(memc, rc), registry_ip);\n    exit(-1);\n  }\n}\n\n/*\n * Get the value associated with \"key\" into \"value\", and return the length\n * of the value. If the key is not found, return NULL and len -1. For all\n * other errors, terminate.\n *\n * This function sometimes gets called in a polling loop - ensure that there\n * are no memory leaks or unterminated memcached connections! 
We don't need\n * to free() the result of getenv() since it points to a string in the process\n * environment.\n */\nint\nhrd_get_published(const char* key, void** value)\n{\n  assert(key != NULL);\n  if (memc == NULL) {\n    memc = hrd_create_memc();\n  }\n\n  memcached_return rc;\n  size_t value_length;\n  uint32_t flags;\n\n  *value = memcached_get(memc, key, strlen(key), &value_length, &flags, &rc);\n\n  if (rc == MEMCACHED_SUCCESS) {\n    return (int)value_length;\n  } else if (rc == MEMCACHED_NOTFOUND) {\n    assert(*value == NULL);\n    return -1;\n  } else {\n    char* registry_ip = hrd_getenv(\"HRD_REGISTRY_IP\");\n    // char *registry_ip = is_client == 1 ? remote_IP : local_IP;\n    fprintf(stderr,\n            \"HRD: Error finding value for key \\\"%s\\\": %s. \"\n            \"Reg IP = %s\\n\",\n            key, memcached_strerror(memc, rc), registry_ip);\n    exit(-1);\n  }\n\n  /* Never reached */\n  assert(false);\n}\n\n/*\n * If @prealloc_conn_buf != NULL, @conn_buf_size is the size of the preallocated\n * buffer. 
If @prealloc_conn_buf == NULL, @conn_buf_size is the size of the\n * new buffer to create.\n */\nstruct hrd_ud_ctrl_blk*\nhrd_ud_ctrl_blk_init(int local_hid, int port_index,\n                     int numa_node_id, /* -1 means don't use hugepages */\n                     int num_dgram_qps, int dgram_buf_size,\n                     int dgram_buf_shm_key, int* recv_q_depth,\n                     int* send_q_depth)\n{\n  // colored_printf(RED,\"HRD: creating control block %d: port %d, socket %d, \"\n  // \t\"conn qps %d, UC %d, conn buf %d bytes (key %d), \"\n  // \t\"dgram qps %d, dgram buf %d bytes (key %d)\\n\",\n  // \tlocal_hid, port_index, numa_node_id,\n  // \tnum_conn_qps, use_uc, conn_buf_size, conn_buf_shm_key,\n  // \tnum_dgram_qps, dgram_buf_size, dgram_buf_shm_key);\n\n  /*\n   * Check arguments for sanity.\n   * @local_hid can be anything: it's just control block identifier that is\n   * useful in printing debug info.\n   */\n  assert(port_index >= 0 && port_index <= 16);\n  assert(numa_node_id >= -1 && numa_node_id <= 8);\n  assert(num_dgram_qps >= 0 && num_dgram_qps <= M_2);\n  assert(dgram_buf_size >= 0 && dgram_buf_size <= M_1024);\n\n  if (num_dgram_qps == 0) {\n    colored_printf(RED,\n                   \"HRD: Control block initialization without QPs. Are you\"\n                   \" sure you want to do this?\\n\");\n    assert(false);\n  }\n\n  struct hrd_ud_ctrl_blk* cb =\n      (struct hrd_ud_ctrl_blk*)malloc(sizeof(struct hrd_ud_ctrl_blk));\n  memset(cb, 0, sizeof(struct hrd_ud_ctrl_blk));\n\n  /* Fill in the control block */\n  cb->local_hid = local_hid;\n  cb->numa_node_id = numa_node_id;\n\n  cb->num_dgram_qps = num_dgram_qps;\n  cb->dgram_buf_shm_key = dgram_buf_shm_key;\n\n  cb->recv_q_depth = recv_q_depth;\n  cb->send_q_depth = send_q_depth;\n\n  /* Get the device to use. 
This fills in cb->device_id and cb->dev_port_id */\n  struct ibv_device* ib_dev = hrd_resolve_port_index(cb, port_index);\n  CPE(!ib_dev, \"HRD: IB device not found\", 0);\n\n  /* Use a single device context and PD for all QPs */\n  cb->ctx = ibv_open_device(ib_dev);\n  CPE(!cb->ctx, \"HRD: Couldn't get context\", 0);\n\n  cb->pd = ibv_alloc_pd(cb->ctx);\n  CPE(!cb->pd, \"HRD: Couldn't allocate PD\", 0);\n\n  int ib_flags = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ |\n                 IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_ATOMIC;\n\n  /*\n   * Create datagram QPs and transition them RTS.\n   * Create and register datagram RDMA buffer.\n   */\n  if (num_dgram_qps >= 1) {\n    cb->dgram_qp =\n        (struct ibv_qp**)malloc(num_dgram_qps * sizeof(struct ibv_qp*));\n    cb->dgram_send_cq =\n        (struct ibv_cq**)malloc(num_dgram_qps * sizeof(struct ibv_cq*));\n    cb->dgram_recv_cq =\n        (struct ibv_cq**)malloc(num_dgram_qps * sizeof(struct ibv_cq*));\n\n    assert(cb->dgram_qp != NULL && cb->dgram_send_cq != NULL &&\n           cb->dgram_recv_cq != NULL);\n\n    hrd_create_dgram_qps(cb);\n\n    /* Create and register dgram_buf */\n    int reg_size = 0;\n\n    if (numa_node_id >= 0) {\n      /* Hugepages */\n      while (reg_size < dgram_buf_size) {\n        reg_size += M_2;\n      }\n\n      /* SHM key 0 is hard to free later */\n      assert(dgram_buf_shm_key >= 1 && dgram_buf_shm_key <= 128);\n      cb->dgram_buf = (volatile uint8_t*)hrd_malloc_socket(\n          dgram_buf_shm_key, reg_size, numa_node_id);\n    } else {\n      reg_size = dgram_buf_size;\n      cb->dgram_buf = (volatile uint8_t*)memalign(4096, reg_size);\n    }\n\n    assert(cb->dgram_buf != NULL);\n    memset((char*)cb->dgram_buf, 0, reg_size);\n\n    cb->dgram_buf_mr =\n        ibv_reg_mr(cb->pd, (char*)cb->dgram_buf, reg_size, ib_flags);\n    assert(cb->dgram_buf_mr != NULL);\n  }\n\n  return cb;\n}\n\n/* Free up the resources taken by @cb. 
Return -1 if something fails, else 0. */\nint\nhrd_ud_ctrl_blk_destroy(struct hrd_ud_ctrl_blk* cb)\n{\n  int i;\n  colored_printf(RED, \"HRD: Destroying control block %d\\n\", cb->local_hid);\n\n  /* Destroy QPs and CQs. QPs must be destroyed before CQs. */\n  for (i = 0; i < cb->num_dgram_qps; i++) {\n    assert(cb->dgram_send_cq[i] != NULL && cb->dgram_recv_cq[i] != NULL);\n    assert(cb->dgram_qp[i] != NULL);\n\n    if (ibv_destroy_qp(cb->dgram_qp[i])) {\n      fprintf(stderr, \"HRD: Couldn't destroy dgram QP %d\\n\", i);\n      return -1;\n    } else\n      assert(0);\n  }\n\n  /* Destroy memory regions */\n  if (cb->num_dgram_qps > 0) {\n    assert(cb->dgram_buf_mr != NULL && cb->dgram_buf != NULL);\n    if (ibv_dereg_mr(cb->dgram_buf_mr)) {\n      fprintf(stderr, \"HRD: Couldn't deregister dgram MR for cb %d\\n\",\n              cb->local_hid);\n      return -1;\n    }\n\n    if (cb->numa_node_id >= 0) {\n      if (hrd_free(cb->dgram_buf_shm_key, (void*)cb->dgram_buf)) {\n        fprintf(stderr, \"HRD: Error freeing dgram hugepages for cb %d\\n\",\n                cb->local_hid);\n      }\n    } else {\n      free((void*)cb->dgram_buf);\n    }\n  }\n\n  /* Destroy protection domain */\n  if (ibv_dealloc_pd(cb->pd)) {\n    fprintf(stderr, \"HRD: Couldn't dealloc PD for cb %d\\n\", cb->local_hid);\n    return -1;\n  }\n\n  /* Destroy device context */\n  if (ibv_close_device(cb->ctx)) {\n    fprintf(stderr, \"Couldn't release context for cb %d\\n\", cb->local_hid);\n    return -1;\n  }\n\n  colored_printf(RED, \"HRD: Control block %d destroyed.\\n\", cb->local_hid);\n  return 0;\n}\n\n/* Create datagram QPs and transition them to RTS */\nvoid\nhrd_create_dgram_qps(struct hrd_ud_ctrl_blk* cb)\n{\n  int i;\n  assert(cb->dgram_qp != NULL && cb->dgram_send_cq != NULL &&\n         cb->dgram_recv_cq != NULL && cb->pd != NULL && cb->ctx != NULL);\n  assert(cb->num_dgram_qps >= 1 && cb->dev_port_id >= 1);\n\n  for (i = 0; i < cb->num_dgram_qps; i++) {\n    
cb->dgram_send_cq[i] =\n        ibv_create_cq(cb->ctx, cb->send_q_depth[i], NULL, NULL, 0);\n    assert(cb->dgram_send_cq[i] != NULL);\n\n    // <vasilis> I am replacing the recv_cq Depth\n    // cb->dgram_recv_cq[i] = ibv_create_cq(cb->ctx,\n    // \tHRD_Q_DEPTH, NULL, NULL, 0);\n    cb->dgram_recv_cq[i] =\n        ibv_create_cq(cb->ctx, cb->recv_q_depth[i], NULL, NULL, 0);\n    assert(cb->dgram_recv_cq[i] != NULL);\n    // </vasilis>\n\n    /* Initialize creation attributes */\n    struct ibv_qp_init_attr create_attr;\n    memset((void*)&create_attr, 0, sizeof(struct ibv_qp_init_attr));\n    // if (i > 0) printf(\"The recv queue %d has size %d, the send queue has size\n    // \t\t%d\\n\", i, cb->recv_q_depth[i], cb->send_q_depth[i] );\n    create_attr.send_cq = cb->dgram_send_cq[i];\n    create_attr.recv_cq = cb->dgram_recv_cq[i];\n    create_attr.qp_type = IBV_QPT_UD;\n\n    create_attr.cap.max_send_wr = cb->send_q_depth[i];\n    // <vasilis>\n    // printf(\"Receive q depth %d\\n\", cb->recv_q_depth);\n    create_attr.cap.max_recv_wr = cb->recv_q_depth[i];\n    // </vasilis>\n    create_attr.cap.max_send_sge = 1;\n    create_attr.cap.max_recv_sge = 1;\n    create_attr.cap.max_inline_data = HRD_MAX_INLINE;\n\n    cb->dgram_qp[i] = ibv_create_qp(cb->pd, &create_attr);\n    assert(cb->dgram_qp[i] != NULL);\n\n    /* INIT state */\n    struct ibv_qp_attr init_attr;\n    memset((void*)&init_attr, 0, sizeof(struct ibv_qp_attr));\n    init_attr.qp_state = IBV_QPS_INIT;\n    init_attr.pkey_index = 0;\n    init_attr.port_num = cb->dev_port_id;\n    init_attr.qkey = HRD_DEFAULT_QKEY;\n\n    if (ibv_modify_qp(\n            cb->dgram_qp[i], &init_attr,\n            IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_QKEY)) {\n      fprintf(stderr, \"Failed to modify dgram QP to INIT\\n\");\n      return;\n    }\n\n    /* RTR state */\n    struct ibv_qp_attr rtr_attr;\n    memset((void*)&rtr_attr, 0, sizeof(struct ibv_qp_attr));\n    rtr_attr.qp_state = IBV_QPS_RTR;\n\n 
   if (ibv_modify_qp(cb->dgram_qp[i], &rtr_attr, IBV_QP_STATE)) {\n      fprintf(stderr, \"Failed to modify dgram QP to RTR\\n\");\n      exit(-1);\n    }\n\n    /* Reuse rtr_attr for RTS */\n    rtr_attr.qp_state = IBV_QPS_RTS;\n    rtr_attr.sq_psn = HRD_DEFAULT_PSN;\n\n    if (ibv_modify_qp(cb->dgram_qp[i], &rtr_attr,\n                      IBV_QP_STATE | IBV_QP_SQ_PSN)) {\n      fprintf(stderr, \"Failed to modify dgram QP to RTS\\n\");\n      exit(-1);\n    }\n  }\n}\n\nvoid\nhrd_publish_dgram_qp(struct hrd_ud_ctrl_blk* cb, int n, const char* qp_name,\n                     uint8_t sl)\n{\n  assert(cb != NULL);\n  assert(n >= 0 && n < cb->num_dgram_qps);\n\n  assert(qp_name != NULL && strlen(qp_name) < HRD_QP_NAME_SIZE - 1);\n  assert(strstr(qp_name, HRD_RESERVED_NAME_PREFIX) == NULL);\n\n  int len = strlen(qp_name);\n  int i;\n  for (i = 0; i < len; i++) {\n    if (qp_name[i] == ' ') {\n      fprintf(stderr, \"HRD: Space not allowed in QP name\\n\");\n      exit(-1);\n    }\n  }\n\n  struct hrd_qp_attr qp_attr;\n  memcpy(qp_attr.name, qp_name, len);\n  qp_attr.name[len] = 0; /* Add the null terminator */\n  qp_attr.lid = hrd_get_local_lid(cb->dgram_qp[n]->context, cb->dev_port_id);\n  qp_attr.qpn = cb->dgram_qp[n]->qp_num;\n  qp_attr.sl = sl;\n\n  // <Vasilis>  ---ROCE----------\n  if (is_roce == 1) {\n    union ibv_gid ret_gid;\n    ibv_query_gid(cb->ctx, IB_PHYS_PORT, 0, &ret_gid);\n    qp_attr.gid_global_interface_id = ret_gid.global.interface_id;\n    qp_attr.gid_global_subnet_prefix = ret_gid.global.subnet_prefix;\n  }\n  // printf(\"Publishing datagram qp with name %s \\n\", qp_attr.name);\n  // </vasilis>\n\n  hrd_publish(qp_attr.name, &qp_attr, sizeof(struct hrd_qp_attr));\n}\n\nstruct hrd_qp_attr*\nhrd_get_published_qp(const char* qp_name)\n{\n  struct hrd_qp_attr* ret;\n  assert(qp_name != NULL && strlen(qp_name) < HRD_QP_NAME_SIZE - 1);\n  assert(strstr(qp_name, HRD_RESERVED_NAME_PREFIX) == NULL);\n\n  int len = strlen(qp_name);\n  int i;\n  for (i = 
0; i < len; i++) {\n    if (qp_name[i] == ' ') {\n      fprintf(stderr, \"HRD: Space not allowed in QP name\\n\");\n      exit(-1);\n    }\n  }\n\n  int ret_len = hrd_get_published(qp_name, (void**)&ret);\n\n  /*\n   * The registry lookup returns only if we get a unique QP for @qp_name, or\n   * if the memcached lookup succeeds but we don't have an entry for @qp_name.\n   */\n  assert(ret_len == sizeof(struct hrd_qp_attr) || ret_len == -1);\n\n  return ret;\n}\n\n//////////////////////////\n/// Fun-c-print\n//////////////////////////\n\n/* Like printf, but colorful. Limited to 1000 characters. */\nvoid\ncolored_printf(color_print_t color, const char* format, ...)\n{\n#define RED_LIM 1000\n  va_list args;\n  int i;\n\n  char buf1[RED_LIM], buf2[RED_LIM];\n  memset(buf1, 0, RED_LIM);\n  memset(buf2, 0, RED_LIM);\n\n  va_start(args, format);\n\n  /* Marshal the stuff to print in a buffer */\n  vsnprintf(buf1, RED_LIM, format, args);\n\n  /* Probably a bad check for buffer overflow */\n  for (i = RED_LIM - 1; i >= RED_LIM - 50; i--) {\n    assert(buf1[i] == 0);\n  }\n\n  /* Add markers for red color and reset color */\n  // snprintf(buf2, 1000, \"\\033[31m%s\\033[0m\", buf1);\n  snprintf(buf2, 1000, \"\\033[31m%s\\033[0m\", buf1);\n  switch (color) {\n    case YELLOW:\n      snprintf(buf2, 1000, \"\\033[33m%s\\033[0m\", buf1);\n      break;\n    case RED:\n      snprintf(buf2, 1000, \"\\033[31m%s\\033[0m\", buf1);\n      break;\n    case GREEN:\n      snprintf(buf2, 1000, \"\\033[32m%s\\033[0m\", buf1);\n      break;\n    case CYAN:\n      snprintf(buf2, 1000, \"\\033[36m%s\\033[0m\", buf1);\n      break;\n    default:\n      printf(\"Wrong printf color /%d \\n\", color);\n      assert(false);\n  }\n\n  /* Probably another bad check for buffer overflow */\n  for (i = RED_LIM - 1; i >= RED_LIM - 50; i--) {\n    assert(buf2[i] == 0);\n  }\n\n  printf(\"%s\", buf2);\n\n  va_end(args);\n}\n"
  },
  {
    "path": "src/mica-herd/mica.c",
    "content": "#include \"mica.h\"\n#include \"hrd.h\"\n\nint\nis_power_of_2(int x)\n{\n  return (x == 1 || x == 2 || x == 4 || x == 8 || x == 16 || x == 32 ||\n          x == 64 || x == 128 || x == 256 || x == 512 || x == 1024 ||\n          x == 2048 || x == 4096 || x == 8192 || x == 16384 || x == 32768 ||\n          x == 65536 || x == 131072 || x == 262144 || x == 524288 ||\n          x == 1048576 || x == 2097152 || x == 4194304 || x == 8388608 ||\n          x == 16777216 || x == 33554432 || x == 67108864 || x == 134217728 ||\n          x == 268435456 || x == 536870912 || x == 1073741824);\n}\n\nvoid\nmica_init(struct mica_kv* kv, int instance_id, int node_id, int num_bkts,\n          uint64_t log_cap)\n{\n  int i, j;\n\n  /* Verify struct sizes */\n  assert(sizeof(struct mica_slot) == 8);\n  assert(sizeof(struct mica_key) == 16);\n  assert(sizeof(struct mica_op) % 64 == 0);\n\n  assert(kv != NULL);\n  assert(node_id == 0 || node_id == 1);\n\n  /* 16 million buckets = a 1 GB index */\n  assert(is_power_of_2(num_bkts) == 1 && num_bkts <= M_128);\n  // assert(log_cap > 0 && log_cap <= M_1024 &&\n  //\tlog_cap % M_2 == 0 && is_power_of_2(log_cap));\n\n  assert(MICA_LOG_BITS >= 24); /* Minimum log size = 16 MB */\n\n  // red_printf(\"mica-herd-herd: Initializing MICA instance %d.\\n\"\n  // \t\"NUMA node = %d, buckets = %d (size = %u B), log capacity = %d B.\\n\",\n  // \tinstance_id,\n  // \tnode_id, num_bkts, num_bkts * sizeof(struct mica_bkt), log_cap);\n\n  if (MICA_DEBUG != 0) {\n    printf(\n        \"mica-herd-herd: Debug mode is ON! 
This might reduce performance.\\n\");\n    sleep(2);\n  }\n\n  /* Initialize metadata and stats */\n  kv->instance_id = instance_id;\n\n  kv->num_bkts = num_bkts;\n  kv->bkt_mask = num_bkts - 1; /* num_bkts is power of 2 */\n\n  kv->log_cap = log_cap;\n  kv->log_mask = log_cap - 1; /* log_cap is a power of 2 */\n  kv->log_head = 0;\n\n  kv->num_insert_op = 0;\n  kv->num_index_evictions = 0;\n\n  /* Alloc index and initialize all entries to invalid */\n  // printf(\"mica-herd-herd: Allocting hash table index for instance %d\\n\",\n  // instance_id);\n  int ht_index_key = MICA_INDEX_SHM_KEY + instance_id;\n  kv->ht_index = (struct mica_bkt*)hrd_malloc_socket(\n      ht_index_key, num_bkts * sizeof(struct mica_bkt), node_id);\n\n  for (i = 0; i < num_bkts; i++) {\n    for (j = 0; j < 8; j++) {\n      kv->ht_index[i].slots[j].in_use = 0;\n    }\n  }\n\n  /* Alloc log */\n  // printf(\"mica-herd-herd: Allocting hash table log for instance %d\\n\",\n  // instance_id);\n  int ht_log_key = MICA_LOG_SHM_KEY + instance_id;\n  kv->ht_log = (uint8_t*)hrd_malloc_socket(ht_log_key, log_cap, node_id);\n}\n\nvoid\nmica_insert_one(struct mica_kv* kv, struct mica_op* op, struct mica_resp* resp)\n{\n#if MICA_DEBUG == 1\n  assert(kv != NULL);\n  assert(op != NULL);\n  assert(op->opcode == MICA_OP_PUT);\n  assert(op->val_len > 0 && op->val_len <= MICA_MAX_VALUE);\n  assert(resp != NULL);\n#endif\n\n  int i;\n  unsigned int bkt = op->key.bkt & kv->bkt_mask;\n  struct mica_bkt* bkt_ptr = &kv->ht_index[bkt];\n  unsigned int tag = op->key.tag;\n\n#if MICA_DEBUG == 2\n  mica_print_op(op);\n#endif\n\n  kv->num_insert_op++;\n\n  /* Find a slot to use for this key. If there is a slot with the same\n   * tag as ours, we are sure to find it because the used slots are at\n   * the beginning of the 8-slot array. 
*/\n  int slot_to_use = -1;\n  for (i = 0; i < 8; i++) {\n    if (bkt_ptr->slots[i].tag == tag || bkt_ptr->slots[i].in_use == 0) {\n      slot_to_use = i;\n    }\n  }\n\n  /* If no slot found, choose one to evict */\n  if (slot_to_use == -1) {\n    slot_to_use = tag & 7; /* tag is ~ randomly distributed */\n    kv->num_index_evictions++;\n  }\n\n  /* Encode the empty slot */\n  bkt_ptr->slots[slot_to_use].in_use = 1;\n  bkt_ptr->slots[slot_to_use].offset = kv->log_head; /* Virtual head */\n  bkt_ptr->slots[slot_to_use].tag = tag;\n\n  /* Paste the key-value into the log */\n  uint8_t* log_ptr = &kv->ht_log[kv->log_head & kv->log_mask];\n\n  /* Data copied: key, opcode, val_len, value */\n  int len_to_copy = sizeof(struct mica_key) + sizeof(uint8_t) +\n                    sizeof(uint8_t) + KVS_VALUE_SIZE;  /// op->val_len;\n\n  /* Ensure that we don't wrap around in the *virtual* log space even\n   * after 8-byte alignment below.*/\n  assert((1ULL << MICA_LOG_BITS) - kv->log_head > len_to_copy + 8);\n\n  memcpy(log_ptr, op, len_to_copy);\n  kv->log_head += len_to_copy;\n\n  /* Ensure that the key field of each log entry is 8-byte aligned. This\n   * makes subsequent comparisons during GETs faster. */\n  kv->log_head = (kv->log_head + 7) & ~7;\n\n  /* If we're close to overflowing in the physical log, wrap around to\n   * the beginning, but go forward in the virtual log. */\n  if (unlikely(kv->log_cap - kv->log_head <= MICA_MAX_VALUE + 32)) {\n    kv->log_head = (kv->log_head + kv->log_cap) & ~kv->log_mask;\n    colored_printf(\n        RED, \"mica-herd-herd: Instance %d wrapping around. 
Wraps = %llu\\n\",\n        kv->instance_id, kv->log_head / kv->log_cap);\n  }\n}\n\n/* A fast deterministic way to generate @n ~randomly distributed 16-byte keys */\nuint128*\nmica_gen_keys(int n)\n{\n  int i;\n  assert(n > 0 && n <= M_1024 / sizeof(uint128));\n  assert(sizeof(uint128) == 16);\n\n  // printf(\"mica-herd-herd: Generating %d keys\\n\", n);\n\n  uint128* key_arr = malloc(n * sizeof(uint128));\n  assert(key_arr != NULL);\n\n  for (i = 0; i < n; i++) {\n    key_arr[i] = CityHash128((char*)&i, 4);\n  }\n\n  return key_arr;\n}\n"
  },
  {
    "path": "src/wings/wings.c",
    "content": "//\n// Created by akatsarakis on 22/01/19.\n//\n\n#include \"../../include/wings/wings.h\"\n#include <config.h>\n#include <infiniband/verbs.h>\n#include <inline-util.h>\n#include <spacetime.h>\n#include <stdio.h>\n\n// implement a Multicast / Unicast channel\n// Support for:\n//      mulitcast / unicast channel\n//      Coalescing\n//      Variable size msgs?\n//      Selective Signaling\n//      Batching to the NIC\n//      Inlining or not\n//      Batch post receives to the NIC\n//          Mode 1: poll reqs, copy incoming msgs to local buffers and\n//          (p)re-post recvs Mode 2: poll reqs, do not copy msgs and post rcvs\n//          when said\n//      Enable implicit (request - response mode) and explicit (batched) credits\n//      flow control\n\nvoid _wings_setup_send_wr_and_sgl(ud_channel_t* ud_c);\nvoid _wings_setup_recv_wr_and_sgl(ud_channel_t* ud_c,\n                                  struct hrd_ud_ctrl_blk* cb);\nvoid _wings_setup_crd_wr_and_sgl(ud_channel_t* ud_c,\n                                 struct hrd_ud_ctrl_blk* cb);\nvoid _wings_setup_incoming_buff_and_post_initial_recvs(ud_channel_t* ud_c);\nvoid _wings_ud_channel_init_recv(ud_channel_t* ud_c, struct hrd_ud_ctrl_blk* cb,\n                                 uint8_t qp_id,\n                                 volatile uint8_t* incoming_reqs_ptr);\n\nvoid _wings_ud_channel_crd_init(ud_channel_t* ud_c, char* qp_name,\n                                ud_channel_t* linked_channel,\n                                uint16_t crds_per_channel,\n                                uint16_t num_channels, uint8_t channel_id,\n                                uint8_t enable_stats, uint8_t enable_prints);\n\nvoid _wings_print_on_off_toggle(uint16_t bin_flag, char* str);\n\nvoid _wings_share_qp_info_via_memcached(ud_channel_t** ud_c_array,\n                                        uint16_t ud_c_num,\n                                        dbit_vector_t* shared_rdy_var,\n                             
           int worker_lid,\n                                        struct hrd_ud_ctrl_blk* cb);\n\nvoid\nwings_ud_channel_destroy(ud_channel_t* ud_c)\n{\n  free(ud_c->qp_name);\n  free(ud_c->recv_wc);\n  free(ud_c->remote_qps);\n  free(ud_c->credits_per_channels);\n\n  if (ud_c->send_pkt_buff != NULL) free(ud_c->send_pkt_buff);\n\n  if (ud_c->type != CRD && ud_c->max_coalescing > 1)\n    free(ud_c->overflow_msg_buff);\n\n  if (ud_c->type == CRD) free(ud_c->no_crds_to_send_per_endpoint);\n}\n\nvoid\nwings_ud_channel_init(ud_channel_t* ud_c, char* qp_name, enum channel_type type,\n                      uint8_t max_coalescing, uint16_t max_req_size,\n                      uint16_t small_req_size, uint8_t enable_inlining,\n                      uint8_t is_header_only,\n                      // Broadcast\n                      uint8_t is_bcast,\n                      // Credits\n                      uint8_t disable_crd_ctrl, uint8_t expl_crd_ctrl,\n                      ud_channel_t* linked_channel, uint16_t crds_per_channel,\n                      uint16_t num_channels, uint8_t channel_id,\n                      // Toggles\n                      uint8_t stats_on, uint8_t prints_on)\n{\n  assert(type != CRD);         // if CRD type then used the *_crd_init instead\n  assert(max_coalescing > 0);  // To disable coalescing use max_coalescing == 1\n  assert(channel_id < num_channels);\n  assert(!(disable_crd_ctrl == 1 &&\n           expl_crd_ctrl == 1));  // cannot disable crd_ctrl and then set an\n                                  // explicit credit control\n  assert(\n      disable_crd_ctrl == 1 ||\n      linked_channel !=\n          NULL);  // cannot disable crd_ctrl and then set an crd control channel\n  assert(is_bcast == 0 || is_header_only == 0);\n  assert(small_req_size <= max_req_size);\n\n  _wings_assert_binary(stats_on);\n  _wings_assert_binary(is_bcast);\n  _wings_assert_binary(prints_on);\n  _wings_assert_binary(expl_crd_ctrl);\n  
_wings_assert_binary(is_header_only);\n  _wings_assert_binary(enable_inlining);\n\n  ud_c->is_header_only = is_header_only;\n  if (ud_c->is_header_only)\n    /// WARNING: hdr_only msgs have an additional 1st B indicating sender_id\n    /// (which must not be taken into account on max_req_size)\n    assert(max_req_size <= 3 * sizeof(uint8_t) && max_coalescing == 1);\n\n  ud_c->type = type;\n  ud_c->channel_id = channel_id;\n  ud_c->num_channels = num_channels;  // num_channels include our own channel\n  ud_c->expl_crd_ctrl = expl_crd_ctrl;\n  ud_c->disable_crd_ctrl = disable_crd_ctrl;\n  ud_c->is_bcast_channel = is_bcast;\n  ud_c->num_crds_per_channel = crds_per_channel;\n  ud_c->channel_providing_crds = linked_channel;\n\n  ud_c->qp_name =\n      malloc(sizeof(char) *\n             (strlen(qp_name) +\n              1));  // TODO make sure to destroy this when destroying a ud_c\n  strcpy(ud_c->qp_name, qp_name);\n\n  ud_c->enable_stats = stats_on;\n  ud_c->enable_prints = prints_on;\n\n  ud_c->max_coalescing = max_coalescing;\n  ud_c->max_msg_size =\n      (uint16_t)(max_req_size + (ud_c->is_header_only == 1\n                                     ? 1\n                                     : 0));  // hdr_only msgs have an additional\n                                             // 1st B indicating sender_id\n  ud_c->small_msg_size =\n      small_req_size == 0 ? ud_c->max_msg_size : small_req_size;\n\n  ud_c->no_crds_to_send_per_endpoint = NULL;  // unused for type != CRD\n\n  uint16_t remote_channels = (uint16_t)(num_channels - 1);\n  ud_c->is_inlining_enabled =\n      (uint8_t)(ud_c->is_header_only == 1 ? 
1 : enable_inlining);\n  if (ud_c->is_header_only == 0 &&\n      _wings_ud_send_max_pkt_size(ud_c) > WINGS_MAX_SUPPORTED_INLINING) {\n    if (ud_c->is_inlining_enabled)\n      printf(\n          \"Unfortunately, inlining for msgs sizes up to (%d) \"\n          \"is higher than the supported (%d)\\n\",\n          _wings_ud_send_max_pkt_size(ud_c), WINGS_MAX_SUPPORTED_INLINING);\n    ud_c->is_inlining_enabled = 0;\n  }\n\n  ud_c->credits_per_channels = malloc(sizeof(uint16_t) * (num_channels));\n  for (int i = 0; i < num_channels; ++i)\n    ud_c->credits_per_channels[i] =\n        (uint16_t)(type == REQ && !disable_crd_ctrl ? crds_per_channel : 0);\n\n  ud_c->max_pcie_bcast_batch =\n      (uint16_t)WINGS_MIN(WINGS_MIN_PCIE_BCAST_BATCH + 1, crds_per_channel);\n  // Warning! use min to avoid resetting the first req prior batching to the NIC\n  // WARNING: todo check why we need to have MIN_PCIE_BCAST_BATCH + 1 instead of\n  // just MIN_PCIE_BCAST_BATCH\n  uint16_t max_msgs_in_pcie_bcast =\n      (uint16_t)(ud_c->max_pcie_bcast_batch *\n                 remote_channels);  // must be smaller than the q_depth\n\n  ud_c->max_recv_wrs = (uint16_t)(crds_per_channel * remote_channels);\n  ud_c->max_send_wrs =\n      (uint16_t)(ud_c->is_bcast_channel ? max_msgs_in_pcie_bcast\n                                        : crds_per_channel * remote_channels);\n\n  ud_c->ss_granularity =\n      ud_c->is_bcast_channel ? ud_c->max_pcie_bcast_batch : ud_c->max_send_wrs;\n\n  ud_c->recv_q_depth = ud_c->max_recv_wrs;\n  ud_c->send_q_depth =\n      (uint16_t)(2 * ud_c->ss_granularity *\n                 (ud_c->is_bcast_channel ? remote_channels : 1));\n\n  ud_c->recv_wc = malloc(sizeof(struct ibv_wc) * ud_c->max_recv_wrs);\n\n  ud_c->recv_pkt_buff_len = ud_c->max_recv_wrs;\n  ud_c->send_pkt_buff_len =\n      (uint16_t)(ud_c->max_send_wrs * (ud_c->is_inlining_enabled ? 1 : 2));\n\n  ud_c->send_pkt_buff =\n      ud_c->is_header_only == 1\n          ? 
NULL\n          : malloc(_wings_ud_send_max_pkt_size(ud_c) * ud_c->send_pkt_buff_len);\n\n  ud_c->overflow_msg_buff = NULL;\n  // Overflow on polling\n  if (ud_c->max_coalescing > 1) {\n    ud_c->num_overflow_msgs = 0;\n    ud_c->enable_overflow_msgs = 1;\n    ud_c->overflow_msg_buff =\n        malloc((size_t)(ud_c->max_msg_size * (ud_c->max_coalescing - 1)));\n  } else {\n    ud_c->num_overflow_msgs = 0;\n    ud_c->enable_overflow_msgs = 0;\n    ud_c->overflow_msg_buff = NULL;\n  }\n\n  ud_c->send_push_ptr = 0;\n  ud_c->recv_push_ptr = 0;\n  ud_c->recv_pull_ptr = -1;\n\n  ud_c->total_pkts_send = 0;\n\n  ud_c->stats.ss_completions = 0;\n  ud_c->stats.recv_total_pkts = 0;\n  ud_c->stats.recv_total_msgs = 0;\n  ud_c->stats.send_total_msgs = 0;\n  ud_c->stats.send_total_pkts = 0;\n  ud_c->stats.send_total_pcie_batches = 0;\n  ud_c->stats.no_stalls_due_to_credits = 0;\n\n  // Initialize the crd channel as well\n  if (ud_c->expl_crd_ctrl) {\n    char crd_qp_name[1000];\n    sprintf(crd_qp_name, \"\\033[1m\\033[36mCRD\\033[0m-%s\", qp_name);\n    _wings_ud_channel_crd_init(linked_channel, crd_qp_name, ud_c,\n                               crds_per_channel, num_channels, channel_id,\n                               stats_on, prints_on);\n  }\n\n  ud_c->remote_qps = malloc(sizeof(qp_info_t) * ud_c->num_channels);\n\n  // The following are set by the *_init_recv function after the creation of\n  // control block and QPs\n  ud_c->qp = NULL;\n  ud_c->pd = NULL;\n  ud_c->qp_id = 0;\n  ud_c->send_cq = NULL;  // set by init_recv\n  ud_c->recv_cq = NULL;  // set by init_recv\n  ud_c->recv_pkt_buff = NULL;\n  ud_c->send_mem_region = NULL;  // set by init_recv\n  //\t_wings_setup_send_wr_and_sgl(ud_c);\n  //\t_wings_setup_recv_wr_and_sgl(ud_c, cb);\n\n  _wings_assert_binary(ud_c->is_header_only);\n  assert(ud_c->max_pcie_bcast_batch <= crds_per_channel);\n  assert(ud_c->is_header_only == 0 || 
ud_c->is_header_only);\n}\n\nvoid\nwings_setup_channel_qps_and_recvs_w_shm_key(ud_channel_t** ud_c_array,\n                                            uint16_t ud_c_num,\n                                            dbit_vector_t* shared_rdy_var,\n                                            uint16_t worker_lid,\n                                            uint16_t base_shm_key)\n{\n  uint32_t dgram_buff_size = 0;\n  int* send_q_depths = malloc(ud_c_num * sizeof(int));\n  int* recv_q_depths = malloc(ud_c_num * sizeof(int));\n\n  // Setup Q depths and buff size for incoming pkts\n  for (int i = 0; i < ud_c_num; ++i) {\n    send_q_depths[i] = ud_c_array[i]->send_q_depth;\n    recv_q_depths[i] = ud_c_array[i]->recv_q_depth;\n    dgram_buff_size +=\n        ud_c_array[i]->type == CRD || ud_c_array[i]->is_header_only == 1\n            ? 64\n            : _wings_ud_recv_max_pkt_size(ud_c_array[i]) *\n                  ud_c_array[i]->recv_q_depth;\n  }\n\n  struct hrd_ud_ctrl_blk* cb =\n      hrd_ud_ctrl_blk_init(worker_lid, 0,\n                           -1,  // local_hid, port_index, numa_node_id,\n                           ud_c_num,\n                           dgram_buff_size,  // num_dgram_qps, dgram_buf_size\n                           base_shm_key + worker_lid,  // key\n                           recv_q_depths,\n                           send_q_depths);  // Depth of the dgram RECV, SEND Q\n\n  for (uint8_t i = 0; i < ud_c_num; ++i)\n    ud_c_array[i]->pd = cb->pd;\n\n  _wings_share_qp_info_via_memcached(ud_c_array, ud_c_num, shared_rdy_var,\n                                     worker_lid, cb);\n\n  volatile uint8_t* incoming_reqs_ptr = cb->dgram_buf;\n  for (uint8_t i = 0; i < ud_c_num; ++i) {\n    // Init recv and setup wrs and sgls of ud_channel\n    _wings_ud_channel_init_recv(ud_c_array[i], cb, (uint8_t)i,\n                                incoming_reqs_ptr);\n    incoming_reqs_ptr +=\n        ud_c_array[i]->type == CRD || ud_c_array[i]->is_header_only == 1\n    
        ? 64\n            : _wings_ud_recv_max_pkt_size(ud_c_array[i]) *\n                  ud_c_array[i]->recv_q_depth;\n  }\n\n  free(send_q_depths);\n  free(recv_q_depths);\n\n  for (int i = 0; i < ud_c_num; ++i)\n    if (ud_c_array[i]->type != CRD) _wings_assertions(ud_c_array[i]);\n\n  sleep(1);  /// Give some leeway to post receives, before start bcasting!\n}\n\nvoid\nwings_setup_channel_qps_and_recvs(ud_channel_t** ud_c_array, uint16_t ud_c_num,\n                                  dbit_vector_t* shared_rdy_var,\n                                  uint16_t worker_lid)\n{\n  wings_setup_channel_qps_and_recvs_w_shm_key(\n      ud_c_array, ud_c_num, shared_rdy_var, worker_lid, BASE_SHM_KEY);\n}\n\nvoid\nwings_print_ud_c_overview(ud_channel_t* ud_c)\n{\n  printf(\"%s Channel[%d] %s(%d) --> %s\\n\",\n         ud_c->is_bcast_channel ? \"Bcast\" : \"Unicast\", ud_c->channel_id,\n         ud_c->qp_name, ud_c->qp_id, ud_c->type == REQ ? \"REQ\" : \"RESP\");\n\n  _wings_print_on_off_toggle(ud_c->is_inlining_enabled, \"Inlining\");\n  _wings_print_on_off_toggle(ud_c->max_coalescing, \"Coalescing\");\n  _wings_print_on_off_toggle(ud_c->max_pcie_bcast_batch, \"Max PCIe batch\");\n\n  printf(\"\\t\\tMax msg size: %dB\\n\", ud_c->max_msg_size);\n  if (ud_c->type != CRD && !ud_c->is_header_only)\n    printf(\"\\t\\tMax pkt size: send = %dB, recv = %dB\\n\",\n           _wings_ud_send_max_pkt_size(ud_c),\n           _wings_ud_recv_max_pkt_size(ud_c));\n  else\n    printf(\n        \"\\t\\tMax pkt size: send = 4B (inlined_payload), recv = \"\n        \"4B(inlined_payload)\\n\");\n  printf(\"\\t\\tSS granularity: %d\\n\", ud_c->ss_granularity);\n\n  printf(\"\\t\\tNum remotes: %d\\n\", ud_c->num_channels - 1);\n  if (ud_c->disable_crd_ctrl)\n    printf(\"\\t\\tCredits: OFF \\n\");\n  else\n    printf(\"\\t\\tCredits: %d (%s) --> %s (%d)\\n\", ud_c->num_crds_per_channel,\n           ud_c->expl_crd_ctrl ? 
\"Explicit\" : \"Implicit\",\n           ud_c->channel_providing_crds->qp_name,\n           ud_c->channel_providing_crds->qp_id);\n\n  printf(\"\\t\\tSend Q len: %d\\n\", ud_c->send_q_depth);\n  printf(\"\\t\\tRecv Q len: %d\\n\", ud_c->recv_q_depth);\n\n  printf(\"\\t\\tSend wr len: %d\\n\", ud_c->max_send_wrs);\n  printf(\"\\t\\tRecv wr len: %d\\n\", ud_c->max_recv_wrs);\n\n  printf(\"\\t\\tSend pkt len: %d\\n\", ud_c->send_pkt_buff_len);\n  printf(\"\\t\\tRecv pkt len: %d\\n\", ud_c->recv_pkt_buff_len);\n\n  _wings_print_on_off_toggle(ud_c->enable_stats, \"Stats\");\n  _wings_print_on_off_toggle(ud_c->enable_prints, \"Prints\");\n}\n\n/* ---------------------------------------------------------------------------\n----------------------------------- SETUPs ------------------------------------\n---------------------------------------------------------------------------*/\nvoid\n_wings_print_on_off_toggle(uint16_t bin_flag, char* str)\n{\n  if (bin_flag > 1)\n    printf(\"\\t\\t%s : %s (%d)\\n\", str, \"\\033[1m\\033[32mOn\\033[0m\", bin_flag);\n  else\n    printf(\"\\t\\t%s : %s\\n\", str,\n           bin_flag ? 
\"\\033[1m\\033[32mOn\\033[0m\" : \"\\033[31mOff\\033[0m\");\n}\n\nvoid\n_wings_ud_channel_crd_init(ud_channel_t* ud_c, char* qp_name,\n                           ud_channel_t* linked_channel,\n                           uint16_t crds_per_channel, uint16_t num_channels,\n                           uint8_t channel_id, uint8_t enable_stats,\n                           uint8_t enable_prints)\n{\n  assert(channel_id < num_channels);\n\n  _wings_assert_binary(enable_stats);\n  _wings_assert_binary(enable_prints);\n\n  ud_c->type = CRD;\n  ud_c->qp_name =\n      malloc(sizeof(char) *\n             (strlen(qp_name) +\n              1));  // TODO make sure to destroy this when destroing a crd_ud_c\n  strcpy(ud_c->qp_name, qp_name);\n\n  ud_c->channel_id = channel_id;\n  ud_c->num_channels = num_channels;  // num_channels include our own channel\n  ud_c->expl_crd_ctrl = 1;\n  ud_c->disable_crd_ctrl = 0;\n  ud_c->is_bcast_channel = 0;\n  ud_c->max_pcie_bcast_batch = 0;\n  ud_c->num_crds_per_channel = crds_per_channel;\n  ud_c->channel_providing_crds = linked_channel;\n\n  ud_c->enable_stats = enable_stats;\n  ud_c->enable_prints = enable_prints;\n\n  static_assert(sizeof(wings_crd_t) <= 4,\n                \"\");         // Credits are always send as inlined_payload <=4B\n  ud_c->max_msg_size = 0;    // non inlined_payload size\n  ud_c->small_msg_size = 0;  // non inlined_payload size\n  ud_c->max_coalescing = 1;\n\n  ud_c->no_crds_to_send_per_endpoint = malloc(sizeof(uint16_t) * num_channels);\n\n  uint16_t remote_channels = (uint16_t)(num_channels - 1);\n  ud_c->is_inlining_enabled = 1;\n\n  ud_c->credits_per_channels = malloc(sizeof(uint16_t) * (num_channels));\n  for (int i = 0; i < num_channels; ++i)\n    ud_c->credits_per_channels[i] = 0;\n\n  ud_c->max_recv_wrs = crds_per_channel * remote_channels;\n  ud_c->max_send_wrs = crds_per_channel * remote_channels;  // TODO correct this\n\n  ud_c->ss_granularity = ud_c->max_send_wrs;\n\n  ud_c->recv_q_depth = 
ud_c->max_recv_wrs;\n  ud_c->send_q_depth = (uint16_t)(2 * ud_c->ss_granularity);\n\n  ud_c->recv_wc = malloc(sizeof(struct ibv_wc) * ud_c->max_recv_wrs);\n\n  ud_c->recv_pkt_buff_len =\n      ud_c->max_recv_wrs * ud_c->max_coalescing;  // TODO: is this correct?\n  ud_c->send_pkt_buff_len = ud_c->max_send_wrs;\n\n  ud_c->send_pkt_buff = NULL;  // malloc(_wings_ud_send_max_pkt_size(ud_c) *\n                               // ud_c->send_pkt_buff_len);\n\n  ud_c->send_mem_region = NULL;\n\n  ud_c->send_push_ptr = 0;\n  ud_c->recv_push_ptr = 0;\n  ud_c->recv_pull_ptr = -1;\n\n  ud_c->total_pkts_send = 0;\n\n  ud_c->stats.ss_completions = 0;\n  ud_c->stats.recv_total_pkts = 0;\n  ud_c->stats.recv_total_msgs = 0;\n  ud_c->stats.send_total_msgs = 0;\n  ud_c->stats.send_total_pkts = 0;\n  ud_c->stats.send_total_pcie_batches = 0;\n  ud_c->stats.no_stalls_due_to_credits = 0;\n\n  ud_c->remote_qps = malloc(sizeof(qp_info_t) * ud_c->num_channels);\n  // The following are set by the *_init_recv function after the creation of\n  // control block and QPs\n  ud_c->qp = NULL;\n  ud_c->pd = NULL;\n  ud_c->qp_id = 0;\n  ud_c->send_cq = NULL;\n  ud_c->recv_cq = NULL;\n  ud_c->recv_pkt_buff = NULL;\n  //\t_wings_setup_crd_wr_and_sgl(ud_c, cb);\n}\n\nvoid\n_wings_ud_channel_init_recv(ud_channel_t* ud_c, struct hrd_ud_ctrl_blk* cb,\n                            uint8_t qp_id, volatile uint8_t* incoming_reqs_ptr)\n{\n  ud_c->qp_id = qp_id;\n  ud_c->qp = cb->dgram_qp[qp_id];\n\n  ud_c->recv_pkt_buff = incoming_reqs_ptr;\n\n  ud_c->send_cq = cb->dgram_send_cq[ud_c->qp_id];\n  ud_c->recv_cq = cb->dgram_recv_cq[ud_c->qp_id];\n\n  if (ud_c->type != CRD) {\n    ud_c->send_mem_region =\n        ud_c->is_inlining_enabled\n            ? 
NULL\n            : register_buffer(\n                  cb->pd, ud_c->send_pkt_buff,\n                  _wings_ud_send_max_pkt_size(ud_c) * ud_c->send_pkt_buff_len);\n    _wings_setup_send_wr_and_sgl(ud_c);\n    _wings_setup_recv_wr_and_sgl(ud_c, cb);\n  } else\n    _wings_setup_crd_wr_and_sgl(ud_c, cb);\n\n  // post initial receives\n  /// WARNING try to avoid races of posting initial receives and sending msgs\n  _wings_setup_incoming_buff_and_post_initial_recvs(ud_c);\n}\n\nvoid\n_wings_setup_crd_wr_and_sgl(ud_channel_t* ud_c, struct hrd_ud_ctrl_blk* cb)\n{\n  assert(ud_c->type == CRD);\n\n  // Credit Send WRs / sgl\n  wings_crd_t crd_tmp;\n  crd_tmp.crd_num = 0;\n  crd_tmp.sender_id = (uint8_t)ud_c->channel_id;\n\n  ud_c->send_sgl = malloc(sizeof(struct ibv_sge));\n  ud_c->send_sgl->length = 0;\n\n  ud_c->send_wr = malloc(sizeof(struct ibv_send_wr) * ud_c->max_send_wrs);\n  for (int i = 0; i < ud_c->max_send_wrs; ++i) {\n    ud_c->send_wr[i].opcode = IBV_WR_SEND_WITH_IMM;\n    ud_c->send_wr[i].num_sge = 0;\n    ud_c->send_wr[i].sg_list = ud_c->send_sgl;\n    ud_c->send_wr[i].wr.ud.remote_qkey = HRD_DEFAULT_QKEY;\n    ud_c->send_wr[i].next = NULL;\n    ud_c->send_wr[i].send_flags = IBV_SEND_INLINE;\n    ud_c->send_wr[i].imm_data = 0;\n    memcpy(&ud_c->send_wr[i].imm_data, &crd_tmp, sizeof(wings_crd_t));\n  }\n\n  // Credit Recv WRs / sgl\n  ud_c->recv_sgl = malloc(sizeof(struct ibv_sge));\n  ud_c->recv_sgl->length = 64;  // TODO can we make this zero?\n  ud_c->recv_sgl->lkey = cb->dgram_buf_mr->lkey;\n  ud_c->recv_sgl->addr = (uint64_t)ud_c->recv_pkt_buff;\n\n  ud_c->recv_wr = malloc(sizeof(struct ibv_recv_wr) * ud_c->max_recv_wrs);\n  for (int i = 0; i < ud_c->max_recv_wrs; ++i) {\n    ud_c->recv_wr[i].num_sge = 1;\n    ud_c->recv_wr[i].sg_list = ud_c->recv_sgl;\n  }\n}\n\nvoid\n_wings_setup_send_wr_and_sgl(ud_channel_t* ud_c)\n{\n  assert(ud_c->type != CRD);\n\n  wings_hdr_only_t hdr_only_tmp;\n  hdr_only_tmp.sender_id = (uint8_t)ud_c->channel_id;\n  
memset(hdr_only_tmp.inlined_payload, 0, 3 * sizeof(uint8_t));\n\n  if (ud_c->is_bcast_channel) {  // Send bcast WRs\n\n    uint16_t remote_channels = (uint16_t)(ud_c->num_channels - 1);\n    uint16_t max_msgs_in_pcie_batch =\n        (uint16_t)(ud_c->max_pcie_bcast_batch * remote_channels);\n    ud_c->send_wr = malloc(sizeof(struct ibv_send_wr) * max_msgs_in_pcie_batch);\n    ud_c->send_sgl =\n        malloc(sizeof(struct ibv_sge) *\n               (ud_c->is_header_only == 1 ? 1 : ud_c->max_pcie_bcast_batch));\n\n    if (ud_c->is_header_only)\n      ud_c->send_sgl->length = 0;\n    else\n      for (int i = 0; i < ud_c->max_pcie_bcast_batch; ++i)\n        ud_c->send_sgl[i].length = _wings_ud_send_max_pkt_size(ud_c);\n\n    for (int i = 0; i < max_msgs_in_pcie_batch; ++i) {\n      int sgl_index = i / remote_channels;\n      int i_mod_bcast = i % remote_channels;\n\n      uint16_t rm_qp_id;\n      if (i_mod_bcast < ud_c->channel_id)\n        rm_qp_id = (uint16_t)i_mod_bcast;\n      else\n        rm_qp_id = (uint16_t)((i_mod_bcast + 1) % ud_c->num_channels);\n\n      ud_c->send_wr[i].wr.ud.remote_qkey = HRD_DEFAULT_QKEY;\n      ud_c->send_wr[i].wr.ud.ah = ud_c->remote_qps[rm_qp_id].ah;\n      ud_c->send_wr[i].wr.ud.remote_qpn = ud_c->remote_qps[rm_qp_id].qpn;\n\n      if (!ud_c->is_header_only) {\n        ud_c->send_wr[i].num_sge = 1;\n        ud_c->send_wr[i].opcode =\n            IBV_WR_SEND;  /// Attention!! 
there is no immediate here\n        ud_c->send_wr[i].sg_list = &ud_c->send_sgl[sgl_index];\n\n      } else {\n        ud_c->send_wr[i].next = NULL;\n        ud_c->send_wr[i].imm_data = 0;\n        ud_c->send_wr[i].num_sge = 0;\n        ud_c->send_wr[i].sg_list = ud_c->send_sgl;\n        ud_c->send_wr[i].opcode = IBV_WR_SEND_WITH_IMM;\n        memcpy(&ud_c->send_wr[i].imm_data, &hdr_only_tmp,\n               sizeof(wings_hdr_only_t));\n      }\n\n      if (!ud_c->is_inlining_enabled) {\n        ud_c->send_wr[i].send_flags = 0;\n        ud_c->send_sgl[sgl_index].lkey = ud_c->send_mem_region->lkey;\n      } else\n        ud_c->send_wr[i].send_flags = IBV_SEND_INLINE;\n\n      ud_c->send_wr[i].next =\n          (i_mod_bcast == remote_channels - 1) ? NULL : &ud_c->send_wr[i + 1];\n    }\n\n  } else {  // Send unicast WRs\n\n    ud_c->send_wr = malloc(sizeof(struct ibv_send_wr) * ud_c->max_send_wrs);\n    ud_c->send_sgl = malloc(sizeof(struct ibv_sge) *\n                            (ud_c->is_header_only ? 1 : ud_c->max_send_wrs));\n    for (int i = 0; i < ud_c->max_send_wrs; ++i) {\n      ud_c->send_wr[i].wr.ud.remote_qkey = HRD_DEFAULT_QKEY;\n\n      if (!ud_c->is_header_only) {\n        //\t\t\t\tud_c->send_sgl[i].length =\n        // sizeof(wings_pkt_t) + _wings_ud_recv_max_pkt_size(ud_c);\n        ud_c->send_sgl[i].length = _wings_ud_send_max_pkt_size(ud_c);\n        ud_c->send_wr[i].num_sge = 1;\n        ud_c->send_wr[i].opcode =\n            IBV_WR_SEND;  /// Attention!! 
there is no immediate here\n        ud_c->send_wr[i].sg_list = &ud_c->send_sgl[i];\n\n      } else {\n        ud_c->send_sgl->length = 0;\n        ud_c->send_wr[i].next = NULL;\n        ud_c->send_wr[i].imm_data = 0;\n        ud_c->send_wr[i].num_sge = 0;\n        ud_c->send_wr[i].sg_list = ud_c->send_sgl;\n        ud_c->send_wr[i].opcode = IBV_WR_SEND_WITH_IMM;\n        memcpy(&ud_c->send_wr[i].imm_data, &hdr_only_tmp,\n               sizeof(wings_hdr_only_t));\n      }\n\n      if (!ud_c->is_inlining_enabled) {\n        ud_c->send_wr[i].send_flags = 0;\n        ud_c->send_sgl[i].lkey = ud_c->send_mem_region->lkey;\n      } else\n        ud_c->send_wr[i].send_flags = IBV_SEND_INLINE;\n    }\n  }\n}\n\nvoid\n_wings_setup_recv_wr_and_sgl(ud_channel_t* ud_c, struct hrd_ud_ctrl_blk* cb)\n{\n  assert(ud_c->type != CRD);\n\n  ud_c->recv_sgl = malloc(sizeof(struct ibv_sge) *\n                          (ud_c->is_header_only == 1 ? 1 : ud_c->max_recv_wrs));\n\n  if (ud_c->is_header_only) {\n    ud_c->recv_sgl->length = 64;  // TODO can we make this zero?\n    ud_c->recv_sgl->lkey = cb->dgram_buf_mr->lkey;\n    ud_c->recv_sgl->addr = (uint64_t)ud_c->recv_pkt_buff;\n  }\n\n  ud_c->recv_wr = malloc(sizeof(struct ibv_recv_wr) * ud_c->max_recv_wrs);\n  for (int i = 0; i < ud_c->max_recv_wrs; i++) {\n    if (!ud_c->is_header_only) {\n      ud_c->recv_sgl[i].lkey = cb->dgram_buf_mr->lkey;\n      ud_c->recv_sgl[i].length = _wings_ud_recv_max_pkt_size(ud_c);\n    }\n\n    ud_c->recv_wr[i].num_sge = 1;\n    ud_c->recv_wr[i].next =\n        (i == ud_c->max_recv_wrs - 1) ? NULL : &ud_c->recv_wr[i + 1];\n    ud_c->recv_wr[i].sg_list =\n        ud_c->is_header_only == 1 ? 
ud_c->recv_sgl : &ud_c->recv_sgl[i];\n  }\n}\n\nvoid\n_wings_setup_incoming_buff_and_post_initial_recvs(ud_channel_t* ud_c)\n{\n  if (ud_c->is_header_only == 0 && ud_c->type != CRD) {\n    // init recv buffs as empty (not need for CRD since CRD msgs are\n    // --inlined_payload-- header-only)\n    for (uint16_t i = 0; i < ud_c->send_pkt_buff_len; ++i)\n      _wings_get_nth_pkt_ptr_from_send_buff(ud_c, i)->req_num = 0;\n    for (uint16_t i = 0; i < ud_c->recv_pkt_buff_len; ++i)\n      _wings_get_nth_pkt_ptr_from_recv_buff(ud_c, i)->pkt.req_num = 0;\n  }\n\n  if (WINGS_ENABLE_POST_RECV_PRINTS && ud_c->enable_prints)\n    colored_printf(YELLOW, \"vvv Post Initial Receives: %s %d\\n\", ud_c->qp_name,\n                   ud_c->max_recv_wrs);\n\n  if (ud_c->is_header_only == 0 && ud_c->type != CRD)\n    _wings_post_recvs(ud_c, ud_c->max_recv_wrs);\n  else\n    _wings_post_hdr_only_recvs(ud_c, ud_c->max_recv_wrs);\n}\n\n/* ---------------------------------------------------------------------------\n   -------------------------------- QP Sharing -------------------------------\n   ---------------------------------------------------------------------------\n */\nunsigned long\n_wings_simple_hash(unsigned char* str)\n{\n  int c;\n  unsigned long hash = 5381;\n\n  while (c = *str++)\n    hash = ((hash << 5) + hash) + c;  // hash * 33 + c\n  return hash;\n}\n\nvoid\n_wings_get_remote_qp(ud_channel_t* ud_c, uint8_t endpoint_id)\n{\n  int ib_port_index = 0;\n  int local_port_i = ib_port_index;\n  char qp_global_name[HRD_QP_NAME_SIZE];\n  struct hrd_qp_attr*\n      qp;  //= malloc(sizeof(struct hrd_qp_attr*) * max_remote_channels);\n  sprintf(qp_global_name, \"%lu-%d\",\n          _wings_simple_hash((unsigned char*)ud_c->qp_name), endpoint_id);\n  // Get the UD queue pair for the ith machine\n  qp = NULL;\n  //\tyellow_printf(\"Looking for %s\\n\", qp_global_name);\n  while (qp == NULL) {\n    qp = hrd_get_published_qp(qp_global_name);\n\n    if (qp == NULL) usleep(200000);\n  
}\n  //\tgreen_printf(\"Found %s\\n\", qp_global_name);\n\n  struct ibv_ah_attr ah_attr = {\n      //-----INFINIBAND----------\n      .is_global = 0,\n      .dlid = (uint16_t)qp->lid,\n      .sl = (uint8_t)qp->sl,\n      .src_path_bits = 0,\n      /* port_num (> 1): device-local port for responses to this worker */\n      .port_num = (uint8_t)(local_port_i + 1),\n  };\n\n  if (is_roce == 1) {\n    //-----RoCE----------\n    ah_attr.is_global = 1;\n    ah_attr.dlid = 0;\n    ah_attr.grh.dgid.global.interface_id = qp->gid_global_interface_id;\n    ah_attr.grh.dgid.global.subnet_prefix = qp->gid_global_subnet_prefix;\n    ah_attr.grh.sgid_index = 0;\n    ah_attr.grh.hop_limit = 1;\n  }\n\n  ud_c->remote_qps[endpoint_id].qpn = (uint32_t)qp->qpn;\n  ud_c->remote_qps[endpoint_id].ah = ibv_create_ah(ud_c->pd, &ah_attr);\n  assert(ud_c->remote_qps[endpoint_id].ah != NULL);\n}\n\nvoid\n_wings_get_remote_qps(ud_channel_t** ud_c_array, uint16_t ud_c_num)\n{\n  uint16_t max_remote_channels = 0;\n  for (int i = 0; i < ud_c_num; ++i)\n    if (ud_c_array[i]->num_channels > max_remote_channels)\n      max_remote_channels = ud_c_array[i]->num_channels;\n\n  for (int i = 0; i < ud_c_num; ++i)\n    for (int j = 0; j < ud_c_array[i]->num_channels; ++j) {\n      if (j == ud_c_array[i]->channel_id)\n        continue;  // skip the local channel id\n      _wings_get_remote_qp(ud_c_array[i], (uint8_t)j);\n    }\n}\n\nvoid\n_wings_share_qp_info_via_memcached(ud_channel_t** ud_c_array, uint16_t ud_c_num,\n                                   dbit_vector_t* shared_rdy_var,\n                                   int worker_lid, struct hrd_ud_ctrl_blk* cb)\n{\n  for (int i = 0; i < ud_c_num; i++) {\n    char qp_global_name[HRD_QP_NAME_SIZE];\n    sprintf(qp_global_name, \"%lu-%d\",\n            _wings_simple_hash((unsigned char*)ud_c_array[i]->qp_name),\n            ud_c_array[i]->channel_id);\n    hrd_publish_dgram_qp(cb, i, qp_global_name, WORKER_SL);\n    //\t\tyellow_printf(\"Publishing: %s 
(qpname: %s)\\n\",  qp_global_name,\n    // ud_c_array[i]->qp_name);\n  }\n\n  _wings_get_remote_qps(ud_c_array, ud_c_num);\n  if (shared_rdy_var == NULL) {\n    assert(worker_lid == 0);\n    return;\n  }\n\n  assert(dbv_bit_get(*shared_rdy_var, worker_lid) == 0);\n  dbv_bit_set(shared_rdy_var, (uint8_t)worker_lid);\n\n  // WARNING (global) shared_rdy_var which is used as a g_share_qs_barrier must\n  // be len of num_workers + 1\n  while (!dbv_is_all_set(*shared_rdy_var))\n    usleep(20000);\n\n  assert(dbv_is_all_set(*shared_rdy_var));\n}\n\nvoid\nwings_reconfigure_wrs_ah(ud_channel_t* ud_c, uint8_t endpoint_id)\n{\n  _wings_get_remote_qp(ud_c, endpoint_id);\n  if (!ud_c->disable_crd_ctrl)\n    _wings_get_remote_qp(ud_c->channel_providing_crds, endpoint_id);\n\n  /// TODO WARNING: this is untested and assumes that we always send to everyone\n  if (ud_c->is_bcast_channel) {\n    uint16_t remote_channels = (uint16_t)(ud_c->num_channels - 1);\n    uint16_t max_msgs_in_pcie_batch =\n        (uint16_t)(ud_c->max_pcie_bcast_batch * remote_channels);\n    for (int i = 0; i < max_msgs_in_pcie_batch; ++i) {\n      int i_mod_bcast = i % remote_channels;\n\n      uint16_t rm_qp_id;\n      if (i_mod_bcast < ud_c->channel_id)\n        rm_qp_id = (uint16_t)i_mod_bcast;\n      else\n        rm_qp_id = (uint16_t)((i_mod_bcast + 1) % ud_c->num_channels);\n\n      ud_c->send_wr[i].wr.ud.ah = ud_c->remote_qps[rm_qp_id].ah;\n      ud_c->send_wr[i].wr.ud.remote_qpn = ud_c->remote_qps[rm_qp_id].qpn;\n    }\n  }\n}\n"
  },
  {
    "path": "tla/Hermes.tla",
    "content": "------------------------------- MODULE Hermes -------------------------------\nEXTENDS     Integers,\n            FiniteSets\n\nCONSTANTS   H_NODES,\n            H_MAX_VERSION\n            \nVARIABLES   msgs,\n            nodeTS,\n            nodeState, \n            nodeRcvedAcks,\n            nodeLastWriter,\n            nodeLastWriteTS,\n            nodeWriteEpochID, \n            aliveNodes,\n            epochID \n            \n\\* all Hermes (+ environment) variables\nhvars == << msgs, nodeTS, nodeState, nodeRcvedAcks, nodeLastWriter, \n            nodeLastWriteTS, nodeWriteEpochID, aliveNodes, epochID >>\n\n-------------------------------------------------------------------------------------\nHMessage ==  \\* Messages exchanged by the Protocol   \n    [type: {\"INV\", \"ACK\"}, sender    : H_NODES,\n                           epochID   : 0..(Cardinality(H_NODES) - 1),\n                           version   : 0..H_MAX_VERSION,  \n                           tieBreaker: H_NODES] \n    \\* Note that we need not send Value w/ INVs, timestamp suffice to check consistency\n        \\union\n\n    [type: {\"VAL\"},        \\* optimization: epochID is not required for VALs\n                           \\* epochID   : 0..(Cardinality(H_NODES) - 1),\n                           version   : 0..H_MAX_VERSION, \n                           tieBreaker: H_NODES] \n\nHTypeOK ==  \\* The type correctness invariant\n    /\\  msgs            \\subseteq HMessage\n    /\\ \\A n \\in H_NODES: nodeRcvedAcks[n] \\subseteq (H_NODES \\ {n})\n    /\\  nodeLastWriter  \\in [H_NODES -> H_NODES]\n    /\\  nodeLastWriteTS \\in [H_NODES -> [version   : 0..H_MAX_VERSION,\n                                       tieBreaker: H_NODES         ]]\n    /\\  nodeTS          \\in [H_NODES -> [version   : 0..H_MAX_VERSION,\n                                       tieBreaker: H_NODES         ]]\n    /\\  nodeState       \\in [H_NODES -> {\"valid\", \"invalid\", \"invalid_write\", \n            
                             \"write\", \"replay\"}]\n    \\*  membership and epoch id related\n    /\\  aliveNodes      \\subseteq H_NODES\n    /\\  epochID         \\in 0..(Cardinality(H_NODES) - 1)\n    /\\  nodeWriteEpochID \\in [H_NODES -> 0..(Cardinality(H_NODES) - 1)]\n                                              \n\n\\* The consistent invariant: all alive nodes in valid state should have the same value / TS         \nHConsistent == \n    \\A k,s \\in aliveNodes:  \\/ nodeState[k] /= \"valid\"\n                            \\/ nodeState[s] /= \"valid\" \n                            \\/ nodeTS[k] = nodeTS[s]\n                                              \nHInit == \\* The initial predicate\n    /\\  msgs            = {}\n    \\*  membership and epoch id related\n    /\\  epochID         = 0\n    /\\  aliveNodes      = H_NODES\n    /\\  nodeWriteEpochID = [n \\in H_NODES |-> 0]\n    \\*  Init rest per node replica metadata\n    /\\  nodeRcvedAcks   = [n \\in H_NODES |-> {}]\n    /\\  nodeState       = [n \\in H_NODES |-> \"valid\"]\n    /\\  nodeLastWriter  = [n \\in H_NODES |-> CHOOSE k \\in H_NODES:\n                                             \\A m \\in H_NODES: k <= m]\n    /\\  nodeTS          = [n \\in H_NODES |-> [version |-> 0, \n                                              tieBreaker |-> \n                                              CHOOSE k \\in H_NODES: \n                                               \\A m \\in H_NODES: k <= m]]\n    /\\  nodeLastWriteTS = [n \\in H_NODES |-> [version |-> 0, \n                                              tieBreaker |-> \n                                              CHOOSE k \\in H_NODES: \n                                               \\A m \\in H_NODES: k <= m]]\n                                               \n-------------------------------------------------------------------------------------\n\n\\* A buffer maintaining all network messages. 
Messages are only appended to this variable (not \n\\* removed once delivered) intentionally to check protocols tolerance in dublicates and reorderings \nsend(m) == msgs' = msgs \\union {m}\n\n\\* Check if all acknowledgments for a write have been received                                                  \nreceivedAllAcks(n) == (aliveNodes \\ {n}) \\subseteq nodeRcvedAcks[n]\n        \nequalTS(v1,tb1,v2,tb2) ==  \\* Timestamp equality\n    /\\ v1 = v2\n    /\\ tb1 = tb2\n\ngreaterTS(v1,tb1,v2,tb2) == \\* Timestamp comparison\n    \\/ v1 > v2\n    \\/ /\\   v1 = v2\n       /\\  tb1 > tb2\n       \nisAlive(n) == n \\in aliveNodes\n                   \nnodeFailure(n) == \\* Emulate a node failure\n\\*    Make sure that there are atleast 3 alive nodes before killing a node\n    /\\ Cardinality(aliveNodes) > 2\n    /\\ nodeRcvedAcks' = [k \\in H_NODES |-> {}]\n    /\\ aliveNodes'    = aliveNodes \\ {n}\n    /\\ epochID'       = epochID + 1\n    /\\ UNCHANGED <<msgs, nodeState, nodeTS, nodeLastWriter, \n                   nodeLastWriteTS, nodeWriteEpochID>>\n\nh_upd_not_aliveNodes ==\n    /\\  UNCHANGED <<aliveNodes, epochID, nodeWriteEpochID>>\n    \n    \nh_upd_aliveNodes ==\n    /\\ UNCHANGED <<msgs, nodeState, nodeTS, nodeLastWriter, nodeLastWriteTS, nodeRcvedAcks>>\n                   \nh_upd_nothing ==                    \n    /\\ h_upd_not_aliveNodes\n    /\\ h_upd_aliveNodes\n    \n-------------------------------------------------------------------------------------\n\nh_upd_state(n, newVersion, newTieBreaker, newState, newAcks) == \n    /\\  nodeLastWriter'   = [nodeLastWriter  EXCEPT ![n] = n]\n    /\\  nodeRcvedAcks'    = [nodeRcvedAcks   EXCEPT ![n] = newAcks]\n    /\\  nodeState'        = [nodeState       EXCEPT ![n] = newState]\n    /\\  nodeWriteEpochID' = [nodeWriteEpochID EXCEPT ![n] = epochID] \\* we always use the latest epochID\n    /\\  nodeTS'           = [nodeTS          EXCEPT ![n].version    = newVersion, \n                                        
            ![n].tieBreaker = newTieBreaker]\n    /\\  nodeLastWriteTS'  = [nodeLastWriteTS EXCEPT ![n].version    = newVersion, \n                                                    ![n].tieBreaker = newTieBreaker]\n                                            \nh_send_inv_or_ack(n, newVersion, newTieBreaker, msgType) ==  \n    /\\  send([type        |-> msgType,\n              epochID     |-> epochID, \\* we always use the latest epochID\n              sender      |-> n,\n              version     |-> newVersion, \n              tieBreaker  |-> newTieBreaker])              \n\nh_actions_for_upd(n, newVersion, newTieBreaker, newState, newAcks) == \\* Execute a write\n    /\\  h_upd_state(n, newVersion, newTieBreaker, newState, newAcks)\n    /\\  h_send_inv_or_ack(n, newVersion, newTieBreaker, \"INV\")\n    /\\  UNCHANGED <<aliveNodes, epochID>>\n \n\nh_actions_for_upd_replay(n, acks) == \\* Apply a write-replay using same TS (version, tie-breaker) \n                                     \\* and either reset acks or keep already gathered acks\n    /\\  h_actions_for_upd(n, nodeTS[n].version, nodeTS[n].tieBreaker, \"replay\", acks)\n\n-------------------------------------------------------------------------------------\n\nHRead(n) ==  \\* Execute a read\n    /\\ nodeState[n] = \"valid\"\n    /\\ h_upd_nothing\n              \nHWrite(n) == \\* Execute a write\n\\*    /\\  nodeState[n]      \\in {\"valid\", \"invalid\"} \n    \\* writes in invalid state are also supported as an optimization\n    /\\  nodeState[n]      \\in {\"valid\"}\n    /\\  nodeTS[n].version < H_MAX_VERSION \\* Only to configurably terminate the model checking \n    /\\  h_actions_for_upd(n, nodeTS[n].version + 1, n, \"write\", {})\n\n\nHCoordWriteReplay(n) == \\* Execute a write-replay after a membership re-config\n    /\\  nodeState[n] \\in {\"write\", \"replay\"}\n    /\\  nodeWriteEpochID[n] < epochID\n    /\\  ~receivedAllAcks(n) \\* optimization to not replay when we have gathered acks from 
all alive\n    /\\  h_actions_for_upd_replay(n, nodeRcvedAcks[n])\n\n\nHRcvAck(n) ==   \\* Process a received acknowledgment\n    \\E m \\in msgs: \n        /\\ m.type     = \"ACK\"\n        /\\ m.epochID  = epochID\n        /\\ m.sender  /= n\n        /\\ m.sender  \\notin nodeRcvedAcks[n]\n        /\\ equalTS(m.version, m.tieBreaker,\n                   nodeLastWriteTS[n].version, \n                   nodeLastWriteTS[n].tieBreaker)\n        /\\ nodeState[n] \\in {\"write\", \"invalid_write\", \"replay\"}\n        /\\ nodeRcvedAcks' = [nodeRcvedAcks EXCEPT ![n] = \n                                              nodeRcvedAcks[n] \\union {m.sender}]\n        /\\ UNCHANGED <<msgs, nodeLastWriter, nodeLastWriteTS, \n                       aliveNodes, nodeTS, nodeState, epochID, nodeWriteEpochID>>\n\n\nHSendVals(n) == \\* Send validations once acknowledgments are received from all alive nodes\n    /\\ nodeState[n] \\in {\"write\", \"replay\"}\n    /\\ receivedAllAcks(n)\n    /\\ nodeState'         = [nodeState EXCEPT![n] = \"valid\"]\n    /\\ send([type        |-> \"VAL\", \n             version     |-> nodeTS[n].version, \n             tieBreaker  |-> nodeTS[n].tieBreaker])\n    /\\ UNCHANGED <<nodeTS, nodeLastWriter, nodeLastWriteTS,\n                   aliveNodes, nodeRcvedAcks, epochID, nodeWriteEpochID>>\n \nHCoordinatorActions(n) ==   \\* Actions of a read/write coordinator \n    \\/ HRead(n)          \n    \\/ HCoordWriteReplay(n) \\* After failures\n    \\/ HWrite(n)         \n    \\/ HRcvAck(n)\n    \\/ HSendVals(n) \n\n-------------------------------------------------------------------------------------               \n    \nHRcvInv(n) ==  \\* Process a received invalidation\n    \\E m \\in msgs: \n        /\\ m.type     = \"INV\"\n        /\\ m.epochID  = epochID\n        /\\ m.sender  /= n\n        \\* always acknowledge a received invalidation (irrelevant to the timestamp)\n        /\\ send([type       |-> \"ACK\",\n                 sender     |-> n,   \n     
            epochID    |-> epochID,\n                 version    |-> m.version,\n                 tieBreaker |-> m.tieBreaker])\n        /\\ IF greaterTS(m.version, m.tieBreaker,\n                        nodeTS[n].version, nodeTS[n].tieBreaker)\n           THEN   /\\ nodeLastWriter' = [nodeLastWriter EXCEPT ![n] = m.sender]\n                  /\\ nodeTS' = [nodeTS EXCEPT ![n].version    = m.version,\n                                          ![n].tieBreaker = m.tieBreaker]\n                  /\\ IF nodeState[n] \\in {\"valid\", \"invalid\", \"replay\"}\n                     THEN \n                        nodeState' = [nodeState EXCEPT ![n] = \"invalid\"]\n                     ELSE \n                        nodeState' = [nodeState EXCEPT ![n] = \"invalid_write\"] \n           ELSE\n                  UNCHANGED <<nodeState, nodeTS, nodeLastWriter, nodeWriteEpochID>>\n        /\\ UNCHANGED <<nodeLastWriteTS, aliveNodes, nodeRcvedAcks, epochID, nodeWriteEpochID>>\n     \n            \nHRcvVal(n) ==   \\* Process a received validation\n    \\E m \\in msgs: \n        /\\ nodeState[n] /= \"valid\"\n        /\\ m.type = \"VAL\"\n        /\\ equalTS(m.version, m.tieBreaker,\n                   nodeTS[n].version, \n                   nodeTS[n].tieBreaker)\n        /\\ nodeState' = [nodeState EXCEPT ![n] = \"valid\"]\n        /\\ UNCHANGED <<msgs, nodeTS, nodeLastWriter, nodeLastWriteTS,\n                       aliveNodes, nodeRcvedAcks, epochID, nodeWriteEpochID>>\n   \nHFollowerWriteReplay(n) == \\* Execute a write-replay when coordinator failed\n    /\\  nodeState[n] \\in {\"invalid\", \"invalid_write\"}\n    /\\  ~isAlive(nodeLastWriter[n])\n    /\\  h_actions_for_upd_replay(n, {}) \n\n   \nHFollowerActions(n) ==  \\* Actions of a write follower\n    \\/ HRcvInv(n)\n    \\/ HFollowerWriteReplay(n)\n    \\/ HRcvVal(n) \n \n------------------------------------------------------------------------------------- \n\nHNext == \\* Hermes (read/write) protocol (Coordinator and 
Follower actions) + failures\n    \\E n \\in aliveNodes:       \n            \\/ HFollowerActions(n)\n            \\/ HCoordinatorActions(n)\n            \\/ nodeFailure(n) \n\n\nH_Spec == HInit /\\ [][HNext]_hvars\n\n\nTHEOREM H_Spec =>([]HTypeOK) /\\ ([]HConsistent)\n\n=============================================================================\n"
  },
  {
    "path": "tla/HermesRMWs.tla",
    "content": "------------------------------- MODULE HermesRMWs -------------------------------\nEXTENDS     Hermes\n            \nVARIABLES   Rmsgs,\n            nodeFlagRMW,\n            committedRMWs,\n            committedWrites\n                                 \n\\* all Hermes (+ environment, + RMW) variables\nhrvars == << msgs, nodeTS, nodeState, nodeRcvedAcks, nodeLastWriter, \n             nodeLastWriteTS, nodeWriteEpochID, aliveNodes, epochID,\n             Rmsgs, nodeFlagRMW, committedRMWs, committedWrites >>\n-------------------------------------------------------------------------------------\nHRMessage ==  \\* Invalidation msgs exchanged by the Hermes Protocol w/ RMWs  \n    [type: {\"RINV\"},       flagRMW   : {0,1}, \\* RMW change\n                           epochID   : 0..(Cardinality(H_NODES) - 1),\n                           sender    : H_NODES,\n                           version   : 0..H_MAX_VERSION,\n                           tieBreaker: H_NODES] \n\nHRts == [version: 0..H_MAX_VERSION,\n         tieBreaker: H_NODES]\n\nHRTypeOK ==  \\* The type correctness invariant\n    /\\  HTypeOK\n    /\\  Rmsgs           \\subseteq HRMessage\n    /\\  nodeFlagRMW     \\in [H_NODES -> {0,1}]\n    /\\  committedRMWs   \\subseteq HRts\n    /\\  committedWrites \\subseteq HRts\n    \nHRSemanticsRMW ==  \\* The invariant that an we cannot have two operations committed \n                   \\* with same versions (i.e., that read the same value unless they are both writes)\n    /\\ \\A x \\in committedRMWs:\n        \\A y \\in committedWrites: /\\ x.version /= y.version\n                                  /\\ x.version /= y.version - 1\n    /\\ \\A x,y \\in committedRMWs: \\/ x.version /= y.version\n                                 \\/ x.tieBreaker = y.tieBreaker\nHRInit == \\* The initial predicate\n    /\\  HInit\n    /\\  Rmsgs       = {}\n    /\\  committedRMWs   = {}\n    /\\  committedWrites = {}\n    /\\  nodeFlagRMW = [n \\in H_NODES |-> 0]  \\* RMW 
change\n    \n    \n-------------------------------------------------------------------------------------\n\n\\* A buffer maintaining all Invalidation  messages. Messages are only appended to this variable (not \n\\* removed once delivered) intentionally to check protocols tolerance in dublicates and reorderings \nHRsend(m) == Rmsgs' = Rmsgs \\union {m}  \n\nhr_upd_nothing ==\n    /\\ UNCHANGED <<nodeFlagRMW, Rmsgs, committedRMWs, committedWrites>>\n\nhr_completeWrite(ver, tieB) ==\n    /\\ committedWrites' = committedWrites \\union {[version |-> ver, tieBreaker |-> tieB]} \n    /\\ UNCHANGED <<Rmsgs, nodeFlagRMW, committedRMWs>>\n\nhr_completeRMW(ver, tieB) ==\n    /\\ committedRMWs' = committedRMWs \\union {[version |-> ver, tieBreaker |-> tieB]} \n    /\\ UNCHANGED <<Rmsgs, nodeFlagRMW, committedWrites>>\n\n\n-------------------------------------------------------------------------------------\n\\* Helper functions \nhr_upd_state(n, newVersion, newTieBreaker, newState, newAcks, flagRMW) == \n    /\\  nodeFlagRMW'      = [nodeFlagRMW     EXCEPT ![n] = flagRMW] \\* RMW change\n    /\\  h_upd_state(n, newVersion, newTieBreaker, newState, newAcks)\n\nhr_send_inv(n, newVersion, newTieBreaker, flagRMW) ==  \n    /\\  HRsend([type        |-> \"RINV\",\n                epochID     |-> epochID, \\* we always use the latest epochID\n                flagRMW     |-> flagRMW, \\* RMW change\n                sender      |-> n,\n                version     |-> newVersion, \n                tieBreaker  |-> newTieBreaker])              \n\nhr_actions_for_upd(n, newVersion, newTieBreaker, newState, newAcks, flagRMW) == \\* Execute a write\n    /\\  hr_upd_state(n, newVersion, newTieBreaker, newState, newAcks, flagRMW)\n    /\\  hr_send_inv(n, newVersion, newTieBreaker, flagRMW)\n    /\\  UNCHANGED <<aliveNodes, epochID, msgs, committedRMWs, committedWrites>>\n \n\nhr_actions_for_upd_replay(n, acks) == \\* Apply a write-replay using same TS (version, Tie Breaker) \n                
                \\* and either reset acks or keep already gathered acks\n    /\\  hr_actions_for_upd(n, nodeTS[n].version, nodeTS[n].tieBreaker, \"replay\", acks, nodeFlagRMW[n])\n \n \n-------------------------------------------------------------------------------------\n\\* Coordinator functions \n\nHRWrite(n) == \\* Execute a write\n\\*    /\\  nodeState[n]      \\in {\"valid\", \"invalid\"}\n    \\* writes in invalid state are also supported as an optimization\n    /\\  nodeState[n]            = \"valid\"\n    /\\  nodeTS[n].version + 2 <= H_MAX_VERSION \\* Only to configurably terminate the model checking \n    /\\  hr_actions_for_upd(n, nodeTS[n].version + 2, n, \"write\", {}, 0)\n   \nHRRMW(n) == \\* Execute an RMW\n    /\\  nodeState[n]            = \"valid\"\n    /\\  nodeTS[n].version + 1 <= H_MAX_VERSION \\* Only to configurably terminate the model checking \n    /\\  hr_actions_for_upd(n, nodeTS[n].version + 1, n, \"write\", {}, 1)\n               \nHRWriteReplay(n) == \\* Execute a write-replay\n    /\\  nodeState[n] \\in {\"write\", \"replay\"}\n    /\\  nodeWriteEpochID[n] < epochID\n    /\\  ~receivedAllAcks(n) \\* optimization to not replay when we have gathered acks from all alive\n    /\\  nodeFlagRMW[n] = 0\n    /\\  hr_actions_for_upd_replay(n, nodeRcvedAcks[n])\n\nHRRMWReplay(n) == \\* Execute an RMW-replay\n    /\\  nodeState[n] \\in {\"write\", \"replay\"}\n    /\\  nodeWriteEpochID[n] < epochID\n    /\\  ~receivedAllAcks(n) \\* optimization to not replay when we have gathered acks from all alive\n    /\\  nodeFlagRMW[n] = 1\n    /\\  hr_actions_for_upd_replay(n, {})\n\n\\* Keep the HRead, HRcvAck and HSendVals the same as Hermes w/o RMWs\nHRRead(n) == \n    /\\ HRead(n)\n    /\\ hr_upd_nothing \n    \nHRRcvAck(n) == \n    /\\ HRcvAck(n)\n    /\\ hr_upd_nothing \n    \nHRSendValsRMW(n) == \n    /\\ nodeFlagRMW[n] = 1\n    /\\ HSendVals(n)\n    /\\ hr_completeRMW(nodeTS[n].version, nodeTS[n].tieBreaker)\n\nHRSendValsWrite(n) == \n    /\\ 
nodeFlagRMW[n] = 0\n    /\\ HSendVals(n)\n    /\\ hr_completeWrite(nodeTS[n].version, nodeTS[n].tieBreaker)\n\nHRCoordinatorActions(n) ==   \\* Actions of a read/write/RMW coordinator \n    \\/ HRRead(n)          \n    \\/ HRRMWReplay(n)\n    \\/ HRWriteReplay(n) \n    \\/ HRWrite(n)      \n    \\/ HRRMW(n)      \n    \\/ HRRcvAck(n)\n    \\/ HRSendValsRMW(n)\n    \\/ HRSendValsWrite(n)\n    \n-------------------------------------------------------------------------------------               \n\\* Follower functions \nhr_upd_state_greater_inv(n) ==\n        IF      nodeState[n] \\in {\"valid\", \"invalid\", \"replay\"}\n        THEN    \n            nodeState' = [nodeState EXCEPT ![n] = \"invalid\"]\n        ELSE IF nodeState[n] \\in {\"write\", \"invalid_write\"} /\\ nodeFlagRMW[n] = 0  \n        THEN\n            nodeState' = [nodeState EXCEPT ![n] = \"invalid_write\"] \n        ELSE \\* nodeState[n] \\in {\"write\"} /\\ nodeFlagRMW[n] = 1 \n            nodeState' = [nodeState EXCEPT ![n] = \"invalid\"]    \n        \n\nHRRcvWriteInv(n) ==  \\* Process a received invalidation for a write\n    \\E m \\in Rmsgs: \n        /\\ m.type = \"RINV\"\n        /\\ m.epochID  = epochID\n        /\\ m.sender /= n\n        /\\ m.flagRMW = 0 \\* RMW change\n        \\* always acknowledge a received invalidation (irrelevant to the timestamp)\n        /\\ h_send_inv_or_ack(n, m.version, m.tieBreaker, \"ACK\") \n        /\\ IF greaterTS(m.version, m.tieBreaker,\n                        nodeTS[n].version, nodeTS[n].tieBreaker)\n           THEN \n                /\\ nodeLastWriter' = [nodeLastWriter EXCEPT ![n] = m.sender]\n                /\\ nodeFlagRMW'    = [nodeFlagRMW    EXCEPT ![n] = m.flagRMW] \\* RMW change            \n                /\\ nodeTS' = [nodeTS EXCEPT ![n].version    = m.version,\n                                          ![n].tieBreaker = m.tieBreaker]\n                /\\ hr_upd_state_greater_inv(n)\n           ELSE\n                /\\ UNCHANGED 
<<nodeState, nodeTS, nodeLastWriter, nodeFlagRMW>>\n        /\\ UNCHANGED <<nodeLastWriteTS, aliveNodes, nodeRcvedAcks, Rmsgs, \n                       epochID, nodeWriteEpochID, committedRMWs, committedWrites>>\n \nHRRcvRMWInv(n) ==  \\* Process a received invalidation for an RMW\n    \\E m \\in Rmsgs: \n        /\\ m.type = \"RINV\"\n        /\\ m.epochID  = epochID\n        /\\ m.sender /= n\n        /\\ m.flagRMW = 1        \n        /\\ IF greaterTS(m.version, m.tieBreaker,\n                        nodeTS[n].version, nodeTS[n].tieBreaker)\n           THEN\n                /\\ nodeLastWriter' = [nodeLastWriter EXCEPT ![n] = m.sender]\n                /\\ nodeFlagRMW'    = [nodeFlagRMW    EXCEPT ![n] = m.flagRMW] \\* RMW change            \n                /\\ nodeTS' = [nodeTS EXCEPT ![n].version    = m.version,\n                                          ![n].tieBreaker = m.tieBreaker]\n                \\* acknowledge a received invalidation (w/ greater timestamp)\n                /\\ h_send_inv_or_ack(n, m.version, m.tieBreaker, \"ACK\") \n                /\\ hr_upd_state_greater_inv(n)\n                /\\ UNCHANGED <<Rmsgs>>\n            ELSE IF equalTS(m.version, m.tieBreaker,\n                            nodeTS[n].version, nodeTS[n].tieBreaker)\n            THEN\n                \\* acknowledge a received invalidation (w/ equal timestamp)\n                /\\ h_send_inv_or_ack(n, m.version, m.tieBreaker, \"ACK\") \n                /\\ UNCHANGED <<nodeState, nodeTS, nodeLastWriter, nodeFlagRMW, Rmsgs>>\n            ELSE \\* smaller TS\n                /\\ hr_send_inv(n, nodeTS[n].version, nodeTS[n].tieBreaker, nodeFlagRMW[n])\n                /\\ UNCHANGED <<nodeState, nodeTS, nodeLastWriter, nodeFlagRMW, msgs>>\n        /\\ UNCHANGED <<nodeLastWriteTS, aliveNodes, nodeRcvedAcks, epochID, \n                       nodeWriteEpochID, committedRMWs, committedWrites>> \n \n         \n\\* Keep the HRcvVals the same as Hermes w/o RMWs\nHRRcvVal(n) == \n    /\\ 
HRcvVal(n)\n    /\\ hr_upd_nothing\n    \n    \nHRFollowerWriteReplay(n) == \\* Execute a write-replay when coordinator failed\n    /\\  nodeState[n] \\in {\"invalid\", \"invalid_write\"}\n    /\\  ~isAlive(nodeLastWriter[n])\n    /\\  hr_actions_for_upd_replay(n, {})\n                           \n\nHRFollowerActions(n) ==  \\* Actions of a write follower\n    \\/ HRFollowerWriteReplay(n)\n    \\/ HRRcvWriteInv(n)\n    \\/ HRRcvRMWInv(n)\n    \\/ HRRcvVal(n) \n-------------------------------------------------------------------------------------                       \n\nHRNodeFailure(n) == \n    /\\ nodeFailure(n)\n    /\\ hr_upd_nothing\n    \n    \nHRNext == \\* Hermes (read,write RMWs) protocol (Coordinator and Follower actions) + failures\n    \\E n \\in aliveNodes:       \n            \\/ HRFollowerActions(n)\n            \\/ HRCoordinatorActions(n)\n            \\/ HRNodeFailure(n) \n            \n            \n\\* Hermes w/ RMW Spec\nHRSpec == HRInit /\\ [][HRNext]_hrvars\nTHEOREM HRSpec =>([]HRTypeOK) /\\ ([]HConsistent) /\\ ([]HRSemanticsRMW)\n\n\\* A hacky way to run Hermes w/o RMWs from the same model\nHSpec == HRInit /\\ [][HNext /\\ hr_upd_nothing]_hrvars\nTHEOREM HSpec =>([]HRTypeOK) /\\ ([]HConsistent)\n\n=============================================================================\n\n"
  },
  {
    "path": "tla/README.md",
    "content": "# Hermes-Protocol\nTLA spec - Hermes: fault-tolerant replication protocol with strong consistency and high performance\n\n---\nWarning \nprotocol-actions png contains some optimizations over the Hermes protocol presented \nin the paper such as issuing writes while being in Invalid state.\n"
  }
]