Repository: ease-lab/Hermes Branch: master Commit: 949229c23881 Files: 56 Total size: 412.5 KB Directory structure: gitextract_es11wv71/ ├── .clang-format ├── .gitignore ├── AUTHORS ├── CMakeLists.txt ├── LICENSE ├── README.md ├── bin/ │ ├── copy-exec-files.sh │ ├── copy-n-exec-hermesKV.sh │ ├── copy-n-exec-rCRAQ.sh │ ├── copy-traces.sh │ ├── csv_latency_parser.py │ ├── exec-derecho.sh │ ├── format.sh │ ├── get-system-xput-files.sh │ ├── setup.sh │ └── trace-spliter.sh ├── exec/ │ ├── Makefile │ ├── hosts.sh │ ├── results/ │ │ ├── latency/ │ │ │ └── .gitinclude │ │ └── xput/ │ │ ├── all-nodes/ │ │ │ └── .gitkeep │ │ └── per-node/ │ │ └── .gitkeep │ ├── run-hades.sh │ ├── run-hermesKV.sh │ ├── run-rCRAQ.sh │ └── run.sh ├── include/ │ ├── hades/ │ │ └── hades.h │ ├── hermes/ │ │ ├── config.h │ │ ├── inline-util.h │ │ ├── spacetime.h │ │ └── util.h │ ├── mica-herd/ │ │ ├── city.h │ │ ├── hrd.h │ │ ├── mica.h │ │ └── sizes.h │ ├── utils/ │ │ ├── bit_vector.h │ │ ├── concur_ctrl.h │ │ └── time_rdtsc.h │ └── wings/ │ ├── wings.h │ └── wings_api.h ├── src/ │ ├── CR/ │ │ ├── crKV.c │ │ └── cr_worker.c │ ├── hades/ │ │ ├── hades.c │ │ └── test.c │ ├── hermes/ │ │ ├── hermesKV.c │ │ ├── hermes_worker.c │ │ ├── main.c │ │ ├── spacetime.c │ │ ├── stats.c │ │ └── util.c │ ├── mica-herd/ │ │ ├── city.c │ │ ├── herd.c │ │ └── mica.c │ └── wings/ │ └── wings.c └── tla/ ├── Hermes.tla ├── HermesRMWs.tla └── README.md ================================================ FILE CONTENTS ================================================ ================================================ FILE: .clang-format ================================================ --- BasedOnStyle: Chromium AlignAfterOpenBracket: Align AlignConsecutiveDeclarations: 'false' AlignEscapedNewlines: Left AlignOperands: 'true' AllowShortFunctionsOnASingleLine: All AllowShortIfStatementsOnASingleLine: WithoutElse AlwaysBreakAfterDefinitionReturnType: TopLevel AlwaysBreakTemplateDeclarations: 'Yes' BinPackArguments: 'true' 
BinPackParameters: 'true' BreakBeforeBraces: WebKit CompactNamespaces: 'false' Cpp11BracedListStyle: 'true' IndentWrappedFunctionNames: 'false' Language: Cpp NamespaceIndentation: None SpaceAfterTemplateKeyword: 'true' SpaceBeforeAssignmentOperators: 'true' SpaceBeforeCpp11BracedList: 'true' SpaceBeforeParens: ControlStatements SpaceInEmptyParentheses: 'false' SpacesInAngles: 'false' SpacesInParentheses: 'false' SpacesInSquareBrackets: 'false' UseTab: Never ...
================================================ FILE: .gitignore ================================================
# ignore temporary files
.*.swp
\#*#
*.pyc
*.o
*.hi
*.dump
*.log
*.rej
*.orig
*.patch
*.diff
.tags*

# ignore executables
/src/mica/test
/src/libhrd/main
/src/herd-hybrid/main
/src/herd-UD/main
src/Armonia/main
/src/CR/cr
/src/hermes/hermes
/src/hades/hades
/src/hermes/hermes-wings

# ignore debug files
/debug/*.txt

# ignore traces
/traces/*.txt

# ignore ide files
/.idea/
/cmake-build-debug/
/src/cmake-build-debug/
/src/.idea/
/src/cache/cmake-build-debug/
/src/cache/.idea/
/src/Armonia/armonia-ec
/src/Armonia/armonia-sc
/src/Armonia/throughput.txt
/src/herd-UD/throughput.txt
/bin/traces

# ignore experiment results
#/results/*
/exec/results/*.txt
/exec/results/xput/*.txt
/exec/results/xput/*.csv
/exec/results/xput/per-node/*.csv
/exec/results/xput/per-node/*.txt
/exec/results/xput/all-nodes/*.txt
/exec/results/latency/*.txt
/exec/results/latency/*.csv
/results/*.txt
/results/xput/*.txt
/results/xput/*.csv
/results/xput/per-node/*.csv
/results/xput/per-node/*.txt
/results/xput/all-nodes/*.txt
/results/latency/*.txt
/results/latency/*.csv
traces/trace-parts/*
/results/scattered-results/*
/results/aggregated-system-results/*.csv
/traces/system-traces/*.txt
/traces/current-splited-traces/*.txt
/traces/*.txt
traces/

# ignore the binaries placed in exec/
# (anchored with a leading "/": a pattern that starts with "./" never matches)
/exec/hermesKV
/exec/rCRAQ
/exec/hades
================================================ FILE: AUTHORS ================================================ Run `git shortlog -se` for an up-to-date list of contributors.
--- Principal authors: Antonios Katsarakis Vasilis Gavrielatos
================================================ FILE: CMakeLists.txt ================================================
######################################################################################
# WARNING: DO NOT MAKE through cmake use the Makefile in /exec/ to compile instead!!!!
######################################################################################
# This file only describes the three executables (cr, hades, hermes) and the
# libraries they link against, for CMake-based tooling.

# CMAKE_C_STANDARD and the Threads::Threads imported target require CMake >= 3.1,
# and CMake 4.x refuses projects that request compatibility with versions < 3.5.
cmake_minimum_required(VERSION 3.5...3.29)

# Every source file is C; do not require a C++ compiler.
project(hermes LANGUAGES C)

set(Hermes_VERSION_MAJOR 1)
set(Hermes_VERSION_MINOR 0)

set(CMAKE_C_STANDARD 11)

find_package(Threads REQUIRED)

# Chain-replication (CR) executable.
set(SOURCE_FILES_cr
    # Source files
    src/CR/cr_worker.c
    src/wings/wings.c
    src/hermes/main.c
    src/hermes/stats.c
    src/hermes/spacetime.c
    src/mica-herd/mica.c
    src/mica-herd/city.c
    src/mica-herd/herd.c
    ##### header files ####
    include/wings/wings.h
    include/wings/wings_api.h
    include/mica-herd/city.h
    include/mica-herd/hrd.h
    include/mica-herd/sizes.h
    include/hermes/util.h
    include/hermes/config.h
    include/utils/bit_vector.h
    include/utils/concur_ctrl.h
    src/CR/crKV.c)

# Hades executable.
set(SOURCE_FILES_hades
    # Source files
    src/wings/wings.c
    src/hades/hades.c
    ##### header files ####
    include/wings/wings_api.h
    include/wings/wings.h
    include/hades/hades.h
    src/hades/test.c)

# Hermes executable.
set(SOURCE_FILES_hermes
    # Source files
    src/hermes/main.c
    src/hermes/util.c
    src/hermes/hermes_worker.c
    src/hermes/stats.c
    src/hermes/spacetime.c
    src/mica-herd/herd.c
    src/mica-herd/mica.c
    src/mica-herd/city.c
    src/wings/wings.c
    ##### header files ####
    include/mica-herd/hrd.h
    include/mica-herd/city.h
    include/mica-herd/sizes.h
    include/hermes/util.h
    include/hermes/config.h
    include/utils/concur_ctrl.h
    include/utils/bit_vector.h
    include/hades/hades.h
    include/wings/wings.h
    include/wings/wings_api.h
    src/hermes/hermesKV.c)

add_executable(cr ${SOURCE_FILES_cr})
add_executable(hades ${SOURCE_FILES_hades})
add_executable(hermes ${SOURCE_FILES_hermes})

# All three executables share the same include path, warnings and libraries.
# The former include/libhrd and include/optik entries are dropped because those
# directories are not part of the repository; /usr/include is already searched
# by the compiler.
# NOTE(review): include/hades, include/utils and include/wings were never on the
# include path here; add them if the sources include those headers by bare name.
foreach(hermes_target cr hades hermes)
  target_include_directories(${hermes_target} PRIVATE
      "${CMAKE_CURRENT_SOURCE_DIR}/include/hermes"
      "${CMAKE_CURRENT_SOURCE_DIR}/include/mica-herd")
  target_compile_options(${hermes_target} PRIVATE -Wall)
  target_link_libraries(${hermes_target} PRIVATE
      Threads::Threads
      ibverbs
      rt
      memcached
      numa
      rdmacm)
endforeach()
================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the 
Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. 
Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. 
To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: README.md ================================================ # Hermes Reliable Replication Protocol This is the publicly available artifact repository supporting the ASPLOS'20 paper [_"Hermes: A Fast, Fault-Tolerant and Linearizable Replication Protocol"_](http://hermes-protocol.com "Hermes Arxiv version"). The repository contains both code to experimentally evaluate Hermes(KV) and complete Hermes TLA+ specifications which can be used to verify Hermes correctness via model-checking. 
[![top picks](https://badgen.net/badge/honorable%20mention/top%20picks%20'20/d99e14)](https://www.sigarch.org/call-contributions/ieee-micro-top-picks/) [![available](https://badgen.net/badge/acm%20badge/available/117c00)](https://www.acm.org/publications/policies/artifact-review-badging#available) [![functional](https://badgen.net/badge/acm%20badge/functional/FB1f44)](https://www.acm.org/publications/policies/artifact-review-badging#functional) [![stars](https://badgen.net/github/stars/ease-lab/Hermes)]() [![license](https://badgen.net/badge/webpage/Hermes/blue)](http://hermes-protocol.com/) [![license](https://badgen.net/badge/license/Apache%202.0/blue)](https://github.com/ease-lab/Hermes/blob/master/LICENSE) [![last commit](https://badgen.net/github/last-commit/ease-lab/Hermes)]() follow on Twitter ## Citation ``` @inbook{Katsarakis:20, author = {Katsarakis, Antonios and Gavrielatos, Vasilis and Katebzadeh, M.R. Siavash and Joshi, Arpit and Dragojevic, Aleksandar and Grot, Boris and Nagarajan, Vijay}, title = {Hermes: A Fast, Fault-Tolerant and Linearizable Replication Protocol}, year = {2020}, publisher = {Association for Computing Machinery}, address = {New York, NY, USA}, booktitle = {Proceedings of the Twenty-Fifth International Conference on Architectural Support for Programming Languages and Operating Systems}, pages = {201–217}, numpages = {17} } ``` ---- ## High Performance Features - _Reads_: i) Local ii) Load-balanced (served by any replica) - _Updates (Writes and RMWs)_: i) Inter-key concurrent ii) Decentralized iii) Fast (1rtt commit -- any replica) - _Writes_: iv) Non-conflicting (i.e., never abort) ## Consistency and Properties Linearizable reads, writes and RMWs with the following properties: 1. _Writes_: from a live replica _always commit_ after Invalidating (and getting acknowledgments from) the rest of the live replicas. 1.
_RMWs_: at most one of the possibly concurrent RMWs to a key can commit, and this only once all acknowledgments from live replicas are gathered. 1. _Reads_: return the local value if the targeted keys are found in the Valid state and the coordinator was considered live at the time of reading. The latter can be ensured locally if the coordinator has a lease for (and is part of) the membership. ## Fault Tolerance Coupling Invalidations with per-key logical timestamps (i.e., Lamport clocks) and propagating the value to be updated with the invalidation message (_early value propagation_), Hermes allows any replica blocked by an update (write or RMW) to safely replay the update and unblock itself and the rest of the followers. ---- ## Hardware dependencies A homogeneous cluster of x86_64 nodes interconnected via RDMA network cards and switches (tested on "Mellanox ConnectX-4" Infiniband infrastructure). ## Software requirements Linux OS (tested on Ubuntu 18.04 4.15.0-55-generic) with root access. The software is tested using the following version of Mellanox OFED RDMA drivers `MLNX_OFED_LINUX-4.4-2.0.7.0`. Third-party libraries that you will require to run the experiments include: 1. _parallel_ (Cluster management scripts only) 1. _libmemcached-dev_ (used to exchange QP information for the setup of RDMA connections) 1. _libnuma-dev_ (for mbind) ## Setup On every node: 1. Install Mellanox OFED ibverbs drivers 1. `./hermes/bin/setup.sh` On manager (just pick one node in the cluster): 1. Fill variables in `/hermes/exec/hosts.sh` 1. Configure setup and default parameters in `/hermes/include/hermes/config.h` 1. From `/hermes/exec/` compile _hermesKV_ through make 1. scp _hermesKV_ and the configured hosts.sh into the `/hermes/exec/` directory of all other nodes in the cluster. ## Compilation `cd hermes/exec; make` _Warning_: Do not compile through cmake; instead use the Makefile in the exec/ directory.
## Run Run first on manager: `./run-hermesKV.sh ` Then run on all other member nodes `./run-hermesKV.sh ` > Note that some members will eagerly terminate if experiment uses smaller number of nodes than specified in hosts.sh An experiment example for three nodes 12 worker threads and 35% write ratio would be as follows: `./run-hermesKV.sh -W 12 -w 350 -M 3` Supported command-line arguments for the experiments are detailed in the run-hermesKV.sh script. --- ## Acknowledgments Hermes is based on [HERD/MICA](https://github.com/efficient/HERD "Apache 2.0") design as an underlying KVS, the code of which we have adapted to implement HermesKV. ## Other Implementations of Hermes - [Odyssey](https://github.com/vasigavr1/Odyssey) - Hermes is also implemented in the Odyssey framework by [Vasilis Gavrielatos](https://github.com/vasigavr1) - [Olympus](https://github.com/sadraskol/olympus) - in Rust by [Thomas Bracher](https://twitter.com/sadraskol) ## Contact Antonios Katsarakis: `antonis.io` | [`antoniskatsarakis@yahoo.com`](mailto:antoniskatsarakis@yahoo.com?subject=[GitHub]%20Zeus%20Specification "Email")
================================================ FILE: bin/copy-exec-files.sh ================================================
#!/usr/bin/env bash
# Rebuilds the binaries in exec/ and then copies every file listed in
# FILES_TO_CPY from the local exec/ folder to the same path on each host in
# REMOTE_HOSTS.

FILES_TO_CPY=(
    "hosts.sh"
    "run.sh"
    "run-hermesKV.sh"
    "hermesKV"
    "run-rCRAQ.sh"
    "rCRAQ"
    # "hades"
    # "run-hades.sh"
)

EXEC_FOLDER="${HOME}/hermes/exec"

# Stop if exec/ is missing; otherwise make would run in the caller's directory.
cd "${EXEC_FOLDER}" || exit 1

# get Hosts (hosts.sh is expected to define the REMOTE_HOSTS array used below)
source ../exec/hosts.sh

# Do not distribute stale or missing binaries when the build fails.
make clean
make || exit 1
cd -

for FILE in "${FILES_TO_CPY[@]}"
do
    # One scp per remote host, run concurrently by GNU parallel ({} is the host).
    parallel scp "${EXEC_FOLDER}/${FILE}" "{}:${EXEC_FOLDER}/${FILE}" ::: "${REMOTE_HOSTS[@]}"
    echo "${FILE} copied to {${REMOTE_HOSTS[@]}}"
done
================================================ FILE: bin/copy-n-exec-hermesKV.sh ================================================ #!/usr/bin/env bash ### Runs to make #declare -a write_ratios=(0 10 50 200 500 1000) declare -a write_ratios=(1000) declare -a rmw_ratios=(0) #declare -a num_workers=(5 10 15 20 25 30 36) declare -a
num_workers=(1) #declare -a batch_sizes=(25 50 75 100 125 150 200 250) declare -a batch_sizes=(50) declare -a credits=(50) #declare -a coalesce=(1 5 10 15) declare -a coalesce=(15) #declare -a num_machines=(2 3 5 7) declare -a num_machines=(5) # Set LAT_WORKER to -1 to disable latency measurement or to worker id (i.e., from 0 up to [num-worker - 1]) LAT_WORKER="-1" #LAT_WORKER="0" EXEC_FOLDER="${HOME}/hermes/exec" REMOTE_COMMAND="cd ${EXEC_FOLDER}; bash run-hermesKV.sh" PASS="${1}" if [ -z "$PASS" ] then echo "\$PASS is empty! --> sudo pass for remotes is expected to be the first arg" exit; fi echo "\$PASS is OK!" cd ${EXEC_FOLDER} # get Hosts source ./hosts.sh ../bin/copy-exec-files.sh # Execute locally and remotely for M in "${num_machines[@]}"; do for RMW in "${rmw_ratios[@]}"; do for WR in "${write_ratios[@]}"; do for W in "${num_workers[@]}"; do for BA in "${batch_sizes[@]}"; do for CRD in "${credits[@]}"; do for COAL in "${coalesce[@]}"; do args=" -M ${M} -R ${RMW} -w ${WR} -W ${W} -b ${BA} -c ${CRD} -C ${COAL} -l ${LAT_WORKER}" echo ${PASS} | ./run-hermesKV.sh ${args} & sleep 2 # give some leeway so that manager starts before executing the members parallel "echo ${PASS} | ssh -tt {} $'${REMOTE_COMMAND} ${args}'" ::: $(echo ${REMOTE_HOSTS[@]}) >/dev/null done done done done done done done cd - >/dev/null ../bin/get-system-xput-files.sh ================================================ FILE: bin/copy-n-exec-rCRAQ.sh ================================================ #!/usr/bin/env bash USE_SAME_BATCH_N_CREDITS=0 ### Runs to make declare -a write_ratios=(1000) #declare -a num_workers=(5 10 15 20 25 30 36) declare -a num_workers=(1) #declare -a batch_sizes=(25 50 75 100 125 150 200 250) declare -a batch_sizes=(50) declare -a credits=(15) # WARNING credits for CR must be divided by the num_machines (i.e., credits % num_machines == 0) #declare -a coalesce=(1 5 10 15) declare -a coalesce=(10) #declare -a num_machines=(2 3 5 7) declare -a num_machines=(3) # Set 
LAT_WORKER to -1 to disable latency measurement or to worker id (i.e., from 0 up to [num-worker - 1]) LAT_WORKER="-1" #LAT_WORKER="0" #LOCAL_HOST=`hostname` EXEC_FOLDER="${HOME}/hermes/exec" REMOTE_COMMAND="cd ${EXEC_FOLDER}; bash run-rCRAQ.sh" PASS="${1}" if [ -z "$PASS" ] then echo "\$PASS is empty! --> sudo pass for remotes is expected to be the first arg" exit; fi echo "\$PASS is OK!" cd ${EXEC_FOLDER} # get Hosts source ./hosts.sh ../bin/copy-exec-files.sh if [ ${USE_SAME_BATCH_N_CREDITS} -eq 0 ] then for M in "${num_machines[@]}"; do # Execute locally and remotely for WR in "${write_ratios[@]}"; do for W in "${num_workers[@]}"; do for BA in "${batch_sizes[@]}"; do for CRD in "${credits[@]}"; do for COAL in "${coalesce[@]}"; do args=" -M ${M} -w ${WR} -W ${W} -b ${BA} -c ${CRD} -C ${COAL} -l ${LAT_WORKER}" echo ${PASS} | ./run-rCRAQ.sh ${args} & sleep 2 parallel "echo ${PASS} | ssh -tt {} $'${REMOTE_COMMAND} ${args}'" ::: $(echo ${REMOTE_HOSTS[@]}) >/dev/null done done done done done done else # Execute locally and remotely for M in "${num_machines[@]}"; do for WR in "${write_ratios[@]}"; do for W in "${num_workers[@]}"; do for BA in "${batch_sizes[@]}"; do for COAL in "${coalesce[@]}"; do args=" -M ${M} -w ${WR} -W ${W} -b ${BA} -c ${BA} -C ${COAL} -l ${LAT_WORKER}" echo ${PASS} | ./run-rCRAQ.sh ${args} & sleep 2 parallel "echo ${PASS} | ssh -tt {} $'${REMOTE_COMMAND} ${args}'" ::: $(echo ${REMOTE_HOSTS[@]}) >/dev/null done done done done done fi cd - >/dev/null ../bin/get-system-xput-files.sh
================================================ FILE: bin/copy-traces.sh ================================================
#!/usr/bin/env bash
# Copies the per-thread trace folder(s) listed in FOLDERS_TO_CPY from the local
# ~/hermes checkout to the same path on every host in REMOTE_HOSTS.

# The folder name matches the OUTPUT_DIR written by bin/trace-spliter.sh and the
# pattern in .gitignore ("splited", with a single t).
FOLDERS_TO_CPY=(
    "traces/current-splited-traces"
)

HOME_FOLDER="${HOME}/hermes"

# Stop if the checkout is missing; hosts.sh could not be sourced otherwise.
cd "${HOME_FOLDER}" >/dev/null || exit 1
# get Hosts (hosts.sh is expected to define the REMOTE_HOSTS array used below)
source ./exec/hosts.sh
cd - >/dev/null

for FOLDER in "${FOLDERS_TO_CPY[@]}"
do
    # One recursive scp per remote host, run concurrently by GNU parallel.
    parallel scp -r "${HOME_FOLDER}/${FOLDER}" "{}:${HOME_FOLDER}/${FOLDER}" ::: "${REMOTE_HOSTS[@]}"
    echo "${FOLDER} copied to {${REMOTE_HOSTS[@]}}"
done
================================================ FILE: bin/csv_latency_parser.py ================================================ #!/usr/bin/python import sys, os, ntpath, getopt """ ======== Parser for aggregated over time results ======== """ class LatencyParser: def __init__(self): self.latency_values = [] self.reads = [] self.max_read_latency = 0 self.max_write_latency = 0 self.writes = [] self.all_reqs = [] self.parseInputStats() self.printAllStats() # self.printStats(all_reqs) def printStats(self, array, max_latency): self.avgLatency(array) #self.percentileLatency(array, 20) self.percentileLatency(array, 50) self.percentileLatency(array, 90) self.percentileLatency(array, 95) self.percentileLatency(array, 99) #self.percentileLatency(array, 99.9) #self.percentileLatency(array, 99.99) #self.percentileLatency(array, 99.999) #self.percentileLatency(array, 99.9999) #self.percentileLatency(array, 100) print "Max Latency: ", max_latency, "us" def printAllStats(self): print "~~~~~~ Write Stats ~~~~~~~" self.printStats(self.writes, self.max_write_latency) print "\n~~~~~~ Read Stats ~~~~~~~~" self.printStats(self.reads, self.max_read_latency) print "\n~~~~~~ Overall Stats ~~~~~~~~~" self.printStats(self.all_reqs, max(self.max_read_latency, self.max_write_latency)) def avgLatency(self, array): cummulative = 0 total_reqs = 0 for x in xrange(len(self.latency_values)): cummulative = self.latency_values[x] * array[x] + cummulative total_reqs += array[x] if total_reqs > 0: print "Reqs measured: ", total_reqs, "| Avg Latency: ", cummulative / total_reqs else: print "No reqs measured" def percentileLatency(self, array, percentage): total_reqs = 0 sum_reqs = 0 for x in xrange(len(self.latency_values)): #cummulative = self.latency_values[x] * array[x] + cummulative total_reqs += array[x] if total_reqs > 0: if percentage == 100: for x in reversed(xrange(len(self.latency_values))): if array[x] > 0: if
self.latency_values[x] == -1: print percentage, "%: >", self.latency_values[x-1], "us" else: print percentage, "%: ", self.latency_values[x], "us" return else: for x in xrange(len(self.latency_values)): sum_reqs += array[x] if ((100.0 * sum_reqs) / total_reqs) >= percentage: if self.latency_values[x] == -1: print percentage, "%: >", self.latency_values[x-1], "us" else: print percentage, "% : ", self.latency_values[x], "us" return else: print "No reqs measured" def parseInputStats(self): lr_lines = 0 for line in sys.stdin: # input from standard input if line[0] == '#': continue (command, words) = line.strip().split(":",1) command = command.strip() if command == 'reads': words = words.strip().split(",") #if int(words[0].strip()) != -1: self.latency_values.append(int(words[0].strip())) self.reads.append(int(words[1].strip())) self.all_reqs.append(int(words[1].strip())) elif command == 'writes': words = words.strip().split(",") self.writes.append(int(words[1].strip())) self.all_reqs[lr_lines] = self.all_reqs[lr_lines] + self.writes[-1] lr_lines = lr_lines + 1 elif command == 'reads-hl': words = words.strip().split(",") self.max_read_latency = int(words[0].strip()) elif command == 'writes-hl': words = words.strip().split(",") self.max_write_latency = int(words[0].strip()) if __name__ == '__main__': LatencyParser() ================================================ FILE: bin/exec-derecho.sh ================================================ #!/usr/bin/env bash HOSTS=( ##### network cluster ##### "houston" "sanantonio" "austin" "indianapolis" "philly" # "atlanta" ##### compute cluster ##### # "baltimore" # "chicago" # "detroit" ) NUM_NODES=5 NUM_SENDERS=0 #0 - all senders, 1 - half senders, 2 - one sender REQS_PER_SENDER=10000000 ### Runs to make #declare -a delivery_mode=(0 1) #0 - ordered mode, 1 - unordered mode #declare -a object_size=(40 1024) #declare -a window_size=(128 256) declare -a delivery_mode=(0) #0 - ordered mode, 1 - unordered mode declare -a object_size=(256 
1024) declare -a window_size=(128 256) declare -a iterations=(1 2 3 4) #(1 2 3) for 3 iterations if [[ $NUM_NODES -ne ${#HOSTS[@]} ]] ; then echo "Num_nodes($NUM_NODES) != #Hosts(${#HOSTS[@]})" exit 1 fi LOCAL_HOST=`hostname` HOME_FOLDER="${HOME}/derecho-unified/Release/applications/tests/performance_tests/" #pin derecho threads to cores (w/o using hyperthreads) of numa node 0 COMMAND_NO_ARGS="taskset -c 0,2,4,6,8,10,12,14,16,18 ./bandwidth_test " total_iters=0 cd ${HOME_FOLDER} >/dev/null # Execute locally and remotely for del_mode in "${delivery_mode[@]}"; do for obj_size in "${object_size[@]}"; do for win_size in "${window_size[@]}"; do for iter in "${iterations[@]}"; do total_iters=$((total_iters + 1)) args="--DERECHO/max_payload_size=${obj_size} --DERECHO/window_size=${win_size} -- ${NUM_NODES} ${NUM_SENDERS} ${REQS_PER_SENDER} ${del_mode}" COMMAND=" ${COMMAND_NO_ARGS} ${args}" echo "Running Derecho with: delivery_mode:${del_mode} obj size: $obj_size, window_size: $win_size nodes: $NUM_NODES " ${COMMAND} >/dev/null & sleep 1 parallel "ssh -tt {} $'cd ${HOME_FOLDER}; ${COMMAND}'" ::: $(echo ${HOSTS[@]/$LOCAL_HOST}) >/dev/null sleep 9 # give local node some leeway to log the results into a file done done done done tail -${total_iters} data_derecho_bw cd - >/dev/null ================================================ FILE: bin/format.sh ================================================ #!/bin/bash SCRIPT_DIR="$(dirname "$0")" cd "${SCRIPT_DIR}" FORMAT_FILES_IN_DIRECTORIES="../src/ ../include/" clang-format --version > /dev/null || exit 1 if [ "$1" = "check" ]; then # Check clang-format has been applied! find ${FORMAT_FILES_IN_DIRECTORIES} \ -regex '.*\.\(cpp\|hpp\|cc\|cxx\)' \ -exec clang-format -style=file -output-replacements-xml -i {} \; | grep -c "/dev/null if [ $? -ne 1 ]; then echo "Format check: Failed!" echo " -- Files do not match clang-format. Run bin/format.sh before adding files to git!" exit 1 else echo "Format check: Passed!" 
fi
else
  # Apply clang-format to all files
  find ${FORMAT_FILES_IN_DIRECTORIES} \
    -regex '.*\.\(c\|h\|cpp\|hpp\|cc\|cxx\)' \
    -exec clang-format -style=file -i {} \;
fi

================================================
FILE: bin/get-system-xput-files.sh
================================================
#!/usr/bin/env bash
# Collects the per-node throughput files from the remote hosts and sums them
# into one system-wide result file per file-name prefix.

EXEC_FOLDER="${HOME}/hermes/exec"
RESULTS_FOLDER="${HOME}/hermes/exec/results"
RESULT_FOLDER="${RESULTS_FOLDER}/xput/per-node/"
RESULT_OUT_FOLDER="${RESULTS_FOLDER}/xput/per-node/"
RESULT_OUT_FOLDER_MERGE="${RESULTS_FOLDER}/xput/all-nodes/"

cd ${EXEC_FOLDER} >/dev/null
# get Hosts
source ./hosts.sh
cd - >/dev/null

# Gather remote files
parallel "scp {}:${RESULT_FOLDER}* ${RESULT_OUT_FOLDER} " ::: $(echo ${REMOTE_HOSTS[@]})
echo "xPut result files copied from: {${REMOTE_HOSTS}}"

# group all files
# awk prints the first '-'-separated field of each file name, once per distinct
# second field; every printed prefix becomes one merged output file.
ls ${RESULT_OUT_FOLDER} | awk -F '-' '!x[$2]++{print $1}' | while read -r line; do
  # Create an intermediate file print the 3rd line for all files with the same prefix to the same file
  awk 'FNR==3 {print $0}' ${RESULT_OUT_FOLDER}/$line* > ${RESULT_OUT_FOLDER_MERGE}/$line-inter.txt
  # Sum up the xPut of the (3rd iteration) from every node to create the final file
  awk -F ':' '{sum += $2} END {print sum}' ${RESULT_OUT_FOLDER_MERGE}/$line-inter.txt > ${RESULT_OUT_FOLDER_MERGE}/$line.txt
  rm -rf ${RESULT_OUT_FOLDER_MERGE}/$line-inter.txt
done

echo "System-wide xPut results produced in ${RESULT_OUT_FOLDER_MERGE} directory!"

================================================
FILE: bin/setup.sh
================================================
#!/usr/bin/env bash
# Exec this script in every cluster node after you have
# installed the (Infiniband) Verbs drivers through Mellanox OFED:
# 1. Download the MLNX_OFED (tested on --> MLNX_OFED_LINUX-4.4-2.0.7.0-ubuntu18.04-x86_64)
#    https://www.mellanox.com/page/products_dyn?product_family=26
# 2. tar -xvf the tar file
# 3. install through --> sudo ./mlnxofedinstall

# The negated test itself follows on the next line of this dump, hence the
# line continuation.
if ! \
[ -x "$(command -v ofed_info)" ]; then
  # ofed_info is not an executable on the PATH: print install instructions and abort
  echo "Error: mellanox ofed is not installed." >&2
  echo " Please install the (Infiniband) Verbs drivers through Mellanox OFED by:"
  echo " 1. Download the MLNX_OFED (tested on --> MLNX_OFED_LINUX-4.4-2.0.7.0-ubuntu18.04-x86_64)"
  echo " https://www.mellanox.com/page/products_dyn?product_family=26"
  echo " 2. tar -xvf the tar file"
  echo " 3. install through --> sudo ./mlnxofedinstall"
  exit 1
else
  # Report the first line of ofed_info as the installed driver version
  MLNX_OFED_VERSION=`ofed_info | head -1`
  echo "Running OFED driver version: ${MLNX_OFED_VERSION}" >&2
fi

# Install required Libraries (memcached is used to setup RDMA connection and numa for mbind)
sudo apt --yes install libmemcached-dev libnuma-dev memcached

# start a subnet manager
sudo /etc/init.d/opensmd start # there must be at least one subnet-manager in an infiniband subnet cluster
# start the driver
sudo /etc/init.d/openibd start

# Configure (2MB) huge-pages for the KVS
# Note that such a huge page allocation is not permanent and must be re-applied after a node reboot.
#echo 8192 | sudo tee /sys/devices/system/node/node*/hugepages/hugepages-2048kB/nr_hugepages
echo 4096 | sudo tee /sys/devices/system/node/node*/hugepages/hugepages-2048kB/nr_hugepages
# Raise the System V shared-memory limits
echo 10000000001 | sudo tee /proc/sys/kernel/shmmax
echo 10000000001 | sudo tee /proc/sys/kernel/shmall

================================================
FILE: bin/trace-spliter.sh
================================================
#!/usr/bin/env bash
# Splits one system-wide trace file into equally sized per-thread chunks.

INPUT_DIR="${HOME}/hermes/traces/system-traces/"
INPUT_FILENAME="simple_trace_w_100000000_k_1000000_a_0.99.txt"
OUTPUT_DIR="${HOME}/hermes/traces/current-splited-traces/"
OUTPUT_PREFIX="t_"
OUTPUT_SUFFIX="_a_0.99.txt"

MAX_NUM_NODES=10
MAX_THREADS_PER_NODE=40

# One chunk per (node, thread) pair
CHUNKS=$(expr ${MAX_NUM_NODES} \* ${MAX_THREADS_PER_NODE})
# Number of lines in the input trace
LINES=$(wc -l ${INPUT_DIR}/${INPUT_FILENAME} | cut -d ' ' -f1)

echo "Splitting trace with $LINES lines into $CHUNKS (per-thread) chunks ..."
split -l $(expr ${LINES} / ${CHUNKS}) \
  -a 4 -d \
  --additional-suffix=${OUTPUT_SUFFIX} \
  ${INPUT_DIR}/${INPUT_FILENAME} \
  ${OUTPUT_DIR}/${OUTPUT_PREFIX}

================================================
FILE: exec/Makefile
================================================
# Builds the hermesKV and rCRAQ executables (and optionally hades) into exec/.
CPPFLAGS := -O3 #-Wno-unused-result -Wall -Werror
LD := gcc -O3 -flto
LDFLAGS := ${LDFLAGS} -libverbs -lrt -lpthread -lmemcached -lnuma # -lrdmacm --> TODO we do not use hw multicast because it helps only on master-based patterns
CFLAGS = -I../include/mica-herd -I../include/hermes -I../include/wings -I../include/hades
APPS := hermesKV rCRAQ
PROF := -g -fno-omit-frame-pointer

# Build every application, then remove the intermediate object files
all: ${APPS} clean-o

hermesKV: ../src/wings/wings.o ../src/hades/hades.o \
	../src/mica-herd/herd.o ../src/mica-herd/mica.o ../src/mica-herd/city.o \
	../src/hermes/main.o ../src/hermes/hermes_worker.o ../src/hermes/util.o \
	../src/hermes/stats.o ../src/hermes/spacetime.o ../src/hermes/hermesKV.o
	${LD} -o $@ $^ ${LDFLAGS}

rCRAQ: ../src/mica-herd/herd.o ../src/mica-herd/mica.o \
	../src/mica-herd/city.o ../src/hermes/main.o ../src/CR/cr_worker.o ../src/CR/crKV.o \
	../src/hermes/spacetime.o ../src/hermes/util.o ../src/hermes/stats.o ../src/wings/wings.o
	${LD} -o $@ $^ ${LDFLAGS}

hades-exec: ../src/hades/hades.o ../src/hades/test.o ../src/wings/wings.o ../src/mica-herd/herd.o
	${LD} -o hades $^ ${LDFLAGS}

hades: hades-exec clean-o

# These targets never produce a file of the same name. The special target is
# spelled ".PHONY"; a plain "PHONY:" rule only declares an ordinary target.
.PHONY: all clean clean-o

# Remove object files and the built executables
clean:
	@rm -f ../src/hermes/*.o ../src/mica-herd/*.o ../src/wings/*.o \
	../src/CR/*.o ../src/hades/*.o ${APPS} hades

# Remove object files only
clean-o:
	@rm -f ../src/hermes/*.o ../src/mica-herd/*.o ../src/wings/*.o \
	../src/CR/*.o ../src/hades/*.o

================================================
FILE: exec/hosts.sh
================================================
#!/usr/bin/env bash
ALL_IPS=(
  ### TO BE FILLED: Please provide all cluster IPs
  # Node w/ first IP (i.e., "manager") must run script before the rest of the nodes
  # (instantiates a memcached to setup RDMA connections)
  # NOTE(review): line breaks were lost in this dump; the five addresses below
  # are assumed to be active entries following an empty comment line - confirm.
  #
  10.0.3.1
  10.0.3.2
  10.0.3.3
  10.0.3.4
  10.0.3.5
)

### TO BE FILLED: Modify to get the local IP of the node running the script (must be one of the cluster nodes)
LOCAL_IP=$(ip addr | grep 'state UP' -A2 | grep 'inet 10.0.3'| awk '{print $2}' | cut -f1 -d'/')
#LOCAL_IP="129.215.164.2"

### Fill the RDMA device name (the "hca_id" of the device when executing ibv_devinfo)
#NET_DEVICE_NAME="mlx5_0"
NET_DEVICE_NAME="mlx4_0"

##########################################
### NO NEED TO CHANGE BELOW THIS POINT ###
##########################################

# Every cluster address with the local one removed
REMOTE_IPS=${ALL_IPS[@]/$LOCAL_IP}
REMOTE_HOSTS=${ALL_IPS[@]/$LOCAL_IP}

# The node id is the index of the local IP inside ALL_IPS
NODE_ID=-1
for i in "${!ALL_IPS[@]}"; do
  if [ "${ALL_IPS[i]}" == "$LOCAL_IP" ]; then
    NODE_ID=$i
  fi
done

if [[ ${NODE_ID} == -1 ]]; then
  echo "Error Local IP: ${LOCAL_IP} n is not in ALL_IPS:"
  echo " {${ALL_IPS[@]}}"
  # Exit with a failure status: a bare "exit" returned the status of the echo
  # above (0), so scripts sourcing this file terminated reporting success.
  exit 1
fi
echo "Local node id:" ${NODE_ID}

================================================
FILE: exec/results/latency/.gitinclude
================================================

================================================
FILE: exec/results/xput/all-nodes/.gitkeep
================================================

================================================
FILE: exec/results/xput/per-node/.gitkeep
================================================

================================================
FILE: exec/run-hades.sh
================================================
#!/usr/bin/env bash
# Launches the hades executable on this node.

# run.sh provides NODE_ID, NET_DEVICE_NAME and the blue() helper
source run.sh

blue "Running hades"
sudo LD_LIBRARY_PATH=/usr/local/lib/ -E \
  ./hades \
  --machine-id ${NODE_ID} \
  --dev-name ${NET_DEVICE_NAME} \
  2>&1

================================================
FILE: exec/run-hermesKV.sh
================================================
#!/usr/bin/env bash
# Launches the hermesKV executable on this node with optional CLI overrides.

source run.sh

#### Get CLI arguments
# Use -1 for the default (#define in config.h) values if not argument is passed
CREDITS="-1"
NUM_WORKERS="-1"
WRITE_RATIO="-1"
MAX_COALESCE="-1"
MAX_BATCH_SIZE="-1"
RMW_RATIO="-1"
NUM_MACHINES="-1"
LAT_WORKER="-1"

# Each letter is an option argument, if it's followed by a
collum # it requires an argument. The first colum indicates the '\?' # help/error command when no arguments are given while getopts ":W:w:l:R:C:c:b:M:h" opt; do case $opt in W) NUM_WORKERS=$OPTARG # Number of threads: this must be smaller than MAX_WORKERS_PER_MACHINE of config.h ;; w) WRITE_RATIO=$OPTARG # given number is divided by 10 to give write rate % (i.e., 55 means 5.5 % writes) ;; R) RMW_RATIO=$OPTARG # percentage of writes to be rmws (i.e., -w 500 -R 500 means 25 % of RMWs and 25% of writes) # RMW is disabled by default (no usage through the artifact) can be enabled through config.h) ;; C) MAX_COALESCE=$OPTARG # maximum number of readily-available messages to be "batched" in a network packet # must be smaller than MTU and it is capped by MAX_REQ_COALESCE in config.h ;; c) CREDITS=$OPTARG # maximum number of credits per node per thread; credits correspond to messages and not packets # it is capped by MAX_CREDITS_PER_REMOTE_WORKER in config.h ;; b) MAX_BATCH_SIZE=$OPTARG # amount of requests and protocol messages that can be batched to the KVS # it is capped by MAX_BATCH_KVS_OPS_SIZE in config.h ;; M) NUM_MACHINES=$OPTARG # it is capped by MAX_MACHINE_NUM in config.h and the number of IPS as indicated in hosts.sh ;; l) LAT_WORKER=$OPTARG # An id of the worker who is measuring the latency # if -1 Latency is disabled # otherwise it is capped by running worker threads (NUM_WORKERS-1) ;; h) echo "Usage: -W <# workers> -w (x1000 --> 10 for 1%)" echo " -c <# credits> -b -C " echo " -M <# nodes> -l -R " exit 1 ;; \?) echo "Invalid option: -$OPTARG use -h to get info for arguments" >&2 exit 1 ;; :) echo "Option -$OPTARG requires an argument." 
>&2
      exit 1
      ;;
  esac
done

# Every value left at -1 makes the executable fall back to its config.h default
blue "Running hermes threads"
sudo LD_LIBRARY_PATH=/usr/local/lib/ -E \
  ./hermesKV \
  --machine-id ${NODE_ID} \
  --is-roce 0 \
  --dev-name ${NET_DEVICE_NAME} \
  --num-machines ${NUM_MACHINES} \
  --num-workers ${NUM_WORKERS} \
  --lat-worker ${LAT_WORKER} \
  --rmw-ratio ${RMW_RATIO} \
  --write-ratio ${WRITE_RATIO} \
  --credits ${CREDITS} \
  --max-coalesce ${MAX_COALESCE} \
  --max-batch-size ${MAX_BATCH_SIZE} \
  --hermes \
  2>&1

================================================
FILE: exec/run-rCRAQ.sh
================================================
#!/usr/bin/env bash
# Launches the rCRAQ executable on this node with optional CLI overrides.

source run.sh

#### Get CLI arguments
# Use -1 for the default (#define in config.h) values if not argument is passed
CREDITS="-1"
NUM_WORKERS="-1"
WRITE_RATIO="-1"
MAX_COALESCE="-1"
MAX_BATCH_SIZE="-1"
RMW_RATIO="-1"
NUM_MACHINES="-1"
LAT_WORKER="-1"

# Each letter is an option argument, if it's followed by a colon
# it requires an argument. The first colon indicates the '\?'
# help/error command when no arguments are given
# (this option string has no -R, so RMW_RATIO always keeps its -1 default here)
while getopts ":W:w:C:c:b:M:l:h" opt; do
  case $opt in
    W)
      NUM_WORKERS=$OPTARG
      ;;
    w)
      WRITE_RATIO=$OPTARG
      ;;
    C)
      MAX_COALESCE=$OPTARG
      ;;
    c)
      CREDITS=$OPTARG
      ;;
    b)
      MAX_BATCH_SIZE=$OPTARG
      ;;
    M)
      NUM_MACHINES=$OPTARG
      ;;
    l)
      LAT_WORKER=$OPTARG
      ;;
    h)
      echo "Usage: -W <# workers> -w (x1000 --> 10 for 1%)"
      echo " -c <# credits> -b -C "
      echo " -M <# nodes> -l "
      exit 1
      ;;
    \?)
      echo "Invalid option: -$OPTARG use -h to get info for arguments" >&2
      exit 1
      ;;
    :)
      # The stderr redirection of this echo follows on the next line of this
      # dump, hence the line continuation.
      echo "Option -$OPTARG requires an argument." \
>&2
      exit 1
      ;;
  esac
done

# Every value left at -1 makes the executable fall back to its config.h default
blue "Running hermes threads"
sudo LD_LIBRARY_PATH=/usr/local/lib/ -E \
  ./rCRAQ \
  --machine-id ${NODE_ID} \
  --is-roce 0 \
  --dev-name ${NET_DEVICE_NAME} \
  --num-machines ${NUM_MACHINES} \
  --num-workers ${NUM_WORKERS} \
  --lat-worker ${LAT_WORKER} \
  --rmw-ratio ${RMW_RATIO} \
  --write-ratio ${WRITE_RATIO} \
  --credits ${CREDITS} \
  --max-coalesce ${MAX_COALESCE} \
  --max-batch-size ${MAX_BATCH_SIZE} \
  2>&1

================================================
FILE: exec/run.sh
================================================
#!/usr/bin/env bash
# Common preparation sourced by the run-*.sh launchers: loads the host list,
# kills earlier instances, frees their SHM keys and starts the memcached registry.

source ./hosts.sh

export HRD_REGISTRY_IP="${ALL_IPS[0]}" # I.E. first IP node (HOUSTON) has a memcached server (used to initialize RDMA QPs)
export MLX5_SINGLE_THREADED=1
export MLX5_SCATTER_TO_CQE=1

# Stop anything left over from an earlier run
sudo killall memcached
sudo killall hades
sudo killall rCRAQ
sudo killall hermesKV

# A function to echo in blue color
function blue() {
  es=`tput setaf 4`
  ee=`tput sgr0`
  echo "${es}$1${ee}"
}

#### free the pages workers use
blue "Removing SHM keys used by HermesKV/rCRAQ"
# Removes the keys 3185..3213 and 4185..4213; errors for missing keys are discarded
for i in `seq 0 28`; do
  key=`expr 3185 + $i`
  sudo ipcrm -M $key 2>/dev/null
  key=`expr 4185 + $i`
  sudo ipcrm -M $key 2>/dev/null
done

# Abort with a message if the registry IP is unset or empty
: ${HRD_REGISTRY_IP:?"Need to set HRD_REGISTRY_IP non-empty"}

blue "Reset server QP registry"
# Start memcached in the background and give it a second to come up
memcached -l ${HRD_REGISTRY_IP} 1>/dev/null 2>/dev/null &
sleep 1

================================================
FILE: include/hades/hades.h
================================================
//
// Created by akatsarakis on 17/01/19.
//

#ifndef HADES_H
#define HADES_H

#include "../../include/wings/wings.h"
#include "../utils/bit_vector.h"
#include "../utils/time_rdtsc.h"

// Send heartbeats
// Recv heartbeats
// Change View
// Update local membership

// (Ostracism)
// arbitration --> a node provides an obolus

// all nodes are able to communicate w/ each other
// fd provides a view as a membership change
// only as long as it differs with the current view
// and agrees with a majority of other node views.
// The update granularity of local view works as a lease // to membership changes which prevents sequentially // consistent reads in the presence of network partitions // I.E. a node in a minority partition is able to detect // that cannot reach the majority of nodes and stops serving // local reads, maintaining linearizability (instead of sequential // consistency) For this // Epochs // Guarantees Nodes in the same EPOCH id have the same group view #define ENABLE_ARBITRATION 1 // Hades debug Tests #define FAKE_LINK_FAILURE 0 #define FAKE_LINK_FAILURE_AFTER_SEC 15 #define STOP_FAKE_LINK_FAILURE_AFTER_SEC 20 #define FAKE_ONE_WAY_LINK_FAILURE 0 #define FAKE_LINK_FAILURE_NODE_A 2 #define FAKE_LINK_FAILURE_NODE_B 1 static_assert(FAKE_LINK_FAILURE_NODE_A != FAKE_LINK_FAILURE_NODE_B, ""); typedef struct { uint8_t node_id : 8; uint8_t epoch_id : 8; uint8_t same_w_local_membership : 1; uint8_t have_ostracised_for_dst_node : 7; bit_vector_t view; } __attribute__((packed)) hades_view_t; static_assert(sizeof(hades_view_t) <= 4, "Currently send using a 4B header only field (RDMA immediate)"); typedef struct { hades_view_t last_local_view; hades_view_t intermediate_local_view; bit_vector_t curr_g_membership; uint8_t nodes_in_membership; uint8_t max_num_nodes; uint8_t* recved_views_flag; hades_view_t* remote_recved_views; // Polling uint16_t max_views_to_poll; hades_view_t* poll_buff; // used for polling remote views // Timing uint32_t send_view_every_us; uint32_t update_local_view_every_ms; struct timespec* ts_last_send; // issues views to remotes iff have not send a // view within the predefined timeout struct timespec ts_last_view_change; // update views and possible changes membership iff // pre-defined timeout is exceed // Ostracism uint8_t* have_ostracized_for; // an array storing info whether or not in a view // the sender ostracized someone for this node } hades_ctx_t; typedef struct { hades_ctx_t ctx; ud_channel_t* hviews_c; ud_channel_t* hviews_crd_c; } 
hades_wings_ctx_t;

void* hades_full_thread(void* node_id);
uint16_t poll_for_remote_views(hades_wings_ctx_t* hw_ctx);
void update_view_and_issue_hbs(hades_wings_ctx_t* hw_ctx);

// Initialises a Hades context for node `node_id`.
//
// The local view and the group membership start with only this node set,
// at epoch 0. All per-node arrays are allocated with `max_nodes` entries and
// the poll buffer with `max_views_to_poll` entries; the timestamps are set to
// the time of the call. Allocation results are used unchecked.
inline static void
hades_ctx_init(hades_ctx_t* ctx, uint8_t node_id, uint8_t max_nodes,
               uint16_t max_views_to_poll, uint32_t send_view_us,
               uint32_t update_local_view_ms)
{
  assert(max_views_to_poll > 0);

  // Local view: epoch 0 with only this node present
  hades_view_t* local_view = &ctx->intermediate_local_view;
  local_view->epoch_id = 0;
  local_view->node_id = node_id;
  bv_init(&local_view->view);
  bv_bit_set(&local_view->view, node_id);
  ctx->last_local_view = *local_view;

  // Group membership: initially just this node
  ctx->nodes_in_membership = 1;
  bv_init(&ctx->curr_g_membership);
  bv_bit_set(&ctx->curr_g_membership, node_id);

  // Bookkeeping of the views received from every node
  ctx->max_num_nodes = max_nodes;
  ctx->recved_views_flag = malloc(max_nodes * sizeof(*ctx->recved_views_flag));
  ctx->remote_recved_views =
      malloc(max_nodes * sizeof(*ctx->remote_recved_views));
  for (int n = 0; n < max_nodes; ++n) {
    ctx->recved_views_flag[n] = 0;
    bv_init(&ctx->remote_recved_views[n].view);
  }

  // Buffer used when polling for remote views
  ctx->max_views_to_poll = max_views_to_poll;
  ctx->poll_buff = malloc(max_views_to_poll * sizeof(*ctx->poll_buff));

  // Setup timers
  init_rdtsc(1, 0); /// WARNING: this is not thread safe!!
  get_rdtsc_timespec(&ctx->ts_last_view_change);
  ctx->ts_last_send = malloc(max_nodes * sizeof(*ctx->ts_last_send));
  for (int n = 0; n < max_nodes; ++n) {
    get_rdtsc_timespec(&ctx->ts_last_send[n]);
  }

  ctx->send_view_every_us = send_view_us;
  ctx->update_local_view_every_ms = update_local_view_ms;
  // Twice the view-update period (in us) must exceed the send period
  assert(2 * 1000 * update_local_view_ms > send_view_us);

  // Ostracism
  ctx->have_ostracized_for =
      malloc(max_nodes * sizeof(*ctx->have_ostracized_for));
  for (int n = 0; n < max_nodes; ++n) {
    ctx->have_ostracized_for[n] = 0;
  }
}

// WARNING: hades wings_ctx_init initializes only the first part of the
// required channels wings_setup_channel_qps_and_recvs must be called by
// the application afterwards to finish the initialization of wings.
// Initialises the Hades context inside `wctx`, stores the two channel pointers
// and sets up the views channel through wings_ud_channel_init().
// The channel name is "Hades" (with terminal colour codes) followed by
// `worker_lid`. The global `machine_id` is passed as the local node id.
inline static void
hades_wings_ctx_init(hades_wings_ctx_t* wctx, uint8_t node_id,
                     uint8_t max_nodes, uint16_t max_views_to_poll,
                     uint32_t send_view_us, uint32_t update_local_view_ms,
                     ud_channel_t* hviews_c, ud_channel_t* hviews_crd_c,
                     uint16_t worker_lid)
{
  hades_ctx_init(&wctx->ctx, node_id, max_nodes, max_views_to_poll,
                 send_view_us, update_local_view_ms);

  wctx->hviews_c = hviews_c;
  wctx->hviews_crd_c = hviews_crd_c;

  // Named flags for the positional arguments of wings_ud_channel_init()
  const uint8_t is_bcast = 0;
  const uint8_t stats_on = 1;
  const uint8_t prints_on = 1;
  const uint8_t is_hdr_only = 1;
  const uint8_t expl_crd_ctrl = 1;
  const uint8_t enable_inlining = 1;
  const uint8_t disable_crd_ctrl = 0;
  // Number of send periods that fit in two view-update periods.
  // NOTE(review): the cast truncates results above 255 - confirm callers keep
  // this ratio small.
  const uint8_t credits =
      (const uint8_t)(2 * update_local_view_ms * 1000 / send_view_us);

  char qp_name[200];
  sprintf(qp_name, "%s%d", "\033[1m\033[32mHades\033[0m", worker_lid);

  wings_ud_channel_init(
      wctx->hviews_c, qp_name, REQ, 1, sizeof(hades_view_t) - sizeof(uint8_t),
      0, enable_inlining, is_hdr_only, is_bcast, disable_crd_ctrl,
      expl_crd_ctrl, wctx->hviews_crd_c, credits, max_nodes,
      (uint8_t)machine_id, stats_on, prints_on);
}

// How does somebody joins?
// epoch id 0
// must see at least a majority of views with same epoch id > 0
// || majority of views with epoch id 0
#endif // HADES_H

================================================
FILE: include/hermes/config.h
================================================
//
// Created by akatsarakis on 15/03/18.
// #ifndef SPACETIME_CONFIG_H #define SPACETIME_CONFIG_H #include #include #include "sizes.h" // MAX_ defines are treated as DEFAULT_ as well (i.e., if not altered by CLI // args) /*------------------------------------------------- ------------ SETUP & DEFAULT SETTINGS ------------- --------------------------------------------------*/ #define MAX_MACHINE_NUM 5 // maximum nodes #define MAX_WORKERS_PER_MACHINE 15 // maximum number of threads per node #define DEFAULT_WORKERS_PER_MACHINE 2 #define DEFAULT_THREAD_OF_STAT_THREAD \ (15) // WARNING make sure this is not co-located with a worker thread // Number of sockets (numa nodes), cores and h/w threads per core on each node #define TOTAL_THREADS_PER_CORE 2 #define TOTAL_CORES_PER_SOCKET 10 #define TOTAL_NUMBER_OF_SOCKETS 2 /*------------------------------------------------- ------------------------------------------------- ------------------------------------------------- -------- No need to change beyond this point ---- ------------------------------------------------- ------------------------------------------------- --------------------------------------------------*/ // Default workload writes / updates accesses (the rest are reads) #define DEFAULT_UPDATE_RATIO 1000 // is divided by 10 (i.e., 25 --> 2.5 %) // both writes and RMWs (RMW_RATIO inderectly provides WRITE_RATIO) #define ENABLE_RMWs \ 0 // if RMWs is not enabled then all UPDATE_RATIO == WRITE_RATIO #define DEFAULT_RMW_RATIO 0 // is divided by 10 (i.e., 25 --> 2.5 %) // percentage of UPDATE_RATIO to be RMWs // Max operations per-thread to batches to the KVS (either received packets or // read/write/RMW requests) #define MAX_BATCH_KVS_OPS_SIZE 250 static_assert(MAX_WORKERS_PER_MACHINE <= 254, ""); static_assert(MAX_WORKERS_PER_MACHINE <= TOTAL_NUMBER_OF_SOCKETS * TOTAL_THREADS_PER_CORE * TOTAL_CORES_PER_SOCKET, ""); static_assert(DEFAULT_UPDATE_RATIO <= 1000 && DEFAULT_RMW_RATIO >= 0, ""); /*------------------------------------------------- 
----------------- RDMA SETTINGS -------------------
--------------------------------------------------*/
// Request coalescing (max --readily available-- messages to batch in a single
// RDMA packet)
#define MAX_REQ_COALESCE 15
// Flow control
#define MAX_CREDITS_PER_REMOTE_WORKER (MAX_REQ_COALESCE)
// Request inlining
#define DISABLE_INLINING 0

/*-------------------------------------------------
----------------- SECONDARY SETTINGS --------------
--------------------------------------------------*/
// LATENCY
#define DEFAULT_MEASURE_LATENCY 0
#define DEFAULT_WORKER_MEASURING_LATENCY 0
#define MAX_LATENCY 1000 // in us
#define LATENCY_BUCKETS 1000
// With the values above this evaluates to 1, i.e. one bucket per microsecond
#define LATENCY_PRECISION \
  (MAX_LATENCY / LATENCY_BUCKETS) // latency granularity in us

// FAIRNESS
#define ENABLE_VIRTUAL_NODE_IDS 0 // 0
#define VIRTUAL_NODE_IDS_PER_NODE 20

// SKEW
#define ENABLE_COALESCE_OF_HOT_REQS \
  0 // 0 //WARNING!!! this must be disabled for cr
#define COALESCE_N_HOTTEST_KEYS 100
#define ENABLE_READ_COMPLETE_AFTER_VAL_RECV_OF_HOT_REQS 0 // 1
#define ENABLE_WRITE_COALESCE_TO_THE_SAME_KEY_IN_SAME_NODE 0

// DEBUG
#define ENABLE_ASSERTIONS 0
#define DISABLE_VALS_FOR_DEBUGGING 0
#define KEY_NUM 0 // use 0 to disable

// REQUESTS
#define FEED_FROM_TRACE 0
#define ZIPF_EXPONENT_OF_TRACE \
  99 // if FEED_FROM_TRACE == 1 | this is divided by 100 (e.g.
// use 99 for a = 0.99)
#define NUM_OF_REP_REQS K_256 // if FEED_FROM_TRACE == 0
#define USE_A_SINGLE_KEY 0 // if FEED_FROM_TRACE == 0
#define ST_KEY_ID_255_OR_HIGHER 255

/*-------------------------------------------------
---------------- Debug and others -----------------
--------------------------------------------------*/
// DBG Prints
/// Warning some prints assume that there are no faults (multiplications with
/// REMOTE_MACHINES)
#define MAX_THREADS_TO_PRINT 1
#define ENABLE_REQ_PRINTS 0
#define ENABLE_BATCH_OP_PRINTS 0
#define ENABLE_INV_PRINTS 0
#define ENABLE_ACK_PRINTS 0
#define ENABLE_VAL_PRINTS 0

// Stats prints
#define PRINT_STATS_EVERY_MSECS 4000 // 5000 //10000 //10
#define PRINT_WORKER_STATS 0

// Stats
#define EXIT_ON_STATS_PRINT 1
#define PRINT_NUM_STATS_BEFORE_EXITING 5
#define DUMP_XPUT_STATS_TO_FILE 1

// FAILURE DETECTION (RM)
#define ENABLE_HADES_FAILURE_DETECTION 0
#define WORKER_WITH_FAILURE_DETECTOR 0
// Compilation fails on purpose if failure detection is switched on
static_assert(ENABLE_HADES_FAILURE_DETECTION == 0,
              "WARNING HADES is currently not working");

// FAKE NODE FAILURE
#define FAKE_FAILURE 0
#define NODE_TO_FAIL 2
#define ROUNDS_BEFORE_FAILURE 2

// Rarely (or never) change
#define BASE_SHM_KEY 24
#define WORKER_SL 0 // service level for the workers

#define MAX_REMOTE_MACHINES (MAX_MACHINE_NUM - 1)
// Integer division of x by y, rounded up
#define HERMES_CEILING(x, y) (((x) + (y)-1) / (y))
// Number of bytes needed to hold one bit per machine
#define GROUP_MEMBERSHIP_ARRAY_SIZE \
  HERMES_CEILING(MAX_MACHINE_NUM, 8) // assuming uint8_t

#define TOTAL_HW_CORES \
  (TOTAL_THREADS_PER_CORE * TOTAL_CORES_PER_SOCKET * TOTAL_NUMBER_OF_SOCKETS)
static_assert(MAX_WORKERS_PER_MACHINE < TOTAL_HW_CORES - 1,
              "Leave at least a hw thread free for OS etc..");

#define KV_SOCKET 0 // socket to allocate KVS (huge-)pages
#define USE_ALL_SOCKETS 1
#define ENABLE_HYPERTHREADING 1
#define SOCKET_TO_START_SPAWNING_THREADS 0

// Debug
//#define SPACETIME DEBUG 2
#ifndef SPACETIME_DEBUG
#define SPACETIME_DEBUG 0
#endif

////////////////////////////////
/// Hermes NOT TUNABLE
////////////////////////////////
/*-------------------------------------------------
----------------- MAX HERMES OPS SIZE -------------
--------------------------------------------------*/
// Upper bound on the operations one receive batch can carry:
// credits per remote worker x remote machines x coalesced requests per packet
#define MAX_MSG_RECV_OPS_SIZE \
  (MAX_CREDITS_PER_REMOTE_WORKER * MAX_REMOTE_MACHINES * MAX_REQ_COALESCE)
#define HERMES_MAX_BATCH_SIZE MAX(MAX_MSG_RECV_OPS_SIZE, MAX_BATCH_KVS_OPS_SIZE)

/*-------------------------------------------------
---------------- QPs Numbers ----------------------
--------------------------------------------------*/
// Ids of the UD queue pairs used by a Hermes worker; the last enumerator
// doubles as their count.
typedef enum {
  INV_UD_QP_ID = 0,
  ACK_UD_QP_ID,
  VAL_UD_QP_ID,
  CRD_UD_QP_ID,
  END_HERMES_QPS_ENUM
} hermes_qps_enum;

// QPs
#define TOTAL_WORKER_UD_QPs END_HERMES_QPS_ENUM
// Two more queue pairs are counted when Hades failure detection is enabled
#define TOTAL_WORKER_N_FAILURE_DETECTION_UD_QPs \
  (TOTAL_WORKER_UD_QPs + (ENABLE_HADES_FAILURE_DETECTION ? 2 : 0))

/*-------------------------------------------------
----------------- CR CONFIGURATION ----------------
--------------------------------------------------*/
#define CR_ENABLE_REMOTE_READS 0
#define CR_REMOTE_READS_CREDITS 20
#define MAX_CREDITS_PER_REMOTE_WORKER_CR 250 //(MAX_BATCH_KVS_OPS_SIZE)
// CR
#define CR_ACK_CREDITS (255) // //(MAX_MACHINE_NUM * 255)
#define CR_ENABLE_EARLY_INV_CRDS \
  1 // optimization to increase request pipelining

// Ids of the UD queue pairs used by a CR worker.
typedef enum {
  CR_INV_UD_QP_ID = 0,
// Tested with #if rather than #ifdef: the macro is always defined (as 0 or 1),
// so #ifdef kept this enumerator even with the option set to 0, while
// CR_TOTAL_WORKER_UD_QPs below already uses the macro's value.
#if CR_ENABLE_EARLY_INV_CRDS
  CR_INV_CRD_UD_QP_ID,
#endif
  CR_ACK_UD_QP_ID,
  CR_REMOTE_WRITES_UD_QP_ID,
  CR_REMOTE_WRITE_CRD_UD_QP_ID,
  CR_REMOTE_READS_UD_QP_ID,
  CR_REMOTE_READS_RESP_UD_QP_ID
} cr_qps_enum;

// The definition below is completed on the next line of this dump, hence the
// trailing line continuation.
#define CR_TOTAL_WORKER_UD_QPs \
  (TOTAL_WORKER_UD_QPs + (CR_ENABLE_REMOTE_READS ? 2 : 0) + \
   (CR_ENABLE_EARLY_INV_CRDS ? \
1 : 0))

// Max CR batch op size
#define MAX_MSG_RECV_OPS_SIZE_CR \
  (MAX_REQ_COALESCE * MAX_CREDITS_PER_REMOTE_WORKER_CR * MAX_REMOTE_MACHINES)
#define CR_MAX_BATCH_SIZE MAX(MAX_MSG_RECV_OPS_SIZE_CR, MAX_BATCH_KVS_OPS_SIZE)

// CR DEBUG
#define CR_ENABLE_ONLY_HEAD_REQS 0
#define CR_ENABLE_ALL_NODES_GETS_EXCEPT_HEAD 0
#define CR_ENABLE_BLOCKING_INVALID_WRITES_ON_HEAD 0

/*-------------------------------------------------
----------------- Global Vars ---------------------
--------------------------------------------------*/
struct thread_params {
  int id;
};

// Latency histograms for reads and writes. Each array has LATENCY_BUCKETS + 1
// entries; bookkeep_latency() uses the extra last entry for measurements above
// MAX_LATENCY.
struct latency_counters {
  uint32_t read_reqs[LATENCY_BUCKETS + 1];
  uint32_t write_reqs[LATENCY_BUCKETS + 1];
  int max_read_latency;
  int max_write_latency;
  long long total_measurements;
};

extern struct latency_counters latency_count;

// global config (CLI) configurable vars
extern uint8_t is_CR;
extern int update_ratio;
extern int rmw_ratio;
extern int num_workers;
extern int credits_num;
extern int max_coalesce;
extern int max_batch_size; // for batches to KVS
extern int machine_num; // must be smaller or equal to MAX_MACHINE_NUM
extern int remote_machine_num; // must be smaller or equal to MAX_MACHINE_NUM
extern int worker_measuring_latency;
// extern int value_size; // must be smaller or equal to MAX_MACHINE_NUM

#endif // SPACETIME_CONFIG_H

================================================
FILE: include/hermes/inline-util.h
================================================
//
// Created by akatsarakis on 23/05/18.
// #ifndef HERMES_INLINE_UTIL_H #define HERMES_INLINE_UTIL_H #include #include "../hades/hades.h" #include "../utils/concur_ctrl.h" #include "config.h" #include "spacetime.h" #include "util.h" /* --------------------------------------------------------------------------- ----------------------------------- MEMBERSHIP ------------------------------- ---------------------------------------------------------------------------*/ static inline uint8_t node_is_in_membership(spacetime_group_membership last_group_membership, int node_id) { return (uint8_t)(bv_bit_get(last_group_membership.g_membership, (uint8_t)node_id) == 1 ? 1 : 0); } static inline void group_membership_update(hades_ctx_t hades_ctx) { seqlock_lock(&group_membership.lock); bv_copy((bit_vector_t*)&group_membership.g_membership, hades_ctx.curr_g_membership); bv_copy((bit_vector_t*)&group_membership.w_ack_init, group_membership.g_membership); bv_reverse((bit_vector_t*)&group_membership.w_ack_init); bv_bit_set((bit_vector_t*)&group_membership.w_ack_init, (uint8_t)machine_id); group_membership.num_of_alive_remotes = bv_no_setted_bits(group_membership.g_membership); seqlock_unlock(&group_membership.lock); if (group_membership.num_of_alive_remotes < (machine_num / 2)) { colored_printf(RED, "Majority is down!\n"); exit(-1); } } static inline uint8_t group_membership_has_changed(spacetime_group_membership* last_group_membership, uint16_t worker_lid) { uint32_t debug_lock_free_membership_read_cntr = 0; spacetime_group_membership lock_free_read_group_membership; do { // Lock free read of group membership if (ENABLE_ASSERTIONS) { debug_lock_free_membership_read_cntr++; if (debug_lock_free_membership_read_cntr == M_4) { printf("Worker %u stuck on a lock-free read (for group membership)\n", worker_lid); debug_lock_free_membership_read_cntr = 0; } } lock_free_read_group_membership = *((spacetime_group_membership*)&group_membership); } while (!(seqlock_version_is_same_and_valid( &group_membership.lock, 
&lock_free_read_group_membership.lock))); for (int i = 0; i < GROUP_MEMBERSHIP_ARRAY_SIZE; i++) if (!bv_are_equal(lock_free_read_group_membership.g_membership, last_group_membership->g_membership)) { *last_group_membership = lock_free_read_group_membership; return 1; } return 0; } /* --------------------------------------------------------------------------- ----------------------------------- LATENCY ------------------------------- ---------------------------------------------------------------------------*/ // Add latency to histogram (in microseconds) static inline void bookkeep_latency(int useconds, uint8_t op) { uint32_t* latency_array; int* max_latency_ptr; switch (op) { case ST_OP_PUT: latency_array = latency_count.write_reqs; max_latency_ptr = &latency_count.max_write_latency; break; case ST_OP_GET: latency_array = latency_count.read_reqs; max_latency_ptr = &latency_count.max_read_latency; break; default: assert(0); } latency_count.total_measurements++; if (useconds > MAX_LATENCY) latency_array[LATENCY_BUCKETS]++; else latency_array[useconds / LATENCY_PRECISION]++; if (*max_latency_ptr < useconds) *max_latency_ptr = useconds; } // Necessary bookkeeping to initiate the latency measurement static inline void start_latency_measurement(struct timespec* start) { clock_gettime(CLOCK_MONOTONIC, start); } static inline void stop_latency_measurment(uint8_t req_opcode, struct timespec* start) { struct timespec end; clock_gettime(CLOCK_MONOTONIC, &end); int useconds = (int)(((end.tv_sec - start->tv_sec) * 1000000) + ((end.tv_nsec - start->tv_nsec) / 1000)); if (ENABLE_ASSERTIONS) assert(useconds >= 0); // printf("Latency of %s %u us\n", code_to_str(req_opcode), useconds); bookkeep_latency(useconds, req_opcode); } static inline void stop_latency_of_completed_writes(spacetime_op_t* ops, uint16_t worker_lid, struct timespec* stopwatch) { if (machine_id == 0 && worker_lid == worker_measuring_latency) if (ops[0].op_meta.opcode == ST_OP_PUT && (ops[0].op_meta.state == 
ST_MISS || ops[0].op_meta.state == ST_PUT_COMPLETE)) stop_latency_measurment(ops[0].op_meta.opcode, stopwatch); } static inline void stop_latency_of_completed_reads(spacetime_op_t* ops, uint16_t worker_lid, struct timespec* stopwatch) { if (machine_id == 0 && worker_lid == worker_measuring_latency) if (ops[0].op_meta.opcode == ST_OP_GET && (ops[0].op_meta.state == ST_MISS || ops[0].op_meta.state == ST_GET_COMPLETE)) stop_latency_measurment(ops[0].op_meta.opcode, stopwatch); } /* --------------------------------------------------------------------------- ---------------------------------- Refill Requests --------------------------- ---------------------------------------------------------------------------*/ static inline int refill_ops(uint32_t* trace_iter, uint16_t worker_lid, struct spacetime_trace_command* trace, spacetime_op_t* ops, uint32_t* refilled_per_ops_debug_cnt, struct timespec* start, spacetime_op_t** n_hottest_keys_in_ops_get, spacetime_op_t** n_hottest_keys_in_ops_put) { static uint8_t first_iter_has_passed[MAX_WORKERS_PER_MACHINE] = {0}; int refilled_ops = 0, node_suspected = -1; for (int i = 0; i < max_batch_size; i++) { if (ENABLE_ASSERTIONS && first_iter_has_passed[worker_lid] == 1) { assert(ops[i].op_meta.opcode == ST_OP_PUT || ops[i].op_meta.opcode == ST_OP_GET || (is_CR == 0 && ops[i].op_meta.opcode == ST_OP_RMW)); assert(ops[i].op_meta.state == ST_PUT_COMPLETE || ops[i].op_meta.state == ST_GET_COMPLETE || ops[i].op_meta.state == ST_PUT_SUCCESS || ops[i].op_meta.state == ST_REPLAY_SUCCESS || ops[i].op_meta.state == ST_NEW || ops[i].op_meta.state == ST_MISS || ops[i].op_meta.state == ST_PUT_STALL || ops[i].op_meta.state == ST_REPLAY_COMPLETE || ops[i].op_meta.state == ST_IN_PROGRESS_PUT || // ops[i].op_meta.state == ST_RMW_STALL || ops[i].op_meta.state == ST_RMW_ABORT || ops[i].op_meta.state == ST_RMW_SUCCESS || ops[i].op_meta.state == ST_RMW_COMPLETE || ops[i].op_meta.state == ST_IN_PROGRESS_RMW || // ops[i].op_meta.state == // 
ST_IN_PROGRESS_PUT //|| ops[i].op_meta.state == ST_IN_PROGRESS_GET || ops[i].op_meta.state == ST_IN_PROGRESS_REPLAY || ops[i].op_meta.state == ST_OP_MEMBERSHIP_CHANGE || /// TODO check this ops[i].op_meta.state == ST_OP_MEMBERSHIP_COMPLETE || /// TODO check this ops[i].op_meta.state == ST_PUT_COMPLETE_SEND_VALS || ops[i].op_meta.state == ST_GET_STALL); } if (first_iter_has_passed[worker_lid] == 0 || ops[i].op_meta.state == ST_MISS || ops[i].op_meta.state == ST_PUT_COMPLETE || ops[i].op_meta.state == ST_RMW_ABORT || ops[i].op_meta.state == ST_RMW_COMPLETE || ops[i].op_meta.state == ST_OP_MEMBERSHIP_COMPLETE || ops[i].op_meta.state == ST_GET_COMPLETE) { if (first_iter_has_passed[worker_lid] != 0) { if (ENABLE_REQ_PRINTS && worker_lid < MAX_THREADS_TO_PRINT) colored_printf( GREEN, "W%d--> Key Hash:%" PRIu64 "\n\t\tType: %s, version %d, tie-b: %d, value(len-%d): %c\n", worker_lid, ((uint64_t*)&ops[i].op_meta.key)[0], code_to_str(ops[i].op_meta.state), ops[i].op_meta.ts.version, ops[i].op_meta.ts.tie_breaker_id, ops[i].op_meta.val_len, ops[i].value[0]); /// Stats if (ops[i].op_meta.state != ST_MISS) { if (ops[i].op_meta.state != ST_RMW_ABORT) w_stats[worker_lid].completed_ops_per_worker += ENABLE_COALESCE_OF_HOT_REQS ? 
ops[i].no_coales : 1; } else w_stats[worker_lid].reqs_missed_in_kvs++; if (ops[i].op_meta.state == ST_PUT_COMPLETE) w_stats[worker_lid].completed_wrs_per_worker++; else if (ops[i].op_meta.state == ST_RMW_COMPLETE) w_stats[worker_lid].completed_rmws_per_worker++; else if (ops[i].op_meta.state == ST_RMW_ABORT) w_stats[worker_lid].aborted_rmws_per_worker++; // reset op bucket ops[i].no_coales = 1; ops[i].op_meta.state = ST_EMPTY; ops[i].op_meta.opcode = ST_EMPTY; refilled_per_ops_debug_cnt[i] = 0; refilled_ops++; } if (ENABLE_ASSERTIONS) assert(trace[*trace_iter].opcode == ST_OP_PUT || trace[*trace_iter].opcode == ST_OP_RMW || trace[*trace_iter].opcode == ST_OP_GET); if (machine_id == 0 && worker_lid == worker_measuring_latency && i == 0) start_latency_measurement(start); /// INSERT new req(s) to ops uint8_t key_id; if (ENABLE_COALESCE_OF_HOT_REQS && trace[*trace_iter].opcode != ST_OP_RMW) { // see if you could coalesce any requests spacetime_op_t** n_hottest_keys_in_ops; do { key_id = trace[*trace_iter].key_id; n_hottest_keys_in_ops = trace[*trace_iter].opcode == ST_OP_GET ? n_hottest_keys_in_ops_get : n_hottest_keys_in_ops_put; // if we can coalesce (a hot) req if (key_id < COALESCE_N_HOTTEST_KEYS && // is a hot key n_hottest_keys_in_ops[key_id] != NULL && // exists in the ops array n_hottest_keys_in_ops[key_id]->op_meta.opcode == trace[*trace_iter] .opcode) // has the same code with the last inserted { n_hottest_keys_in_ops[key_id]->no_coales++; *trace_iter = trace[*trace_iter + 1].opcode != NOP ? *trace_iter + 1 : 0; } else break; } while (1); if (key_id < COALESCE_N_HOTTEST_KEYS) n_hottest_keys_in_ops[key_id] = &ops[i]; } ops[i].op_meta.state = ST_NEW; ops[i].op_meta.opcode = (uint8_t)(CR_ENABLE_ALL_NODES_GETS_EXCEPT_HEAD && machine_id != 0 ? 
ST_OP_GET : trace[*trace_iter].opcode); memcpy(&ops[i].op_meta.key, &trace[*trace_iter].key_hash, sizeof(spacetime_key_t)); if (ops[i].op_meta.opcode == ST_OP_PUT || ops[i].op_meta.opcode == ST_OP_RMW) memset(ops[i].value, ((uint8_t)'a' + machine_id), ST_VALUE_SIZE); else if (ENABLE_READ_COMPLETE_AFTER_VAL_RECV_OF_HOT_REQS) { // if its a read reset the timestamp ops[i].op_meta.ts.version = 0; ops[i].op_meta.ts.tie_breaker_id = 0; } ops[i].RMW_flag = ops[i].op_meta.opcode == ST_OP_RMW ? 1 : 0; ops[i].op_meta.val_len = (uint8)(ops[i].op_meta.opcode == ST_OP_GET ? 0 : ST_VALUE_SIZE >> SHIFT_BITS); // instead of MOD add *trace_iter = trace[*trace_iter + 1].opcode != NOP ? *trace_iter + 1 : 0; if (ENABLE_REQ_PRINTS && worker_lid < MAX_THREADS_TO_PRINT) colored_printf(RED, "W%d--> Op: %s, hash(1st 8B):%" PRIu64 "\n", worker_lid, code_to_str(ops[i].op_meta.opcode), ((uint64_t*)&ops[i].op_meta.key)[0]); } else refilled_per_ops_debug_cnt[i]++; } if (refilled_ops == 0) w_stats[worker_lid].wasted_loops++; if (first_iter_has_passed[worker_lid] == 0) first_iter_has_passed[worker_lid] = 1; if (ENABLE_ASSERTIONS) for (int i = 0; i < max_batch_size; i++) assert(ops[i].op_meta.opcode == ST_OP_PUT || ops[i].op_meta.opcode == ST_OP_GET || (ops[i].op_meta.opcode == ST_OP_RMW && is_CR == 0)); return node_suspected; } #endif // HERMES_INLINE_UTIL_H ================================================ FILE: include/hermes/spacetime.h ================================================ // // Created by akatsarakis on 04/05/18. 
//
#ifndef HERMES_SPACETIME_H
#define HERMES_SPACETIME_H

// Optik Options
#ifndef CORE_NUM
#define DEFAULT
#define CORE_NUM 8
#endif

#include "../utils/bit_vector.h"
#include "../utils/concur_ctrl.h"
#include "config.h"
#include "hrd.h"
#include "mica.h"

// Sizing constants for the KVS (number of keys, index buckets, log capacity).
#define SPACETIME_NUM_KEYS (1000 * 1000)
#define SPACETIME_NUM_BKTS (2 * 1024 * 1024)
#define SPACETIME_LOG_CAP (1024 * 1024 * 1024)

//#define SPACETIME_NUM_KEYS (60 * 1000 * 1000)
//#define SPACETIME_NUM_BKTS (64 * 1024 * 1024)
//#define SPACETIME_LOG_CAP (4 * ((unsigned long long) M_1024)) //(1024 * 1024
//* 1024)

// Bytes left for the user value once the per-object metadata is subtracted
// from the KVS value size.
#define ST_VALUE_SIZE (KVS_VALUE_SIZE - sizeof(spacetime_object_meta))

// Special EMPTY opcodes
#define NOP 150 // trace
#define LAST_WRITER_ID_EMPTY 127 // 255
#define ST_OP_BUFFER_INDEX_EMPTY 255

/////////////////////////////////////////////
//// ENUMS
/////////////////////////////////////////////
/// WARNING the monotonically increasing assigned numbers to States are used for
/// comparisons (do not reorder / change numbers)
// States
typedef enum {
  VALID_STATE = 1,
  INVALID_STATE,
  INVALID_WRITE_STATE,
  WRITE_STATE,
  REPLAY_STATE,
} __attribute__((packed)) hermes_states_t;

// Input Opcodes
typedef enum {
  ST_OP_GET = 111,
  ST_OP_PUT,
  ST_OP_RMW,
  ST_OP_INV,
  ST_OP_ACK,
  ST_OP_VAL,
  ST_OP_CRD,
  ST_OP_MEMBERSHIP_CHANGE,
  ST_OP_MEMBERSHIP_COMPLETE // 119
} __attribute__((packed)) input_opcodes_t;

// Response Opcodes
typedef enum {
  ST_GET_COMPLETE = 121,
  ST_PUT_SUCCESS,               // broadcast invs
  ST_REPLAY_SUCCESS,            // broadcast invs
  ST_INV_SUCCESS,               // send ack
  ST_ACK_SUCCESS,
  ST_LAST_ACK_SUCCESS,          // complete local write
  ST_LAST_ACK_NO_BCAST_SUCCESS, // complete local write
  ST_PUT_COMPLETE,              // broadcast invs
  ST_VAL_SUCCESS,               // 129
  ST_MISS,                      // 130
  ST_GET_STALL,
  ST_PUT_STALL,
  ST_PUT_COMPLETE_SEND_VALS,
  ST_SEND_CRD, // 134
  // RMW opcodes
  ST_RMW_SUCCESS, // 135
  ST_RMW_STALL,
  ST_RMW_COMPLETE,
  ST_RMW_ABORT,
  ST_OP_INV_ABORT, // 139 //send inv instead of ACK
} __attribute__((packed)) response_opcodes_t;

// ops bucket states
typedef enum {
  ST_EMPTY = 140,
  ST_NEW,
  ST_COMPLETE,
  ST_IN_PROGRESS_PUT,
  ST_IN_PROGRESS_REPLAY,
  ST_REPLAY_COMPLETE,
  ST_IN_PROGRESS_GET, // Used only in Chain Replication
  ST_REPLAY_COMPLETE_SEND_VALS,
  ST_IN_PROGRESS_RMW,
  ST_RMW_COMPLETE_SEND_VALS // 149
} __attribute__((packed)) op_bucket_states_t;

// failure detection (deprecated)
typedef enum {
  ST_OP_HEARTBEAT = 151, // WARNING: 150 opcode is used (see NOP define)!!
  ST_OP_SUSPICION,
  ST_INV_OUT_OF_GROUP
} __attribute__((packed)) fs_ops_t;

// receive_buff_types
typedef enum {
  ST_INV_BUFF = 161,
  ST_ACK_BUFF,
  ST_VAL_BUFF,
  ST_CRD_BUFF
} __attribute__((packed)) rcv_buff_types_t;

/////////////////////////////////////////////
//// Hermes(msg and KV -- spacetime) structs
/////////////////////////////////////////////
// Fixed-size 8 (or 16) byte keys
typedef struct {
  // uint64 __unused; // This should be 8B ////// Uncomment this for
  // fixed-size 16 byte keys instead
  uint64_t bkt : 48;
  unsigned int tag : 16;
} spacetime_key_t;

// Per-object metadata; its size is subtracted from KVS_VALUE_SIZE to obtain
// ST_VALUE_SIZE.
typedef volatile struct {
  hermes_states_t state;
  bit_vector_t ack_bv;
  uint8_t RMW_flag : 1;
  uint8_t last_writer_id : 7;
  uint8_t op_buffer_index; // TODO change to uint16_t for a buffer >= 256
  conc_ctrl_t cctrl;
  timestamp_t last_local_write_ts;
} spacetime_object_meta;

// Common header of ops and protocol messages; the same layout is also used
// as the ack and val message types.
typedef struct {
  spacetime_key_t key; /* This must be the 1st field and 8B or 16B aligned */
  uint8_t opcode;      // both recv / resp //TODO create a union
  union {
    uint8_t state;     // HERMES: used by spacetime_op_t
    uint8_t sender;    // HERMES: used by spacetime_inv/ack/val_t
    uint8_t initiator; // CR: used by spacetime_inv/ack
  };
  union {
    uint8_t val_len;  // HERMES: unused for spacetime_ack_t and spacetime_val_t
                      // (align for using a single memcpy)
    uint8_t buff_idx; // CR: used for spacetime_ack_t buffer index of
                      // write initiated this req
  };
  timestamp_t ts;
} spacetime_op_meta_t, spacetime_ack_t, spacetime_val_t;

// An op (or inv message): header, a protocol-specific 2-byte union and the
// value bytes.
typedef struct {
  spacetime_op_meta_t op_meta; // op_t/inv_t: uses the state/sender part of the
                               // op_meta union (not sender/state)
  union {
    struct {                   // Hermes struct
      uint8_t RMW_flag : 1;    // 1 indicates RMWs while 0 normal writes
      uint16_t no_coales : 15; // used only for skew optimizations
    };
    struct {             // CR struct
      uint8_t buff_idx;  // for spacetime_inv_t buffer index of write
                         // initiated this req
      uint8_t initiator; // for spacetime_inv_t buffer index of write
                         // initiated this req
    };
  };
  uint8_t value[ST_VALUE_SIZE];
} spacetime_op_t, spacetime_inv_t;

// Group membership state; `lock` is a seqlock_t stored alongside the data.
typedef struct {
  volatile uint8_t num_of_alive_remotes;
  volatile bit_vector_t g_membership;
  volatile bit_vector_t w_ack_init;
  seqlock_t lock;
} spacetime_group_membership;

struct spacetime_kv { // TODO may add kvs stats
  struct mica_kv hash_table;
};

// One trace entry: key hash, opcode and a small key id.
struct spacetime_trace_command {
  spacetime_key_t key_hash;
  uint8_t opcode;
  uint8_t key_id; // stores key ids 0-254 otherwise it is set to 255 to
                  // indicate other key ids
};

void spacetime_init(int spacetime_id);
void spacetime_populate_fixed_len(struct spacetime_kv* kv, int n, int val_len);

///////////////////////////////////////
//////////////////// Hermes
///////////////////////////////////////
// Kind of batch passed to hermes_batch_ops_to_KVS.
enum hermes_batch_type_t {
  local_ops,
  local_ops_after_membership_change,
  invs,
  acks,
  vals
};

void hermes_batch_ops_to_KVS(enum hermes_batch_type_t type,
                             uint8_t* op_array,
                             int op_num,
                             uint16_t sizeof_op_elem,
                             spacetime_group_membership curr_membership,
                             int* node_suspected,
                             spacetime_op_t* read_write_ops,
                             uint8_t thread_id);

///////////////////////////////////////
//////////////////// CR(AQ)
///////////////////////////////////////
// Kind of batch passed to cr_batch_ops_to_KVS.
enum cr_type_t {
  Local_ops,     // All nodes
  Remote_writes, // Head
  Remote_reads,  // Tail
  Invs,          // All except Head
  Acks           // All except Tail
};

void cr_batch_ops_to_KVS(enum cr_type_t cr_type,
                         uint8_t* op_array,
                         int op_num,
                         uint16_t sizeof_op_elem,
                         spacetime_op_t* read_write_op);

///////////////////////////////////////
//////////////////// Helpers
///////////////////////////////////////
// Masks the gathered acks with the current membership and reports whether the
// result equals the membership, i.e. whether every current member has acked.
// gathered_acks is passed by value, so the caller's vector is not modified.
static inline uint8_t
is_last_ack(bit_vector_t gathered_acks,
            spacetime_group_membership curr_g_membership)
{
  bv_and(&gathered_acks, curr_g_membership.g_membership);
  return bv_are_equal(gathered_acks, curr_g_membership.g_membership);
}

// TODO: adapt and use the following functions to re-enable variable length
// object support
// Returns the shifted val_len of a mica_op minus the op-meta header size.
static inline uint8_t
get_val_len(struct mica_op* op_t)
{
  return (op_t->val_len >> SHIFT_BITS) - sizeof(spacetime_op_meta_t);
}

// Returns the shifted val_len plus the op-meta header size.
// NOTE(review): despite its name this only computes a value; it does not
// store anything into *op_t.
static inline uint8_t
set_val_len(spacetime_op_meta_t* op_t)
{
  return (op_t->val_len >> SHIFT_BITS) + sizeof(spacetime_op_meta_t);
}

extern struct spacetime_kv kv;
extern spacetime_group_membership group_membership;

#endif // HERMES_SPACETIME_H

================================================
FILE: include/hermes/util.h
================================================
//
// Created by akatsarakis on 15/03/18.
//
#ifndef HERMES_UTIL_H
#define HERMES_UTIL_H

// NOTE(review): the <...> header names of the three bare #include lines below
// were lost in this dump; restore them from the repository.
#include
#include
#include
#include "config.h"
#include "hrd.h"
#include "spacetime.h"

// Raw per-worker counters.
struct worker_stats {
  long long completed_ops_per_worker;
  long long completed_wrs_per_worker;
  long long completed_rmws_per_worker;
  long long aborted_rmws_per_worker;
  long long reqs_missed_in_kvs;

  long long issued_invs_per_worker;
  long long issued_acks_per_worker;
  long long issued_vals_per_worker;
  long long issued_crds_per_worker;

  long long issued_packet_invs_per_worker;
  long long issued_packet_acks_per_worker;
  long long issued_packet_vals_per_worker;
  long long issued_packet_crds_per_worker;

  long long inv_ss_completions_per_worker;
  long long ack_ss_completions_per_worker;
  long long val_ss_completions_per_worker;
  long long crd_ss_completions_per_worker;

  long long received_invs_per_worker;
  long long received_acks_per_worker;
  long long received_vals_per_worker;
  long long received_crds_per_worker;

  long long received_packet_invs_per_worker;
  long long received_packet_acks_per_worker;
  long long received_packet_vals_per_worker;
  long long received_packet_crds_per_worker;

  long long received_acks_stalled;
  // for faking tail-latency
  long long stalled_time_per_worker;

  long long wasted_loops;
  long long total_loops;
  double empty_reqs_per_trace;
  long long cold_keys_per_trace;
  double tot_empty_reqs_per_trace;
};

// Derived per-worker statistics, one array slot per worker.
struct stats {
  double xput_per_worker[MAX_WORKERS_PER_MACHINE];
  double rmw_xput_per_worker[MAX_WORKERS_PER_MACHINE];
  double rmw_abort_rate_per_worker[MAX_WORKERS_PER_MACHINE];

  double issued_invs_avg_coalesing[MAX_WORKERS_PER_MACHINE];
  double issued_acks_avg_coalesing[MAX_WORKERS_PER_MACHINE];
  double issued_vals_avg_coalesing[MAX_WORKERS_PER_MACHINE];
  double issued_crds_avg_coalesing[MAX_WORKERS_PER_MACHINE];

  double received_invs_avg_coalesing[MAX_WORKERS_PER_MACHINE];
  double received_acks_avg_coalesing[MAX_WORKERS_PER_MACHINE];
  double received_vals_avg_coalesing[MAX_WORKERS_PER_MACHINE];
  double received_crds_avg_coalesing[MAX_WORKERS_PER_MACHINE];

  double percentage_of_wasted_loops[MAX_WORKERS_PER_MACHINE];
  double completed_reqs_per_loop[MAX_WORKERS_PER_MACHINE];

  // long long issued_packet_acks_per_worker;
  double batch_size_per_worker[MAX_WORKERS_PER_MACHINE];
  double empty_reqs_per_worker[MAX_WORKERS_PER_MACHINE];
  double stalled_time_per_worker[MAX_WORKERS_PER_MACHINE];
  double average_coalescing_per_worker[MAX_WORKERS_PER_MACHINE];

  double acks_per_worker[MAX_WORKERS_PER_MACHINE];
  double invs_per_worker[MAX_WORKERS_PER_MACHINE];
  double updates_per_worker[MAX_WORKERS_PER_MACHINE];
  double write_ratio_per_worker[MAX_WORKERS_PER_MACHINE];
};

// init all stats to 0
// Zeroes MAX_WORKERS_PER_MACHINE consecutive worker_stats entries starting at
// w_stats, so the pointer must refer to an array of at least that many.
static inline void
init_stats(struct worker_stats* w_stats)
{
  memset(w_stats, 0, sizeof(struct worker_stats) * MAX_WORKERS_PER_MACHINE);
}

void trace_init(struct spacetime_trace_command** trace, uint16_t worker_lid);
void* run_worker(void* arg);
void* print_stats_thread(void* no_arg);
void dump_latency_stats(void);

// Maybe inline these
uint8_t is_state_code(uint8_t code);
uint8_t is_input_code(uint8_t code);
uint8_t is_response_code(uint8_t code);
uint8_t is_bucket_state_code(uint8_t code);

int spawn_stats_thread(void);
char* code_to_str(uint8_t code);
void setup_kvs_buffs(spacetime_op_t** ops,
                     spacetime_inv_t** inv_recv_ops,
                     spacetime_ack_t** ack_recv_ops,
                     spacetime_val_t** val_recv_ops);

extern dbit_vector_t* g_share_qs_barrier;
extern volatile struct worker_stats w_stats[MAX_WORKERS_PER_MACHINE];

#endif // HERMES_UTIL_H

================================================
FILE: include/mica-herd/city.h
================================================
// city.h - cityhash-c
// CityHash on C
// Copyright (c) 2011-2012, Alexander Nusov
//
// - original copyright notice -
// Copyright (c) 2011 Google, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
//
// CityHash, by Geoff Pike and Jyrki Alakuijala
//
// This file provides a few functions for hashing strings. On x86-64
// hardware in 2011, CityHash64() is faster than other high-quality
// hash functions, such as Murmur. This is largely due to higher
// instruction-level parallelism. CityHash64() and CityHash128() also perform
// well on hash-quality tests.
// // CityHash128() is optimized for relatively long strings and returns // a 128-bit hash. For strings more than about 2000 bytes it can be // faster than CityHash64(). // // Functions in the CityHash family are not suitable for cryptography. // // WARNING: This code has not been tested on big-endian platforms! // It is known to work well on little-endian platforms that have a small penalty // for unaligned reads, such as current Intel and AMD moderate-to-high-end CPUs. // // By the way, for some hash functions, given strings a and b, the hash // of a+b is easily derived from the hashes of a and b. This property // doesn't hold for any hash functions in this file. #ifndef CITY_HASH_H_ #define CITY_HASH_H_ #include #include typedef uint8_t uint8; typedef uint32_t uint32; typedef uint64_t uint64; typedef struct _uint128 uint128; struct _uint128 { uint64 first; uint64 second; }; #define Uint128Low64(x) (x).first #define Uint128High64(x) (x).second // Hash function for a byte array. uint64 CityHash64(const char* buf, size_t len); // Hash function for a byte array. For convenience, a 64-bit seed is also // hashed into the result. uint64 CityHash64WithSeed(const char* buf, size_t len, uint64 seed); // Hash function for a byte array. For convenience, two seeds are also // hashed into the result. uint64 CityHash64WithSeeds(const char* buf, size_t len, uint64 seed0, uint64 seed1); // Hash function for a byte array. uint128 CityHash128(const char* s, size_t len); // Hash function for a byte array. For convenience, a 128-bit seed is also // hashed into the result. 
uint128 CityHash128WithSeed(const char* s, size_t len, uint128 seed); #endif // CITY_HASH_H_ ================================================ FILE: include/mica-herd/hrd.h ================================================ #ifndef HRD_H #define HRD_H #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "sizes.h" // Multicast // TODO we do not use hw multicast because it helps only on master-based // patterns //#include #include #include #include #include #include // #define USE_BIG_OBJECTS 0 #define EXTRA_CACHE_LINES 0 #define BASE_VALUE_SIZE 46 // max is --> 46 #define SHIFT_BITS \ (USE_BIG_OBJECTS == 1 ? 3 : 0) // number of bits to shift left or right to // calculate the value length #define HRD_DEFAULT_PSN \ 3185 /* PSN for all queues */ // starting Packet Sequence Number #define HRD_DEFAULT_QKEY 0x11111111 #define HRD_QP_NAME_SIZE 200 /* Size (in bytes) of a queue pair name */ #define HRD_RESERVED_NAME_PREFIX "__HRD_RESERVED_NAME_PREFIX" #define KVS_VALUE_SIZE \ (USE_BIG_OBJECTS == 1 \ ? ((EXTRA_CACHE_LINES * 64) + BASE_VALUE_SIZE) \ : BASE_VALUE_SIZE) //(169 + 64)// 46 + 64 + 64//32 //(46 + 64) #define HUGE_PAGE_SIZE 2097152 #define LEVERAGE_TLB_COALESCING 1 /* * Small max_inline_data reduces the QP's max WQE size, which reduces the * DMA size in doorbell method of WQE fetch. */ #define HRD_MAX_INLINE \ 188 //(USE_BIG_OBJECTS == 1 ? 
((EXTRA_CACHE_LINES * 64) + 60) : 60) //60 is // what kalia had here// // This is required for ROCE not sure yet why // #define IB_PHYS_PORT 1 // // #define USE_HUGE_PAGES 1 // #ifndef likely #define likely(x) __builtin_expect(!!(x), 1) #endif #ifndef unlikely #define unlikely(x) __builtin_expect(!!(x), 0) #endif /* Compare, print, and exit */ #define CPE(val, msg, err_code) \ if (unlikely(val)) { \ fprintf(stderr, msg); \ fprintf(stderr, " Error %d \n", err_code); \ exit(err_code); \ } /* vasilis added a ceiling and a MAX*/ #define CEILING(x, y) (((x) + (y)-1) / (y)) #define MAX(x, y) (x > y ? x : y) int is_roce; int machine_id; char *remote_IP, *local_IP; /* Registry info about a QP */ struct hrd_qp_attr { char name[HRD_QP_NAME_SIZE]; // ROCE uint64_t gid_global_interface_id; // Store the gid fields separately because I uint64_t gid_global_subnet_prefix; // don't like unions. Needed for RoCE only /* Info about the RDMA buffer associated with this QP */ uintptr_t buf_addr; uint32_t buf_size; uint32_t rkey; int lid; int qpn; uint8_t sl; }; struct hrd_ud_ctrl_blk { int local_hid; /* Local ID on the machine this process runs on */ /* Info about the device/port to use for this control block */ struct ibv_context* ctx; int device_id; /* Resovled by libhrd from @port_index */ int dev_port_id; /* 1-based within dev @device_id. 
Resolved by libhrd */ int numa_node_id; /* NUMA node id */ struct ibv_pd* pd; /* A protection domain for this control block */ /* Datagram QPs */ int num_dgram_qps; struct ibv_qp** dgram_qp; struct ibv_cq **dgram_send_cq, **dgram_recv_cq; volatile uint8_t* dgram_buf; /* A buffer for RECVs on dgram QPs */ int* recv_q_depth; int* send_q_depth; int dgram_buf_shm_key; struct ibv_mr* dgram_buf_mr; }; /* Major initialzation functions */ struct hrd_ud_ctrl_blk* hrd_ud_ctrl_blk_init( int local_hid, int port_index, int numa_node_id, /* -1 means don't use hugepages */ int num_dgram_qps, int dgram_buf_size, int dgram_buf_shm_key, int* recv_q_depth, int* send_q_depth); int hrd_ud_ctrl_blk_destroy(struct hrd_ud_ctrl_blk* cb); /* RDMA resolution functions */ struct ibv_device* hrd_resolve_port_index(struct hrd_ud_ctrl_blk* cb, int port_index); uint16_t hrd_get_local_lid(struct ibv_context* ctx, int port_id); void hrd_create_dgram_qps(struct hrd_ud_ctrl_blk* cb); /* Fill @wc with @num_comps comps from this @cq. Exit on error. 
*/ static inline uint32_t hrd_poll_cq(struct ibv_cq* cq, int num_comps, struct ibv_wc* wc) { int comps = 0; uint32_t debug_cnt = 0; while (comps < num_comps) { if (debug_cnt > M_256) { printf("Someone is stuck waiting for a completion %d / %d \n", comps, num_comps); debug_cnt = 0; } int new_comps = ibv_poll_cq(cq, num_comps - comps, &wc[comps]); if (new_comps != 0) { // printf("I see completions %d\n", new_comps); /* Ideally, we should check from comps -> new_comps - 1 */ if (wc[comps].status != 0) { fprintf(stderr, "Bad wc status %d\n", wc[comps].status); exit(0); } comps += new_comps; } debug_cnt++; } return debug_cnt; } static inline struct ibv_mr* register_buffer(struct ibv_pd* pd, void* buf, uint32_t size) { int ib_flags = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_ATOMIC; struct ibv_mr* mr = ibv_reg_mr(pd, (char*)buf, size, ib_flags); assert(mr != NULL); return mr; } /* Registry functions */ void hrd_publish(const char* key, void* value, int len); int hrd_get_published(const char* key, void** value); ///* Publish the nth connected queue pair from this cb with this name */ // void hrd_publish_conn_qp(struct hrd_ud_ctrl_blk *cb, int n, const char // *qp_name); /* Publish the nth datagram queue pair from this cb with this name */ void hrd_publish_dgram_qp(struct hrd_ud_ctrl_blk* cb, int n, const char* qp_name, uint8_t sl); struct hrd_qp_attr* hrd_get_published_qp(const char* qp_name); /* Utility functions */ static inline uint32_t hrd_fastrand(uint64_t* seed) { *seed = *seed * 1103515245 + 12345; return (uint32_t)(*seed >> 32); } void* hrd_malloc_socket(int shm_key, uint64_t size, int socket_id); int hrd_free(int shm_key, void* shm_buf); char* hrd_getenv(const char* name); // Like printf, but colorfur. Limited to 1000 characters. 
// Colors accepted by colored_printf.
typedef enum { YELLOW = 0, RED, GREEN, CYAN } color_print_t;
void colored_printf(color_print_t color, const char* format, ...);

extern char dev_name[50];

#endif /* HRD_H */

================================================
FILE: include/mica-herd/mica.h
================================================
#ifndef MICA_H
#define MICA_H

// NOTE(review): the <...> header name of the bare #include below was lost in
// this dump; restore it from the repository.
#include
#include "city.h"
#include "hrd.h"

/*
 * The polling logic in HERD requires the following:
 * 1. 0 < MICA_OP_GET < MICA_OP_PUT < HERD_OP_GET < HERD_OP_PUT
 * 2. HERD_OP_GET = MICA_OP_GET + HERD_MICA_OFFSET
 * 3. HERD_OP_PUT = MICA_OP_PUT + HERD_MICA_OFFSET
 *
 * This allows us to detect HERD requests by checking if the request region
 * opcode is more than MICA_OP_PUT. And then we can convert a HERD opcode to
 * a MICA opcode by subtracting HERD_MICA_OFFSET from it.
 */
/* NOTE(review): of the opcodes named above only MICA_OP_PUT is defined in
 * this header. */
#define MICA_OP_PUT 112

/* Ensure that a mica_op is cacheline aligned */
#define MICA_OP_METADATA \
  (sizeof(struct mica_key) + sizeof(uint8_t) + sizeof(uint8_t))
/* Value bytes that fit in one 64-byte line next to the op metadata. */
#define MICA_MIN_VALUE (64 - MICA_OP_METADATA)
#define MICA_MAX_VALUE                                             \
  (USE_BIG_OBJECTS == 1 ? (MICA_MIN_VALUE + (EXTRA_CACHE_LINES * 64)) \
                        : MICA_MIN_VALUE)

#define MICA_LOG_BITS 40

#define MICA_INDEX_SHM_KEY 3185
#define MICA_LOG_SHM_KEY 4185

/*
 * Debug values:
 * 0: No safety checks on fast path
 * 1: Sanity checks for arguments
 * 2: Pretty print GET/PUT operations
 */
#define MICA_DEBUG 0

/* Response slot for one operation. */
struct mica_resp {
  uint8_t type;
  uint8_t val_len;
  uint16_t unused[3]; /* Make val_ptr 8-byte aligned */
  uint8_t* val_ptr;
};

/* Fixed-size 16 byte keys */
struct mica_key {
  unsigned long long __unused : 64;
  unsigned int bkt : 32;
  unsigned int server : 16;
  unsigned int tag : 16;
};

/* One request: key, opcode, value length and inline value bytes. */
struct mica_op {
  struct mica_key key; /* This must be the 1st field and 16B aligned */
  uint8_t opcode;
  uint8_t val_len;
  uint8_t value[MICA_MAX_VALUE];
};

/* One index slot; the three bit-fields add up to 64 bits. */
struct mica_slot {
  uint32_t in_use : 1;
  uint32_t tag : (64 - MICA_LOG_BITS - 1);
  uint64_t offset : MICA_LOG_BITS;
};

/* An index bucket of eight slots. */
struct mica_bkt {
  struct mica_slot slots[8];
};

struct mica_kv {
  struct mica_bkt* ht_index;
  uint8_t* ht_log;

  /* Metadata */
  int instance_id; /* ID of this MICA instance. Used for shm keys */

  uint64_t num_bkts; /* Number of buckets requested by user */
  uint64_t bkt_mask; /* Mask down from a mica_key's @bkt to a bucket */

  uint64_t log_cap;  /* Capacity of circular log in bytes */
  uint64_t log_mask; /* Mask down from a slot's @offset to a log offset */

  /* State */
  uint64_t log_head;

  /* Stats */
  long long num_insert_op;       /* Number of PUT requests executed */
  long long num_index_evictions; /* Number of entries evicted from index */
};

void mica_init(struct mica_kv* kv,
               int instance_id,
               int node_id,
               int num_bkts,
               uint64_t log_cap);

/* Single-key INSERT */
void mica_insert_one(struct mica_kv* kv,
                     struct mica_op* op,
                     struct mica_resp* res);

/* Helpers */
uint128* mica_gen_keys(int n);

///* Debug functions */
void mica_print_op(struct mica_op* op);

#endif

================================================
FILE: include/mica-herd/sizes.h
================================================
/* Power-of-two constants: K_n = n * 1024 and M_n = n * 1024 * 1024. A name
 * with a trailing underscore is the value minus one. */
#define K_32 32768
#define K_64 65536
#define K_128 131072
#define K_128_ 131071
#define K_256 262144
#define K_256_ 262143
#define K_512 524288
#define K_512_ 524287

#define M_1 1048576
#define M_1_ 1048575
#define M_2 2097152
#define M_2_ 2097151
#define M_4 4194304
#define M_4_ 4194303
#define M_8 8388608
#define M_8_ 8388607
#define M_16 16777216
#define M_16_ 16777215
#define M_32 33554432
#define M_32_ 33554431
#define M_128 134217728
#define M_128_ 134217727
#define M_256 268435456
#define M_256_ 268435455
#define M_512 536870912
#define M_512_ 536870911
#define M_1024 1073741824
#define M_1024_ 1073741823

#define MILLION 1000000

================================================
FILE: include/utils/bit_vector.h
================================================
//
// Created by akatsarakis on 11/12/18.
// #ifndef HERMES_BIT_VECTOR_H #define HERMES_BIT_VECTOR_H #include #include #include #include #include // Change accordingly #define BV_BIT_VECTOR_SIZE \ 8 // Set if you use statical bit vector (bit_vector_t) #define BV_ENABLE_BIT_VECTOR_ASSERTS 1 // Do not change the following defines #define BV_CEILING(x, y) (((x) + (y)-1) / (y)) #define BV_BITS_IN_A_BYTE 8 #define BV_BIT_VECTOR_SIZE_IN_BYTES \ BV_CEILING(BV_BIT_VECTOR_SIZE, BV_BITS_IN_A_BYTE) #define BV_BIT_SLOT(bit) (bit / BV_BITS_IN_A_BYTE) #define BV_BIT_MOD(bit) ((uint8_t)1 << bit % BV_BITS_IN_A_BYTE) // print binary numbers #define BYTE_TO_BINARY_PATTERN "%c%c%c%c%c%c%c%c" #define BYTE_TO_BINARY(byte) \ (byte & 0x80 ? '1' : '0'), (byte & 0x40 ? '1' : '0'), \ (byte & 0x20 ? '1' : '0'), (byte & 0x10 ? '1' : '0'), \ (byte & 0x08 ? '1' : '0'), (byte & 0x04 ? '1' : '0'), \ (byte & 0x02 ? '1' : '0'), (byte & 0x01 ? '1' : '0') typedef struct { uint8_t bit_array[BV_BIT_VECTOR_SIZE_IN_BYTES]; } bit_vector_t; typedef struct { uint8_t bv_size; // in bits uint8_t* bit_array; // bit_array len == ceil(bv_size / 8) } dbit_vector_t; // returns the least amount of bytes that required to store x bits static inline uint16_t bv_bits_to_bytes(uint16_t bits) { return (uint16_t)BV_CEILING(bits, BV_BITS_IN_A_BYTE); } ///////////////////////////////////////// /// Internal Bitvector API functions (should not be called directly) ///////////////////////////////////////// static inline void bv_init_internal(uint8_t* bit_array, uint16_t size_in_bits) { for (int i = 0; i < bv_bits_to_bytes(size_in_bits); ++i) bit_array[i] = 0; } static inline uint8_t bv_bit_get_internal(const uint8_t* bit_array, uint16_t size_in_bits, uint8_t bit) { if (BV_ENABLE_BIT_VECTOR_ASSERTS) assert(bit < size_in_bits); return (uint8_t)((bit_array[BV_BIT_SLOT(bit)] & BV_BIT_MOD(bit)) == 0 ? 
0 : 1); } static inline void bv_bit_set_internal(uint8_t* bit_array, uint16_t size_in_bits, uint8_t bit) { if (BV_ENABLE_BIT_VECTOR_ASSERTS) assert(bit < size_in_bits); bit_array[BV_BIT_SLOT(bit)] |= BV_BIT_MOD(bit); } static inline void bv_bit_reset_internal(uint8_t* bit_array, uint16_t size_in_bits, uint8_t bit) { if (BV_ENABLE_BIT_VECTOR_ASSERTS) assert(bit < size_in_bits); bit_array[BV_BIT_SLOT(bit)] &= ~(BV_BIT_MOD(bit)); } static inline void bv_set_all_internal(uint8_t* bit_array, uint16_t size_in_bits) { uint8_t bytes = (uint8_t)bv_bits_to_bytes(size_in_bits); uint8_t unused = (uint8_t)(bytes * 8 - size_in_bits); uint8_t last_byte = (uint8_t)(255 >> unused); for (int i = 0; i < bytes - 1; ++i) bit_array[i] = 255; bit_array[bytes - 1] = last_byte; } static inline void bv_reset_all_internal(uint8_t* bit_array, uint16_t size_in_bits) { for (int i = 0; i < bv_bits_to_bytes(size_in_bits); ++i) bit_array[i] = 0; } static inline uint8_t bv_are_equal_internal(uint8_t* ba1, uint16_t size_in_bits1, uint8_t* ba2, uint16_t size_in_bits2) { if (size_in_bits1 != size_in_bits2) return 0; uint16_t size_in_bytes = bv_bits_to_bytes(size_in_bits1); // shift the unused bits to avoid failing due to them // (difference only in the unused bits) uint8_t unused_ms_bits = (uint8_t)(BV_BITS_IN_A_BYTE * size_in_bytes - size_in_bits1); uint8_t last_byte1 = ba1[size_in_bytes - 1] << unused_ms_bits; uint8_t last_byte2 = ba2[size_in_bytes - 1] << unused_ms_bits; return (uint8_t)(memcmp(ba1, ba2, (size_t)(size_in_bytes - 1)) == 0 && last_byte1 == last_byte2 ? 
1 : 0); } static inline void bv_copy_internal(uint8_t* ba_dst, uint16_t size_in_bits_dst, uint8_t* ba_src, uint16_t size_in_bits_src) { // allow copy only if sizes match if (size_in_bits_dst != size_in_bits_src) assert(0); memcpy(ba_dst, ba_src, bv_bits_to_bytes(size_in_bits_src)); } static inline uint8_t bv_no_setted_bits_internal(uint8_t* bit_array, uint16_t size_in_bits) { uint8_t cnt = 0; for (uint8_t i = 0; i < size_in_bits; ++i) cnt += bv_bit_get_internal(bit_array, size_in_bits, i); return cnt; } /// Bitvector Bitwise ops internal static inline void bv_reverse_internal(uint8_t* bit_array, uint16_t size_in_bits) { for (int i = 0; i < bv_bits_to_bytes(size_in_bits); ++i) bit_array[i] = ~bit_array[i]; } static inline void bv_and_internal(uint8_t* ba_dst, uint16_t size_in_bits_dst, const uint8_t* ba_src, uint16_t size_in_bits_src) { // allow and only if sizes match if (size_in_bits_dst != size_in_bits_src) assert(0); for (int i = 0; i < bv_bits_to_bytes(size_in_bits_dst); ++i) ba_dst[i] &= ba_src[i]; } static inline void bv_or_internal(uint8_t* ba_dst, uint16_t size_in_bits_dst, const uint8_t* ba_src, uint16_t size_in_bits_src) { // allow or only if sizes match if (size_in_bits_dst != size_in_bits_src) assert(0); for (int i = 0; i < bv_bits_to_bytes(size_in_bits_dst); ++i) ba_dst[i] |= ba_src[i]; } /// Bitvector Print functions static inline void bv_print_internal(const uint8_t* bit_array, uint16_t size_in_bits) { for (int i = bv_bits_to_bytes(size_in_bits) - 1; i >= 0; --i) printf(BYTE_TO_BINARY_PATTERN, BYTE_TO_BINARY(bit_array[i])); } static inline void bv_print_enhanced_internal(const uint8_t* bit_array, uint16_t size_in_bits) { printf("Bit vector: "); bv_print_internal(bit_array, size_in_bits); printf("\n"); } ///////////////////////////////////////// /// Dynamic Bitvector API functions ///////////////////////////////////////// static inline void dbv_init(dbit_vector_t** bv, uint8_t size) { uint16_t bv_size_in_bytes = bv_bits_to_bytes(size); *bv = 
malloc(sizeof(dbit_vector_t)); (*bv)->bit_array = malloc(bv_size_in_bytes * sizeof(uint8_t)); (*bv)->bv_size = size; bv_init_internal((*bv)->bit_array, size); } static inline void dbv_destroy(dbit_vector_t* bv) { free(bv->bit_array); free(bv); } static inline uint8_t dbv_bit_get(dbit_vector_t bv, int bit) { return bv_bit_get_internal(bv.bit_array, bv.bv_size, bit); } static inline void dbv_bit_set(dbit_vector_t* bv, uint8_t bit) { bv_bit_set_internal(bv->bit_array, bv->bv_size, bit); } static inline void dbv_bit_reset(dbit_vector_t* bv, uint8_t bit) { bv_bit_reset_internal(bv->bit_array, bv->bv_size, bit); } static inline void dbv_set_all(dbit_vector_t* bv) { bv_set_all_internal(bv->bit_array, bv->bv_size); } static inline void dbv_reset_all(dbit_vector_t* bv) { bv_reset_all_internal(bv->bit_array, bv->bv_size); } static inline uint8_t dbv_no_setted_bits(dbit_vector_t bv) { return bv_no_setted_bits_internal(bv.bit_array, bv.bv_size); } static inline uint8_t dbv_are_equal(dbit_vector_t bv1, dbit_vector_t bv2) { return bv_are_equal_internal(bv1.bit_array, bv1.bv_size, bv2.bit_array, bv2.bv_size); } static inline void dbv_copy(dbit_vector_t* bv_dst, dbit_vector_t bv_src) { bv_copy_internal(bv_dst->bit_array, bv_dst->bv_size, bv_src.bit_array, bv_src.bv_size); } static inline uint8_t dbv_is_all_set(dbit_vector_t bv) { dbit_vector_t* bv_tmp; dbv_init(&bv_tmp, bv.bv_size); dbv_set_all(bv_tmp); return dbv_are_equal(bv, *bv_tmp); } /// Bitvector bitwise ops static inline void dbv_reverse(dbit_vector_t* bv) { bv_reverse_internal(bv->bit_array, bv->bv_size); } static inline void dbv_and(dbit_vector_t* bv_dst, dbit_vector_t bv_src) { bv_and_internal(bv_dst->bit_array, bv_dst->bv_size, bv_src.bit_array, bv_src.bv_size); } static inline void dbv_or(dbit_vector_t* bv_dst, dbit_vector_t bv_src) { bv_or_internal(bv_dst->bit_array, bv_dst->bv_size, bv_src.bit_array, bv_src.bv_size); } /// Bitvector Print functions static inline void dbv_print(dbit_vector_t bv) { 
bv_print_internal(bv.bit_array, bv.bv_size); } static inline void dbv_print_enhanced(dbit_vector_t bv) { bv_print_enhanced_internal(bv.bit_array, bv.bv_size); } static inline void dbv_unit_test(void) { dbit_vector_t* bv; dbit_vector_t* bv_set__all; dbv_init(&bv, 22); dbv_init(&bv_set__all, 22); dbv_set_all(bv_set__all); for (uint8_t i = 0; i < bv->bv_size; ++i) dbv_bit_set(bv, i); assert(dbv_are_equal(*bv, *bv_set__all) == 1); for (uint8_t i = 0; i < bv->bv_size; ++i) dbv_bit_reset(bv, i); dbv_reverse(bv); assert(dbv_are_equal(*bv, *bv_set__all) == 1); for (uint8_t i = 0; i < bv->bv_size; ++i) if (i % 2 == 0) { dbv_bit_reset(bv, i); assert(dbv_bit_get(*bv, i) == 0); } else { dbv_bit_set(bv, i); assert(dbv_bit_get(*bv, i) == 1); } dbv_reset_all(bv); assert(dbv_are_equal(*bv, *bv_set__all) == 0); dbv_set_all(bv); dbv_and(bv, *bv_set__all); assert(dbv_are_equal(*bv, *bv_set__all) == 1); dbv_copy(bv, *bv_set__all); assert(dbv_are_equal(*bv, *bv_set__all) == 1); dbv_reset_all(bv); dbv_or(bv, *bv_set__all); assert(dbv_are_equal(*bv, *bv_set__all) == 1); printf("Dynamic Bit Vector Unit Test was Successful!\n"); } ///////////////////////////////////////// /// Static Bitvector API functions ///////////////////////////////////////// static inline void bv_init(bit_vector_t* bv) { bv_init_internal(bv->bit_array, BV_BIT_VECTOR_SIZE); } static inline uint8_t bv_bit_get(bit_vector_t bv, int bit) { return bv_bit_get_internal(bv.bit_array, BV_BIT_VECTOR_SIZE, bit); } static inline void bv_bit_set(bit_vector_t* bv, uint8_t bit) { bv_bit_set_internal(bv->bit_array, BV_BIT_VECTOR_SIZE, bit); } static inline void bv_bit_reset(bit_vector_t* bv, uint8_t bit) { bv_bit_reset_internal(bv->bit_array, BV_BIT_VECTOR_SIZE, bit); } static inline void bv_set_all(bit_vector_t* bv) { bv_set_all_internal(bv->bit_array, BV_BIT_VECTOR_SIZE); } static inline void bv_reset_all(bit_vector_t* bv) { bv_reset_all_internal(bv->bit_array, BV_BIT_VECTOR_SIZE); } static inline uint8_t 
bv_no_setted_bits(bit_vector_t bv) { return bv_no_setted_bits_internal(bv.bit_array, BV_BIT_VECTOR_SIZE); } static inline uint8_t bv_are_equal(bit_vector_t bv1, bit_vector_t bv2) { return bv_are_equal_internal(bv1.bit_array, BV_BIT_VECTOR_SIZE, bv2.bit_array, BV_BIT_VECTOR_SIZE); } static inline void bv_copy(bit_vector_t* bv_dst, bit_vector_t bv_src) { bv_copy_internal(bv_dst->bit_array, BV_BIT_VECTOR_SIZE, bv_src.bit_array, BV_BIT_VECTOR_SIZE); } /// Bitvector bitwise ops static inline void bv_reverse(bit_vector_t* bv) { bv_reverse_internal(bv->bit_array, BV_BIT_VECTOR_SIZE); } static inline void bv_and(bit_vector_t* bv_dst, bit_vector_t bv_src) { bv_and_internal(bv_dst->bit_array, BV_BIT_VECTOR_SIZE, bv_src.bit_array, BV_BIT_VECTOR_SIZE); } static inline void bv_or(bit_vector_t* bv_dst, bit_vector_t bv_src) { bv_or_internal(bv_dst->bit_array, BV_BIT_VECTOR_SIZE, bv_src.bit_array, BV_BIT_VECTOR_SIZE); } /// Bitvector Print functions static inline void bv_print(bit_vector_t bv) { bv_print_internal(bv.bit_array, BV_BIT_VECTOR_SIZE); } static inline void bv_print_enhanced(bit_vector_t bv) { bv_print_enhanced_internal(bv.bit_array, BV_BIT_VECTOR_SIZE); } ///////////////////////////////////////// /// Bitvector unit test functions ///////////////////////////////////////// static inline void bv_unit_test(void) { bit_vector_t bv; bit_vector_t bv_set__all; bv_init(&bv); bv_set_all(&bv_set__all); dbv_unit_test(); for (uint8_t i = 0; i < BV_BIT_VECTOR_SIZE; ++i) bv_bit_set(&bv, i); assert(bv_are_equal(bv, bv_set__all) == 1); for (uint8_t i = 0; i < BV_BIT_VECTOR_SIZE; ++i) bv_bit_reset(&bv, i); bv_reverse(&bv); assert(bv_are_equal(bv, bv_set__all) == 1); for (uint8_t i = 0; i < BV_BIT_VECTOR_SIZE; ++i) if (i % 2 == 0) { bv_bit_reset(&bv, i); assert(bv_bit_get(bv, i) == 0); } else { bv_bit_set(&bv, i); assert(bv_bit_get(bv, i) == 1); } bv_reset_all(&bv); assert(bv_are_equal(bv, bv_set__all) == 0); bv_set_all(&bv); bv_and(&bv, bv_set__all); assert(bv_are_equal(bv, bv_set__all) 
== 1); bv_copy(&bv, bv_set__all); assert(bv_are_equal(bv, bv_set__all) == 1); bv_reset_all(&bv); bv_or(&bv, bv_set__all); assert(bv_are_equal(bv, bv_set__all) == 1); printf("Static Bit Vector Unit Test was Successful!\n"); } #endif // HERMES_BIT_VECTOR_H ================================================ FILE: include/utils/concur_ctrl.h ================================================ // // Created by akatsarakis on 11/12/18. // #ifndef HERMES_SEQLOCK_H #define HERMES_SEQLOCK_H #include #include #define ENABLE_LOCK_ASSERTS 1 #define TIE_BREAKER_ID_EMPTY 255 #define SEQLOCK_LOCKED 0x1 #define SEQLOCK_FREE 0x0 #define LOCK_PAUSE() asm volatile("mfence"); #define COMPILER_BARRIER() asm volatile("" ::: "memory") #if !defined(COMPILER_NO_REORDER) #define COMPILER_NO_REORDER(exec) \ COMPILER_BARRIER(); \ exec; \ COMPILER_BARRIER() #endif typedef volatile struct { uint8_t tie_breaker_id; uint32_t version; } __attribute__((packed)) timestamp_t; typedef struct { uint8_t lock; uint32_t version; /// for lock-free reads } __attribute__((packed)) seqlock_t; typedef volatile struct { uint8_t lock; timestamp_t ts; /// ts.version used for both lock-free reads & as part of timestamp } __attribute__((packed)) conc_ctrl_t; ///////////////////////////////////////// /// Timestamp comparison functions ///////////////////////////////////////// static inline void timestamp_init(timestamp_t* ts) { ts->version = 0; ts->tie_breaker_id = TIE_BREAKER_ID_EMPTY; } static inline int timestamp_is_equal(uint32_t v1, uint8_t tie_breaker1, uint32_t v2, uint8_t tie_breaker2) { return (v1 == v2 && tie_breaker1 == tie_breaker2); } static inline int timestamp_is_smaller(uint32_t v1, uint8_t tie_breaker1, uint32_t v2, uint8_t tie_breaker2) { return (v1 < v2 || (v1 == v2 && tie_breaker1 < tie_breaker2)); } ///////////////////////////////////////// /// seqlock locking / unlocking functions ///////////////////////////////////////// static inline void seqlock_init(seqlock_t* seqlock) { seqlock->version = 0; 
seqlock->lock = SEQLOCK_FREE; } static inline int seqlock_lock(seqlock_t* seqlock) { do { // Spin until the seqlock is unlocked while (seqlock->lock == SEQLOCK_LOCKED) { LOCK_PAUSE(); } // try to atomically get the lock via a CAS if (__sync_val_compare_and_swap(&seqlock->lock, 0, 1) == 0) { seqlock->version++; break; } } while (1); // retry if CAS failed return 1; } static inline void seqlock_unlock(seqlock_t* seqlock) { if (ENABLE_LOCK_ASSERTS) { assert(seqlock->lock == SEQLOCK_LOCKED); assert(seqlock->version % 2 == 1); } COMPILER_NO_REORDER(seqlock->version++); COMPILER_NO_REORDER(seqlock->lock = SEQLOCK_FREE); } // This is used to validate a lock-free read // i.e. --> do { } while // (!(seqlock_version_is_same_and_valid(...)); static inline int seqlock_version_is_same_and_valid(seqlock_t* seqlock1, seqlock_t* seqlock2) { return (seqlock1->version == seqlock2->version && seqlock1->version % 2 == 0); } ///////////////////////////////////////// /// ccctrl locking / unlocking functions ///////////////////////////////////////// static inline void cctrl_init(conc_ctrl_t* cctrl) { timestamp_init(&cctrl->ts); cctrl->lock = SEQLOCK_FREE; } static inline int cctrl_lock(conc_ctrl_t* cctrl) { do { // Spin until the seqlock is unlocked while (cctrl->lock == SEQLOCK_LOCKED) { LOCK_PAUSE(); } // try to atomically get the lock via a CAS if (__sync_val_compare_and_swap(&cctrl->lock, 0, 1) == 0) { cctrl->ts.version++; break; } } while (1); // retry if CAS failed return 1; } static inline void cctrl_unlock_custom_version(conc_ctrl_t* cctrl, uint8_t cid, uint32_t version) { if (ENABLE_LOCK_ASSERTS) { assert(cctrl->lock == SEQLOCK_LOCKED); assert(cctrl->ts.version % 2 == 1); } cctrl->ts.tie_breaker_id = cid; COMPILER_NO_REORDER(cctrl->ts.version = version); COMPILER_NO_REORDER(cctrl->lock = SEQLOCK_FREE); } static inline void cctrl_unlock_inc_version_by_three(conc_ctrl_t* cctrl, uint8_t cid, uint32_t* resp_version) { if (ENABLE_LOCK_ASSERTS) { assert(cctrl->lock == SEQLOCK_LOCKED); 
assert(cctrl->ts.version % 2 == 1); } cctrl->ts.tie_breaker_id = cid; COMPILER_NO_REORDER(cctrl->ts.version += 3); COMPILER_NO_REORDER(*resp_version = cctrl->ts.version); COMPILER_NO_REORDER(cctrl->lock = SEQLOCK_FREE); } static inline void cctrl_unlock_inc_version(conc_ctrl_t* cctrl, uint8_t cid, uint32_t* resp_version) { if (ENABLE_LOCK_ASSERTS) { assert(cctrl->lock == SEQLOCK_LOCKED); assert(cctrl->ts.version % 2 == 1); } cctrl->ts.tie_breaker_id = cid; COMPILER_NO_REORDER(*resp_version = ++cctrl->ts.version); COMPILER_NO_REORDER(cctrl->lock = SEQLOCK_FREE); } static inline void cctrl_unlock_dec_version(conc_ctrl_t* cctrl) { if (ENABLE_LOCK_ASSERTS) { assert(cctrl->lock == SEQLOCK_LOCKED); assert(cctrl->ts.version % 2 == 1); } // keep same ts.tie_breaker_id COMPILER_NO_REORDER(cctrl->ts.version--); COMPILER_NO_REORDER(cctrl->lock = SEQLOCK_FREE); } // This is used to validate a lock-free read // i.e. --> do { } while // (!(cctrl_timestamp_is_same_and_valid(...)); static inline int cctrl_timestamp_is_same_and_valid(volatile conc_ctrl_t* cctrl1, volatile conc_ctrl_t* cctrl2) { return cctrl1->ts.version % 2 == 0 && timestamp_is_equal(cctrl1->ts.version, cctrl1->ts.tie_breaker_id, cctrl2->ts.version, cctrl2->ts.tie_breaker_id); } #endif // HERMES_SEQLOCK_H ================================================ FILE: include/utils/time_rdtsc.h ================================================ #ifndef HERMES_TIME_H #define HERMES_TIME_H #include #include /* for uint64_t */ #include #include /* for struct timespec */ #define ENABLE_STATIC_TICKS_PER_NS 1 #define RDTSC_TYPICAL_TICKS_PER_NS 2.2 double g_ticks_per_ns; // assembly code to read the TSC static inline uint64_t RDTSC() { unsigned int hi, lo; __asm__ volatile("rdtsc" : "=a"(lo), "=d"(hi)); return ((uint64_t)hi << 32) | lo; } static const int NANO_SECONDS_IN_SEC = 1000000000; // returns a static buffer of struct timespec with the time difference of // ts1 and ts2 ts1 is assumed to be greater than ts2 static struct 
timespec* timespec_diff(struct timespec* ts1, struct timespec* ts2) { static struct timespec ts; ts.tv_sec = ts1->tv_sec - ts2->tv_sec; ts.tv_nsec = ts1->tv_nsec - ts2->tv_nsec; if (ts.tv_nsec < 0) { ts.tv_sec--; ts.tv_nsec += NANO_SECONDS_IN_SEC; } return &ts; } static void calibrate_ticks() { struct timespec begin_ts, end_ts; printf("Start RDTSC calibration: patience is a virtue\n"); clock_gettime(CLOCK_MONOTONIC, &begin_ts); uint64_t begin = RDTSC(); // do something CPU intensive for (volatile unsigned long long i = 0; i < 1000000000ULL; ++i) ; uint64_t end = RDTSC(); clock_gettime(CLOCK_MONOTONIC, &end_ts); struct timespec* tmp_ts = timespec_diff(&end_ts, &begin_ts); uint64_t ns_elapsed = (uint64_t)(tmp_ts->tv_sec * 1000000000LL + tmp_ts->tv_nsec); g_ticks_per_ns = (double)(end - begin) / (double)ns_elapsed; printf("RDTSC calibration is done (ticks_per_ns: %.2f)\n", g_ticks_per_ns); } // Call once (it is not thread safe) before using RDTSC, has side effect of // binding process to CPU1 static inline void init_rdtsc(uint8_t auto_calibration, double ticks_per_ns) { if (auto_calibration > 0) calibrate_ticks(); else { assert(ticks_per_ns > 0); g_ticks_per_ns = ticks_per_ns; } } static inline void get_timespec(struct timespec* ts, uint64_t nsecs) { ts->tv_sec = nsecs / NANO_SECONDS_IN_SEC; ts->tv_nsec = nsecs % NANO_SECONDS_IN_SEC; } // ts will be filled with time converted from TSC reading static inline void get_rdtsc_timespec(struct timespec* ts) { get_timespec(ts, (uint64_t)(RDTSC() / g_ticks_per_ns)); } static inline double time_elapsed_in_us(struct timespec start) { struct timespec now, *diff; get_rdtsc_timespec(&now); diff = timespec_diff(&now, &start); return diff->tv_sec * 1000000 + diff->tv_nsec / 1000; } static inline double time_elapsed_in_ms(struct timespec start) { struct timespec now, *diff; get_rdtsc_timespec(&now); diff = timespec_diff(&now, &start); return diff->tv_sec * 1000 + diff->tv_nsec / 1000000; } static inline double 
time_elapsed_in_sec(struct timespec start) { struct timespec now, *diff; get_rdtsc_timespec(&now); diff = timespec_diff(&now, &start); return diff->tv_sec + diff->tv_nsec / NANO_SECONDS_IN_SEC; } #endif // HERMES_TIME_H ================================================ FILE: include/wings/wings.h ================================================ // // Created by akatsarakis on 06/02/19. // #ifndef WINGS_INTERNAL_INLINES_H #define WINGS_INTERNAL_INLINES_H #include "wings_api.h" /// WARNING!! /// Functions starting with underscore (i.e. "_wings_*") /// are internal and should not be called directly void wings_reconfigure_wrs_ah(ud_channel_t* ud_c, uint8_t endpoint_id); /* -------------------------------------------------------------------------- --------------------------------- Helper Functions -------------------------- ---------------------------------------------------------------------------*/ static inline void _wings_assert_binary(uint8_t var) { assert(var == 0 || var == 1); } static inline uint16_t _wings_ud_recv_max_pkt_size(ud_channel_t* ud_c) { if (WINGS_ENABLE_ASSERTIONS) assert(ud_c->type != CRD && ud_c->is_header_only == 0); // TODO add assertion that this must be smaller than max_MTU assert(ud_c->max_msg_size > 0 && ud_c->max_coalescing > 0); return sizeof(wings_ud_recv_pkt_t) + ud_c->max_msg_size * ud_c->max_coalescing; } static inline uint16_t _wings_ud_send_max_pkt_size(ud_channel_t* ud_c) { if (WINGS_ENABLE_ASSERTIONS) assert(ud_c->type != CRD && ud_c->is_header_only == 0); // TODO add assertion that this must be smaller than max_MTU assert(ud_c->max_msg_size > 0 && ud_c->max_coalescing > 0); return sizeof(wings_ud_send_pkt_t) + ud_c->max_msg_size * ud_c->max_coalescing; } static inline void _wings_assertions(ud_channel_t* ud_channel) { _wings_assert_binary(ud_channel->expl_crd_ctrl); _wings_assert_binary(ud_channel->is_bcast_channel); _wings_assert_binary(ud_channel->is_inlining_enabled); assert(ud_channel->num_channels > 1); 
assert(ud_channel->max_msg_size > 0); assert(ud_channel->max_coalescing > 0); assert(_wings_ud_send_max_pkt_size(ud_channel) < MAX_MTU_SIZE); assert(ud_channel->send_q_depth > 0 || ud_channel->recv_q_depth > 0); assert(ud_channel->channel_providing_crds != NULL || ud_channel->disable_crd_ctrl); } static inline uint8_t* _wings_get_n_msg_ptr_from_send_pkt(ud_channel_t* ud_c, wings_ud_send_pkt_t* pkt, uint8_t n) { if (WINGS_ENABLE_ASSERTIONS) assert(ud_c->type != CRD && ud_c->is_header_only == 0); assert(ud_c->max_coalescing > n && pkt->req_num >= n); // return &pkt->reqs[n * ud_c->max_msg_size]; return &pkt->reqs[n * ud_c->small_msg_size]; } static inline uint8_t* _wings_get_n_msg_ptr_from_recv_pkt(ud_channel_t* ud_c, wings_ud_recv_pkt_t* recv_pkt, uint8_t n) { if (WINGS_ENABLE_ASSERTIONS) assert(ud_c->type != CRD && ud_c->is_header_only == 0); return _wings_get_n_msg_ptr_from_send_pkt(ud_c, &recv_pkt->pkt, n); } static inline wings_ud_send_pkt_t* _wings_get_nth_pkt_ptr_from_send_buff(ud_channel_t* ud_c, uint16_t n) { if (WINGS_ENABLE_ASSERTIONS) assert(ud_c->type != CRD && ud_c->is_header_only == 0); return (wings_ud_send_pkt_t*)&( (uint8_t*)ud_c->send_pkt_buff)[n * _wings_ud_send_max_pkt_size(ud_c)]; } static inline wings_ud_recv_pkt_t* _wings_get_nth_pkt_ptr_from_recv_buff(ud_channel_t* ud_c, uint16_t n) { if (WINGS_ENABLE_ASSERTIONS) assert(ud_c->type != CRD && ud_c->is_header_only == 0); return (wings_ud_recv_pkt_t*)&ud_c ->recv_pkt_buff[n * _wings_ud_recv_max_pkt_size(ud_c)]; } static inline wings_ud_send_pkt_t* _wings_curr_send_pkt_ptr(ud_channel_t* ud_c) { if (WINGS_ENABLE_ASSERTIONS) assert(ud_c->type != CRD && ud_c->is_header_only == 0); return _wings_get_nth_pkt_ptr_from_send_buff(ud_c, (uint16_t)ud_c->send_push_ptr); } static inline void _wings_inc_send_push_ptr(ud_channel_t* ud_c) { if (ud_c->is_header_only) return; if (WINGS_ENABLE_ASSERTIONS) assert(ud_c->type != CRD && ud_c->is_header_only == 0); if (ud_c->is_bcast_channel) 
WINGS_MOD_ADD(ud_c->send_push_ptr, ud_c->send_pkt_buff_len); // TODO change this to deal with // failures see comment below // WINGS_MOD_ADD(*inv_push_ptr, INV_SEND_OPS_SIZE / MAX_REMOTE_MACHINES * // last_g_membership.num_of_alive_remotes); // //got to the next "packet" + dealing with // failutes else WINGS_MOD_ADD(ud_c->send_push_ptr, ud_c->send_pkt_buff_len); _wings_curr_send_pkt_ptr(ud_c)->req_num = 0; // Reset data left from previous unicasts / bcasts } static inline void _wings_inc_recv_push_ptr(ud_channel_t* ud_c) { if (WINGS_ENABLE_ASSERTIONS) assert(ud_c->type != CRD && ud_c->is_header_only == 0); WINGS_MOD_ADD(ud_c->recv_push_ptr, ud_c->recv_q_depth); } static inline void _wings_inc_recv_pull_ptr(ud_channel_t* ud_c) { if (WINGS_ENABLE_ASSERTIONS) assert(ud_c->type != CRD && ud_c->is_header_only == 0); WINGS_MOD_ADD(ud_c->recv_pull_ptr, ud_c->recv_pkt_buff_len); } /* --------------------------------------------------------------------------- ----------------------------------- RECVs ------------------------------------ ---------------------------------------------------------------------------*/ static inline void _wings_post_hdr_only_recvs(ud_channel_t* ud_c, uint16_t num_recvs) { if (WINGS_ENABLE_ASSERTIONS) assert(ud_c->is_header_only || ud_c->type == CRD); struct ibv_recv_wr* bad_recv_wr; for (uint16_t i = 0; i < num_recvs; ++i) ud_c->recv_wr[i].next = (i == num_recvs - 1) ? 
NULL : &ud_c->recv_wr[i + 1]; int ret = ibv_post_recv(ud_c->qp, ud_c->recv_wr, &bad_recv_wr); CPE(ret, "ibv_post_recv error: posting recvs for credits", ret); } static inline void _wings_post_recvs(ud_channel_t* ud_c, uint16_t num_of_receives) { if (WINGS_ENABLE_ASSERTIONS) assert(ud_c->type != CRD && ud_c->is_header_only == 0); void* next_buff_addr; if (WINGS_ENABLE_ASSERTIONS) assert(num_of_receives <= ud_c->max_recv_wrs); int req_size = _wings_ud_recv_max_pkt_size(ud_c); for (int i = 0; i < num_of_receives; ++i) { next_buff_addr = (void*)(ud_c->recv_pkt_buff) + (ud_c->recv_push_ptr * req_size); // TODO optimize by reseting only the req_num of wings_recv_pkt memset(next_buff_addr, 0, (size_t)req_size); // reset the buffer before posting the receive if (WINGS_ENABLE_BATCH_POST_RECVS_TO_NIC) ud_c->recv_wr[i].sg_list->addr = (uintptr_t)next_buff_addr; else assert(0); // hrd_post_dgram_recv(ud_c->qp, next_buff_addr, req_size, // cb->dgram_buf_mr->lkey); _wings_inc_recv_push_ptr(ud_c); } if (WINGS_ENABLE_BATCH_POST_RECVS_TO_NIC) { ud_c->recv_wr[num_of_receives - 1].next = NULL; if (WINGS_ENABLE_ASSERTIONS) { for (int i = 0; i < num_of_receives; i++) { assert(ud_c->recv_wr[i].num_sge == 1); assert(ud_c->recv_wr[i].sg_list->length == req_size); // TODO add // assert(ud_c->recv_wr[i].sg_list->lkey == // cb->dgram_buf_mr->lkey); assert(i == num_of_receives - 1 || ud_c->recv_wr[i].next == &ud_c->recv_wr[i + 1]); } assert(ud_c->recv_wr[num_of_receives - 1].next == NULL); } struct ibv_recv_wr* bad_recv_wr; int ret = ibv_post_recv(ud_c->qp, ud_c->recv_wr, &bad_recv_wr); CPE(ret, "ibv_post_recv error: while posting recvs", ret); // recover next ptr of last wr to NULL ud_c->recv_wr[num_of_receives - 1].next = (ud_c->max_recv_wrs == num_of_receives - 1) ? 
NULL : &ud_c->recv_wr[num_of_receives]; } } static inline void _wings_poll_crds_and_post_recvs(ud_channel_t* ud_c) { if (WINGS_ENABLE_ASSERTIONS) assert(ud_c->type == CRD); int crd_pkts_found = ibv_poll_cq(ud_c->recv_cq, ud_c->max_recv_wrs, ud_c->recv_wc); if (crd_pkts_found > 0) { if (unlikely(ud_c->recv_wc[crd_pkts_found - 1].status != 0)) { fprintf(stderr, "Bad wc status when polling for credits to send a broadcast %d\n", ud_c->recv_wc[crd_pkts_found - 1].status); exit(0); } if (ud_c->enable_stats) ud_c->stats.recv_total_pkts += crd_pkts_found; if (WINGS_ENABLE_RECV_PRINTS && ud_c->enable_prints) colored_printf(GREEN, "^^^ Polled reqs: %s %d, (total: %d)!\n", ud_c->qp_name, crd_pkts_found, ud_c->stats.recv_total_pkts); for (int i = 0; i < crd_pkts_found; i++) { wings_crd_t* crd_ptr = (wings_crd_t*)&ud_c->recv_wc[i].imm_data; if (ud_c->enable_stats) ud_c->stats.recv_total_msgs += crd_ptr->crd_num; ud_c->channel_providing_crds->credits_per_channels[crd_ptr->sender_id] += crd_ptr->crd_num; if (WINGS_ENABLE_ASSERTIONS) assert(ud_c->channel_providing_crds->num_crds_per_channel >= ud_c->channel_providing_crds ->credits_per_channels[crd_ptr->sender_id]); if (WINGS_ENABLE_CREDIT_PRINTS && ud_c->enable_prints) printf( "$$$ Credits: %s \033[1m\033[32mincremented\033[0m to %d (for " "endpoint %d)\n", ud_c->channel_providing_crds->qp_name, ud_c->channel_providing_crds ->credits_per_channels[crd_ptr->sender_id], crd_ptr->sender_id); } if (WINGS_ENABLE_POST_RECV_PRINTS && ud_c->enable_prints) colored_printf(YELLOW, "vvv Post Receives: %s %d\n", ud_c->qp_name, crd_pkts_found); _wings_post_hdr_only_recvs(ud_c, (uint16_t)crd_pkts_found); } else if (unlikely(crd_pkts_found < 0)) { printf("ERROR In the credit CQ\n"); exit(0); } } static inline void _wings_enque_to_overflown_msgs(ud_channel_t* ud_c, uint8_t* msg_ptr) { if (WINGS_ENABLE_ASSERTIONS) { assert(ud_c->is_header_only == 0); assert(ud_c->enable_overflow_msgs); assert(ud_c->num_overflow_msgs < ud_c->max_coalescing); } 
uint8_t* dst_ptr = &ud_c->overflow_msg_buff[ud_c->num_overflow_msgs * ud_c->max_msg_size]; memcpy(dst_ptr, msg_ptr, ud_c->max_msg_size); ud_c->num_overflow_msgs++; } static inline uint16_t _wings_deque_from_overflown_msgs(ud_channel_t* ud_c, uint16_t max_msgs_to_poll, uint8_t* recv_ops) { if (WINGS_ENABLE_ASSERTIONS) assert(ud_c->is_header_only == 0); uint8_t msgs_to_copy = (uint8_t)(ud_c->num_overflow_msgs <= max_msgs_to_poll ? ud_c->num_overflow_msgs : max_msgs_to_poll); if (ud_c->num_overflow_msgs > 0) { ud_c->num_overflow_msgs -= msgs_to_copy; // Copy msgs from overflow_buff to recv_ops memcpy(recv_ops, ud_c->overflow_msg_buff, msgs_to_copy * ud_c->max_msg_size); if (msgs_to_copy == max_msgs_to_poll) // Move rest of overflown msgs to the top of the (FIFO) buffer for (int i = 0; i < ud_c->num_overflow_msgs; ++i) { uint8_t* dst_ptr = &ud_c->overflow_msg_buff[ud_c->max_msg_size * i]; uint8_t* src_ptr = &ud_c->overflow_msg_buff[ud_c->max_msg_size * (i + msgs_to_copy)]; memcpy(dst_ptr, src_ptr, ud_c->max_msg_size); } } return msgs_to_copy; } static inline uint16_t wings_poll_buff_and_post_recvs(ud_channel_t* ud_c, uint16_t max_msgs_to_poll, uint8_t* recv_ops) { if (WINGS_ENABLE_ASSERTIONS) assert(ud_c->type != CRD); int index = 0; uint8_t sender = 0; uint16_t msgs_polled = 0; uint8_t *next_packet_reqs, *recv_op_ptr, *next_req, *next_packet_req_num_ptr; uint16_t dequed_msgs = 0; uint16_t remaining_msgs_to_poll = max_msgs_to_poll; if (max_msgs_to_poll < 1) return 0; if (ud_c->enable_overflow_msgs) { dequed_msgs = _wings_deque_from_overflown_msgs(ud_c, max_msgs_to_poll, recv_ops); if (max_msgs_to_poll == dequed_msgs) return max_msgs_to_poll; recv_ops = &recv_ops[dequed_msgs * ud_c->max_msg_size]; remaining_msgs_to_poll -= dequed_msgs; } uint16_t max_pkts_to_poll = (uint16_t)((remaining_msgs_to_poll / ud_c->max_coalescing) + (ud_c->enable_overflow_msgs ? 
1 : 0)); // poll completion q uint16_t pkts_polled = (uint16_t)ibv_poll_cq(ud_c->recv_cq, max_pkts_to_poll, ud_c->recv_wc); for (int i = 0; i < pkts_polled; ++i) { if (ud_c->is_header_only) { recv_op_ptr = &recv_ops[i * ud_c->max_msg_size]; memcpy(recv_op_ptr, &ud_c->recv_wc[i].imm_data, ud_c->max_msg_size); msgs_polled++; sender = ((wings_hdr_only_t*)&ud_c->recv_wc[i].imm_data)->sender_id; if (!ud_c->disable_crd_ctrl) ud_c->channel_providing_crds ->credits_per_channels[sender]++; // increment packet credits } else { uint16_t max_req_size = _wings_ud_recv_max_pkt_size(ud_c); index = (ud_c->recv_pull_ptr + 1) % ud_c->recv_q_depth; wings_ud_recv_pkt_t* next_packet = (wings_ud_recv_pkt_t*)&ud_c->recv_pkt_buff[index * max_req_size]; sender = next_packet->pkt.sender_id; next_packet_reqs = next_packet->pkt.reqs; next_packet_req_num_ptr = &next_packet->pkt.req_num; if (WINGS_ENABLE_ASSERTIONS) assert(next_packet->pkt.req_num > 0 && next_packet->pkt.req_num <= ud_c->max_coalescing); // TODO add membership and functionality // if(node_is_in_membership(last_group_membership, sender)) uint16_t msg_size = next_packet->pkt.only_small_msgs == 1 ? 
ud_c->small_msg_size : ud_c->max_msg_size; for (int j = 0; j < next_packet->pkt.req_num; ++j) { next_req = &next_packet_reqs[j * msg_size]; if (msgs_polled >= remaining_msgs_to_poll) _wings_enque_to_overflown_msgs(ud_c, next_req); else { recv_op_ptr = &recv_ops[msgs_polled * ud_c->max_msg_size]; memcpy(recv_op_ptr, next_req, msg_size); } msgs_polled++; if (!ud_c->disable_crd_ctrl) ud_c->channel_providing_crds ->credits_per_channels[sender]++; // increment packet credits } *next_packet_req_num_ptr = 0; // TODO can be removed since we already reset on posting receives _wings_inc_recv_pull_ptr(ud_c); } if (WINGS_ENABLE_ASSERTIONS) if (!ud_c->disable_crd_ctrl) assert(ud_c->channel_providing_crds->credits_per_channels[sender] <= ud_c->channel_providing_crds->num_crds_per_channel); } if (pkts_polled > 0) { // Refill recvs if (ud_c->is_header_only) _wings_post_hdr_only_recvs(ud_c, pkts_polled); else _wings_post_recvs(ud_c, pkts_polled); if (WINGS_ENABLE_STAT_COUNTING) { ud_c->stats.recv_total_msgs += msgs_polled; ud_c->stats.recv_total_pkts += pkts_polled; } if (WINGS_ENABLE_RECV_PRINTS && ud_c->enable_prints) colored_printf( GREEN, "^^^ Polled msgs: %d packets %s %d, (total pkts: %d, msgs %d)!\n", pkts_polled, ud_c->qp_name, msgs_polled, ud_c->stats.recv_total_pkts, ud_c->stats.recv_total_msgs); if (WINGS_ENABLE_CREDIT_PRINTS && ud_c->enable_prints && !ud_c->disable_crd_ctrl) printf( "$$$ Credits: %s \033[1m\033[32mincremented\033[0m to %d (for " "machine %d)\n", ud_c->channel_providing_crds->qp_name, ud_c->channel_providing_crds->credits_per_channels[sender], sender); if (WINGS_ENABLE_POST_RECV_PRINTS && ud_c->enable_prints) colored_printf(YELLOW, "vvv Post Receives: %s %d\n", ud_c->qp_name, pkts_polled); if (WINGS_ENABLE_ASSERTIONS) assert(ud_c->max_coalescing != 1 || pkts_polled == msgs_polled); } return msgs_polled + dequed_msgs >= max_msgs_to_poll ? 
max_msgs_to_poll : msgs_polled + dequed_msgs; } /* --------------------------------------------------------------------------- ----------------------------------- CREDITS ---------------------------------- ---------------------------------------------------------------------------*/ static inline uint8_t _wings_node_is_in_membership(uint8_t node_id, bit_vector_t membership) { if (WINGS_ENABLE_ASSERTIONS) assert(node_id < 8); return bv_bit_get(membership, node_id) == 1 ? 1 : 0; } // For all the CREDIT functions --> if its a bcast channel endpoint_id is // ignored static inline uint8_t _wings_has_sufficient_crds_no_polling_membership(ud_channel_t* ud_c, uint8_t endpoint_id, bit_vector_t* membership) { uint8_t check_membership = membership == NULL ? 0 : 1; if (ud_c->disable_crd_ctrl) return 1; else if (!ud_c->is_bcast_channel) return (uint8_t)(ud_c->credits_per_channels[endpoint_id] > 0); else for (int i = 0; i < ud_c->num_channels; ++i) { if (i == ud_c->channel_id) continue; if (check_membership == 1 && !_wings_node_is_in_membership(i, *membership)) continue; // skip machine if not in membership if (ud_c->credits_per_channels[i] <= 0) return 0; } return 1; } // For all the CREDIT functions --> if its a bcast channel endpoint_id is // ignored static inline uint8_t _wings_has_sufficient_crds_no_polling(ud_channel_t* ud_c, uint8_t endpoint_id) { return _wings_has_sufficient_crds_no_polling_membership(ud_c, endpoint_id, NULL); } static inline uint8_t _wings_has_sufficient_crds_membership(ud_channel_t* ud_c, uint8_t endpoint_id, bit_vector_t* membership) { if (_wings_has_sufficient_crds_no_polling_membership(ud_c, endpoint_id, membership)) return 1; if (ud_c->expl_crd_ctrl) { _wings_poll_crds_and_post_recvs(ud_c->channel_providing_crds); if (_wings_has_sufficient_crds_no_polling_membership(ud_c, endpoint_id, membership)) return 1; } return 0; } static inline uint8_t _wings_has_sufficient_crds(ud_channel_t* ud_c, uint8_t endpoint_id) { if 
(_wings_has_sufficient_crds_no_polling(ud_c, endpoint_id)) return 1; if (ud_c->expl_crd_ctrl) { _wings_poll_crds_and_post_recvs(ud_c->channel_providing_crds); if (_wings_has_sufficient_crds_no_polling(ud_c, endpoint_id)) return 1; } return 0; } static inline void _wings_dec_crds_membership(ud_channel_t* ud_c, uint8_t endpoint_id, bit_vector_t* membership) { uint8_t check_membership = membership == NULL ? 0 : 1; if (ud_c->disable_crd_ctrl) return; if (WINGS_ENABLE_ASSERTIONS) assert(_wings_has_sufficient_crds_no_polling_membership(ud_c, endpoint_id, membership)); if (!ud_c->is_bcast_channel) ud_c->credits_per_channels[endpoint_id]--; else for (int i = 0; i < ud_c->num_channels; ++i) { if (i == ud_c->channel_id) continue; if (check_membership == 1 && !_wings_node_is_in_membership(i, *membership)) continue; // skip machine if not in membership ud_c->credits_per_channels[i]--; } if (WINGS_ENABLE_CREDIT_PRINTS && ud_c->enable_prints) { if (ud_c->is_bcast_channel) endpoint_id = (uint8_t)(ud_c->channel_id == 0 ? 
1 : 0); printf("$$$ Credits: %s \033[31mdecremented\033[0m to %d", ud_c->qp_name, ud_c->credits_per_channels[endpoint_id]); if (ud_c->is_bcast_channel) printf(" (all endpoints)\n"); else printf(" (for endpoint %d)\n", endpoint_id); } } static inline void _wings_dec_crds(ud_channel_t* ud_c, uint8_t endpoint_id) { _wings_dec_crds_membership(ud_c, endpoint_id, NULL); } static inline void wings_reset_credits(ud_channel_t* ud_c, uint8_t endpoint_id) { ud_c->credits_per_channels[endpoint_id] = (uint16_t)ud_c->channel_providing_crds->num_crds_per_channel; } /* --------------------------------------------------------------------------- ----------------------------------- SENDs ------------------------------------ ---------------------------------------------------------------------------*/ static inline void _wings_forge_crd_wr(ud_channel_t* ud_c, uint16_t dst_qp_id, uint16_t crd_pkts_to_send, uint16_t crd_to_send) { if (WINGS_ENABLE_ASSERTIONS) assert(ud_c->type == CRD); ud_c->send_wr[crd_pkts_to_send].send_flags = IBV_SEND_INLINE; ud_c->send_wr[crd_pkts_to_send].wr.ud.ah = ud_c->remote_qps[dst_qp_id].ah; ud_c->send_wr[crd_pkts_to_send].wr.ud.remote_qpn = ud_c->remote_qps[dst_qp_id].qpn; ((wings_crd_t*)&ud_c->send_wr[crd_pkts_to_send].imm_data)->crd_num = crd_to_send; if (ud_c->enable_stats) ud_c->stats.send_total_msgs += crd_to_send; if (crd_pkts_to_send > 0) ud_c->send_wr[crd_pkts_to_send - 1].next = &ud_c->send_wr[crd_pkts_to_send]; // Selective Signaling --> Do a Signaled Send every ss_granularity pkts if (ud_c->total_pkts_send % ud_c->ss_granularity == 0) { // if not the first SS --> poll the previous SS completion if (ud_c->total_pkts_send > 0) { struct ibv_wc signal_send_wc; hrd_poll_cq(ud_c->send_cq, 1, &signal_send_wc); if (ud_c->enable_stats) ud_c->stats.ss_completions++; if (WINGS_ENABLE_SS_PRINTS && ud_c->enable_prints) colored_printf(RED, "^^^ Polled SS completion: %s %d (total %d)\n", ud_c->qp_name, 1, ud_c->stats.ss_completions); } 
ud_c->send_wr[crd_pkts_to_send].send_flags |= IBV_SEND_SIGNALED; if (WINGS_ENABLE_SS_PRINTS && ud_c->enable_prints) colored_printf(RED, "vvv Send SS: %s\n", ud_c->qp_name); } ud_c->total_pkts_send++; } static inline void _wings_forge_wr(ud_channel_t* ud_c, uint8_t dst_qp_id, uint8_t* req_to_copy, uint16_t pkts_in_batch, uint16_t* msgs_in_batch, copy_and_modify_input_elem_t copy_and_modify_elem, uint8_t is_small_msg) // dst_qp_id is ignored if its a bcast channel { struct ibv_wc signal_send_wc; uint8_t curr_req_num = 1; uint8_t* next_req_ptr; if (ud_c->is_header_only) next_req_ptr = ((wings_hdr_only_t*)&ud_c->send_wr[pkts_in_batch].imm_data) ->inlined_payload; else { wings_ud_send_pkt_t* curr_pkt_ptr = _wings_curr_send_pkt_ptr(ud_c); next_req_ptr = _wings_get_n_msg_ptr_from_send_pkt(ud_c, curr_pkt_ptr, curr_pkt_ptr->req_num); curr_req_num = ++curr_pkt_ptr->req_num; curr_pkt_ptr->sender_id = ud_c->channel_id; uint16_t msg_size = is_small_msg == 1 ? ud_c->small_msg_size : ud_c->max_msg_size; ud_c->send_sgl[pkts_in_batch].length = sizeof(wings_ud_send_pkt_t) + // ud_c->max_msg_size * // curr_pkt_ptr->req_num; msg_size * curr_pkt_ptr->req_num; if (WINGS_ENABLE_ASSERTIONS) assert(is_small_msg == 1 || curr_req_num == 1); // we only do coalescing for small msgs if (curr_req_num == 1) { ud_c->send_sgl[pkts_in_batch].addr = (uint64_t)curr_pkt_ptr; #if WINGS_ENABLE_TWO_MSG_SIZES == 1 curr_pkt_ptr->only_small_msgs = is_small_msg == 1 ? 
1 : 0; #endif } } // --> callback func that copies and manipulated data // from req_to_copy buff copy_and_modify_elem(next_req_ptr, req_to_copy); if (WINGS_ENABLE_ASSERTIONS) { assert(dst_qp_id != machine_id || ud_c->is_bcast_channel); assert(curr_req_num <= ud_c->max_coalescing); } if (ud_c->enable_stats) ud_c->stats.send_total_msgs++; if (curr_req_num == 1) { if (!ud_c->is_bcast_channel) { // set the dst qp ud_c->send_wr[pkts_in_batch].wr.ud.ah = ud_c->remote_qps[dst_qp_id].ah; ud_c->send_wr[pkts_in_batch].wr.ud.remote_qpn = ud_c->remote_qps[dst_qp_id].qpn; } uint16_t wr_idx = (uint16_t)(pkts_in_batch * (ud_c->is_bcast_channel ? ud_c->num_channels - 1 : 1)); ud_c->send_wr[wr_idx].send_flags = ud_c->is_inlining_enabled ? IBV_SEND_INLINE : 0; if (wr_idx > 0) // set previous send_wr to point to curr ud_c->send_wr[wr_idx - 1].next = &ud_c->send_wr[wr_idx]; // Selective Signaling --> Do a Signaled Send every ss_granularity pkts if (ud_c->total_pkts_send % ud_c->ss_granularity == 0) { // if not the first SS --> poll the previous SS completion if (ud_c->total_pkts_send > 0) { hrd_poll_cq(ud_c->send_cq, 1, &signal_send_wc); if (ud_c->enable_stats) ud_c->stats.ss_completions++; if (WINGS_ENABLE_SS_PRINTS && ud_c->enable_prints) colored_printf(RED, "^^^ Polled SS completion: %s %d (total %d)\n", ud_c->qp_name, 1, ud_c->stats.ss_completions); } ud_c->send_wr[wr_idx].send_flags |= IBV_SEND_SIGNALED; if (WINGS_ENABLE_SS_PRINTS && ud_c->enable_prints) colored_printf(RED, "vvv Send SS: %s\n", ud_c->qp_name); } ud_c->total_pkts_send++; } (*msgs_in_batch)++; } static inline void _wings_batch_pkts_2_NIC(ud_channel_t* ud_c, uint16_t pkts_in_batch, uint16_t msgs_in_batch) { int ret; struct ibv_send_wr* bad_send_wr; if (ud_c->enable_stats) ud_c->stats.send_total_pkts += pkts_in_batch; uint16_t remote_channels = (uint16_t)(ud_c->num_channels - 1); uint16_t wr_idx = (uint16_t)(pkts_in_batch * (ud_c->is_bcast_channel ? 
remote_channels : 1)); ud_c->send_wr[wr_idx - 1].next = NULL; if (WINGS_ENABLE_ASSERTIONS) { assert(pkts_in_batch <= ud_c->max_send_wrs); assert(pkts_in_batch <= ud_c->send_pkt_buff_len); assert(ud_c->type == CRD || ud_c->max_coalescing > 1 || msgs_in_batch == pkts_in_batch); assert(ud_c->type == CRD || ud_c->max_coalescing > 1 || ud_c->stats.send_total_msgs == ud_c->stats.send_total_pkts); assert(ud_c->send_wr[wr_idx - 1].next == NULL); for (int i = 0; i < wr_idx; ++i) { uint16_t sgl_idx = (uint16_t)(i / (ud_c->is_bcast_channel ? remote_channels : 1)); if (ud_c->type != CRD && !ud_c->is_header_only) { assert(ud_c->send_wr[i].num_sge == 1); assert(ud_c->send_wr[i].opcode == IBV_WR_SEND); assert(ud_c->send_wr[i].sg_list == &ud_c->send_sgl[sgl_idx]); wings_ud_send_pkt_t* curr_send_pkt = (wings_ud_send_pkt_t*)ud_c->send_sgl[sgl_idx].addr; assert(curr_send_pkt->req_num > 0); } else { assert(ud_c->send_wr[i].num_sge == 0); assert(ud_c->send_wr[i].sg_list->length == 0); assert(ud_c->send_wr[i].opcode == IBV_WR_SEND_WITH_IMM); if (ud_c->type == CRD) { assert(((wings_crd_t*)&(ud_c->send_wr[i].imm_data))->crd_num > 0); assert(((wings_crd_t*)&(ud_c->send_wr[i].imm_data))->sender_id == ud_c->channel_id); } else assert(((wings_hdr_only_t*)&(ud_c->send_wr[i].imm_data))->sender_id == ud_c->channel_id); } assert(ud_c->send_wr[i].wr.ud.remote_qkey == HRD_DEFAULT_QKEY); assert(i == wr_idx - 1 || ud_c->send_wr[i].next == &ud_c->send_wr[i + 1]); assert(!ud_c->is_inlining_enabled || ud_c->send_wr[i].send_flags == IBV_SEND_INLINE || ud_c->send_wr[i].send_flags == (IBV_SEND_INLINE | IBV_SEND_SIGNALED)); } } if (WINGS_ENABLE_SEND_PRINTS && ud_c->enable_prints) // TODO make this work w/ bcasts colored_printf(CYAN, ">>> Send: %d packets %s %d (Total packets: %d, msgs: %d)\n", pkts_in_batch, ud_c->qp_name, msgs_in_batch, ud_c->stats.send_total_pkts, ud_c->stats.send_total_msgs); ret = ibv_post_send(ud_c->qp, ud_c->send_wr, &bad_send_wr); CPE(ret, "ibv_post_send error while sending msgs to 
the NIC", ret); } static inline void _wings_check_if_batch_n_inc_pkt_ptr(ud_channel_t* ud_c, uint16_t* pkts_in_batch_ptr, uint16_t* msgs_in_batch_ptr) { (*pkts_in_batch_ptr)++; uint16_t send_pkts = *pkts_in_batch_ptr; uint16_t total_msgs_in_batch = *msgs_in_batch_ptr; uint16_t max_pkt_batch = ud_c->is_bcast_channel ? ud_c->max_pcie_bcast_batch : ud_c->max_send_wrs; if (send_pkts == max_pkt_batch) { _wings_batch_pkts_2_NIC(ud_c, send_pkts, total_msgs_in_batch); *pkts_in_batch_ptr = 0; *msgs_in_batch_ptr = 0; } _wings_inc_send_push_ptr(ud_c); // go to the next pkt } static inline uint8_t wings_set_sender_id_n_msg_type(uint8_t sender_id, uint8_t is_small_msg) { if (WINGS_ENABLE_ASSERTIONS) { assert(sender_id < 128); assert(is_small_msg == 0 || is_small_msg == 1); } return (is_small_msg == 0) ? sender_id + 128 : sender_id; } static inline uint8_t _wings_get_sender_id_n_msg_type(uint8_t skip_or_sender_id, uint8_t* is_small_msg) { if (WINGS_ENABLE_ASSERTIONS) assert(skip_or_sender_id < 258); *is_small_msg = (skip_or_sender_id >= 128) ? 0 : 1; return (skip_or_sender_id >= 128) ? skip_or_sender_id - 128 : skip_or_sender_id; } static inline uint8_t wings_issue_pkts(ud_channel_t* ud_c, bit_vector_t* membership, uint8_t* input_array_of_elems, uint16_t input_array_len, uint16_t size_of_input_elems, uint16_t* input_array_rolling_idx, skip_input_elem_or_get_dst_id_t skip_or_get_sender_id_func_ptr, modify_input_elem_after_send_t modify_elem_after_send, copy_and_modify_input_elem_t copy_and_modify_elem) { uint8_t curr_msg_dst; uint8_t is_small_msg = 0; uint8_t last_msg_dst = 255; uint8_t has_outstanding_msgs = 0; uint16_t msgs_in_batch = 0, pkts_in_batch = 0, idx = 0; if (WINGS_ENABLE_ASSERTIONS) assert(ud_c->is_header_only || _wings_curr_send_pkt_ptr(ud_c)->req_num == 0); for (int i = 0; i < input_array_len; i++) { idx = (uint16_t)(input_array_rolling_idx == NULL ? i : (i + *input_array_rolling_idx) % input_array_len); // Skip or Respond (copy and send ?) 
uint8_t* curr_elem = &input_array_of_elems[idx * size_of_input_elems]; int skip_or_sender_id = skip_or_get_sender_id_func_ptr(curr_elem); if (skip_or_sender_id < 0) continue; if (WINGS_ENABLE_ASSERTIONS) assert(skip_or_sender_id < 258); curr_msg_dst = _wings_get_sender_id_n_msg_type(skip_or_sender_id, &is_small_msg); if (ud_c->is_header_only) is_small_msg = 1; // Break if we do not have sufficient credits if (!_wings_has_sufficient_crds_membership(ud_c, curr_msg_dst, membership)) { has_outstanding_msgs = 1; if (ud_c->enable_stats) ud_c->stats.no_stalls_due_to_credits++; if (input_array_rolling_idx != NULL) *input_array_rolling_idx = idx; break; // we need to break for broadcast (lets assume it is ok to break // for unicasts as well since it may only harm perf) } _wings_dec_crds_membership(ud_c, curr_msg_dst, membership); if ((!ud_c->is_bcast_channel && !ud_c->is_header_only) || is_small_msg == 0) { // Send unicasts because if we cannot coalesce pkts, due to different // endpoints if (_wings_curr_send_pkt_ptr(ud_c)->req_num > 0 && (is_small_msg == 0 || curr_msg_dst != last_msg_dst)) _wings_check_if_batch_n_inc_pkt_ptr(ud_c, &pkts_in_batch, &msgs_in_batch); } last_msg_dst = curr_msg_dst; // Create the messages _wings_forge_wr(ud_c, curr_msg_dst, curr_elem, pkts_in_batch, &msgs_in_batch, copy_and_modify_elem, is_small_msg); modify_elem_after_send(curr_elem); // E.g. 
Change the state of the element // which triggered a send // Check if we should send a batch since we might have reached the max batch // size if (is_small_msg == 0 || ud_c->is_header_only || _wings_curr_send_pkt_ptr(ud_c)->req_num == ud_c->max_coalescing) { _wings_check_if_batch_n_inc_pkt_ptr(ud_c, &pkts_in_batch, &msgs_in_batch); } } // Even if the last pkt is not full do the appropriate actions and incl to NIC // batch wings_ud_send_pkt_t* curr_pkt_ptr = NULL; if (!ud_c->is_header_only && is_small_msg == 1) { curr_pkt_ptr = _wings_curr_send_pkt_ptr(ud_c); if (curr_pkt_ptr->req_num > 0 && curr_pkt_ptr->req_num < ud_c->max_coalescing) pkts_in_batch++; } // Force a batch to send the last set of requests (even < max batch size) if (pkts_in_batch > 0) _wings_batch_pkts_2_NIC(ud_c, pkts_in_batch, msgs_in_batch); if (!ud_c->is_header_only && is_small_msg == 1) // Move to next packet and reset data left from previous bcasts/unicasts if (curr_pkt_ptr->req_num > 0 && curr_pkt_ptr->req_num < ud_c->max_coalescing) _wings_inc_send_push_ptr(ud_c); return has_outstanding_msgs; } static inline void wings_issue_credits( ud_channel_t* ud_c, bit_vector_t* membership, uint8_t* input_array_of_elems, uint16_t input_array_len, uint16_t size_of_input_elems, skip_input_elem_or_get_dst_id_t skip_or_get_sender_id_func_ptr, modify_input_elem_after_send_t modify_elem_after_send) { if (WINGS_ENABLE_ASSERTIONS) assert(ud_c->type == CRD); for (int i = 0; i < ud_c->num_channels; ++i) ud_c->no_crds_to_send_per_endpoint[i] = 0; for (int i = 0; i < input_array_len; ++i) { // Skip or Respond (copy and send ?) 
uint8_t* curr_elem = &input_array_of_elems[i * size_of_input_elems]; int skip_or_sender_id = skip_or_get_sender_id_func_ptr(curr_elem); if (WINGS_ENABLE_ASSERTIONS) assert(skip_or_sender_id < 255); if (skip_or_sender_id < 0) continue; uint8_t curr_msg_dst = (uint8_t)skip_or_sender_id; // Check if we have sufficient credits --> (we should always have enough // credits for CRDs) if (!_wings_has_sufficient_crds_membership(ud_c, curr_msg_dst, membership)) assert(0); if (ud_c->no_crds_to_send_per_endpoint[curr_msg_dst] == 0 && ud_c->credits_per_channels[curr_msg_dst] == 0) assert(0); _wings_dec_crds_membership(ud_c, curr_msg_dst, membership); ud_c->no_crds_to_send_per_endpoint[curr_msg_dst]++; modify_elem_after_send(curr_elem); // E.g. Change the state of the element // which triggered a send } uint16_t send_crd_packets = 0, total_credits_to_send = 0; for (uint16_t i = 0; i < ud_c->num_channels; ++i) { if (i == ud_c->channel_id) continue; if (ud_c->no_crds_to_send_per_endpoint[i] > 0) { _wings_forge_crd_wr(ud_c, i, send_crd_packets, ud_c->no_crds_to_send_per_endpoint[i]); send_crd_packets++; total_credits_to_send += ud_c->no_crds_to_send_per_endpoint[i]; if (send_crd_packets == ud_c->max_send_wrs) { _wings_batch_pkts_2_NIC(ud_c, send_crd_packets, total_credits_to_send); send_crd_packets = 0; total_credits_to_send = 0; } } } if (send_crd_packets > 0) _wings_batch_pkts_2_NIC(ud_c, send_crd_packets, total_credits_to_send); } #endif // WINGS_INTERNAL_INLINES_H ================================================ FILE: include/wings/wings_api.h ================================================ // // Created by akatsarakis on 06/02/19. // #ifndef WINGS_API_H #define WINGS_API_H #include "../utils/bit_vector.h" #include "hrd.h" /// WARNING!! /// Accessible functions not defined below (in wings_api.h but exist only in /// wings.h) and starting with underscore /// (i.e. 
"_wings_*") are internal and should not be called directly /// by the application #define WINGS_ENABLE_ASSERTIONS 0 #define WINGS_MAX_SUPPORTED_INLINING 187 #define WINGS_ENABLE_BATCH_POST_RECVS_TO_NIC 1 #define WINGS_ENABLE_STAT_COUNTING 1 #define WINGS_MIN_PCIE_BCAST_BATCH 1 #define WINGS_MIN(x, y) (x < y ? x : y) #define WINGS_ENABLE_PRINTS 0 #define WINGS_ENABLE_SS_PRINTS (1 && WINGS_ENABLE_PRINTS) #define WINGS_ENABLE_SEND_PRINTS (1 && WINGS_ENABLE_PRINTS) #define WINGS_ENABLE_RECV_PRINTS (1 && WINGS_ENABLE_PRINTS) #define WINGS_ENABLE_CREDIT_PRINTS (1 && WINGS_ENABLE_PRINTS) #define WINGS_ENABLE_POST_RECV_PRINTS (1 && WINGS_ENABLE_PRINTS) #define WINGS_IS_ROCE 0 #define MAX_MTU_SIZE 4096 /* Useful when `x = (x + 1) % N` is done in a loop */ #define WINGS_MOD_ADD(x, N) \ do { \ x = x + 1; \ if (x == N) x = 0; \ } while (0) /* ah pointer and qpn are accessed together in the critical path so we are putting them in the same cache line */ typedef struct { struct ibv_ah* ah; uint32_t qpn; // no padding needed- false sharing is not an issue, only fragmentation } qp_info_t; typedef struct { uint8_t only_small_msgs : 1; // support for up to 256 unique senders per // instance (e.g. thread) uint8_t sender_id : 7; // support for up to 128 unique senders per instance // (e.g. thread) uint8_t req_num; // <= max_coalescing of a channel uint8_t reqs[]; // sizeof(req_num * req_size) } wings_pkt_t, wings_ud_send_pkt_t; // Packets with GRH typedef struct { struct ibv_grh grh; wings_pkt_t pkt; } __attribute__((packed)) wings_ud_recv_pkt_t; // rcved rdma ud pkts come with a grh padding typedef struct { uint8_t sender_id; // support for up to 256 unique senders per instance (e.g. // thread) uint16_t crd_num; // credit num } __attribute__((packed)) wings_crd_t; // always send as inlined_payload typedef struct { uint8_t sender_id; // support for up to 256 unique senders per instance (e.g. 
// thread) uint8_t inlined_payload[3]; // available space to be used by the application } __attribute__((packed)) wings_hdr_only_t; // always send as inlined_payload static_assert(sizeof(wings_hdr_only_t) == 4 * sizeof(uint8_t), ""); typedef struct { uint64_t send_total_msgs; uint64_t send_total_pkts; uint64_t send_total_pcie_batches; uint64_t ss_completions; uint64_t recv_total_msgs; uint64_t recv_total_pkts; uint64_t no_stalls_due_to_credits; // number of stalls due to not enough credits } ud_channel_stats_t; enum channel_type { REQ, RESP, CRD }; typedef struct _ud_channel_t { struct ibv_qp* qp; enum channel_type type; uint8_t max_coalescing; uint8_t expl_crd_ctrl; uint8_t disable_crd_ctrl; uint8_t is_header_only; uint8_t is_bcast_channel; uint8_t is_inlining_enabled; struct _ud_channel_t* channel_providing_crds; char* qp_name; uint16_t qp_id; // id of qp in cb uint16_t max_msg_size; uint16_t small_msg_size; uint8_t channel_id; // id of the curr channel (e.g. local node id) uint16_t num_channels; // e.g. 
remote nodes + local node uint16_t num_crds_per_channel; uint16_t* credits_per_channels; // array size of num_channels denoting // available space on remote sides /// Credits refer to msgs irrespective if coalesed or not --> a remote buffer /// must be able to handle max_number_of_msgs * max_coalescing volatile uint8_t* recv_pkt_buff; /// Intermediate buffs where reqs are copied /// when pkts are received wings_ud_send_pkt_t* send_pkt_buff; /// Intermediate buffs where reqs are /// copied when pkts are send uint16_t send_pkt_buff_len; uint16_t recv_pkt_buff_len; uint16_t max_send_wrs; uint16_t max_recv_wrs; uint16_t send_q_depth; uint16_t recv_q_depth; uint16_t ss_granularity; // selective signaling granularity uint16_t max_pcie_bcast_batch; uint64_t total_pkts_send; // used for selective signaling int send_push_ptr; int recv_push_ptr; int recv_pull_ptr; struct ibv_send_wr* send_wr; struct ibv_recv_wr* recv_wr; // Used only to batch post recvs to the NIC struct ibv_sge* send_sgl; struct ibv_sge* recv_sgl; // Used only to batch post recvs to the NIC struct ibv_cq* send_cq; struct ibv_cq* recv_cq; struct ibv_wc* recv_wc; // (size of max_recv_wrs) Used on polling recv req cq // (only for immediates) /// Send wcs are omitted since they are only used for selective signaling /// (within send function calls) struct ibv_mr* send_mem_region; // NULL if inlining is enabled struct ibv_pd* pd; // A protection domain for this ud channel // Remote QPs qp_info_t* remote_qps; // Used only for type == CRD uint16_t* no_crds_to_send_per_endpoint; // Stats ud_channel_stats_t stats; uint8_t enable_overflow_msgs; uint8_t num_overflow_msgs; // msgs in overflow_msg_buff always <= // max_coalescing - 1 uint8_t* overflow_msg_buff; // use to keep message in case of polling // a pkt and it doesn't fit in the recv array we // Toggles uint8_t enable_stats; uint8_t enable_prints; } ud_channel_t; // Define some function pointers used when issuing pkts typedef void 
(*modify_input_elem_after_send_t)(uint8_t*); typedef int (*skip_input_elem_or_get_dst_id_t)( uint8_t*); // Should return -1 to skip otherwise returns the sender id typedef void (*copy_and_modify_input_elem_t)(uint8_t* msg_to_send, uint8_t* triggering_req); static inline void wings_NOP_modify_elem_after_send(uint8_t* req) { /*Do not change anything*/ } /// Init and Util functions void wings_print_ud_c_overview(ud_channel_t* ud_c); void wings_ud_channel_destroy( ud_channel_t* ud_c); // This must be used to destroy all ud_c (both CRD and // typical ud_c) // This is used to int only non-CRDs channels (CRDs are initialized internally) void wings_ud_channel_init(ud_channel_t* ud_c, char* qp_name, enum channel_type type, uint8_t max_coalescing, uint16_t max_req_size, uint16_t small_req_size, uint8_t enable_inlining, uint8_t is_header_only, uint8_t is_bcast, // Credits uint8_t disable_crd_ctrl, uint8_t expl_crd_ctrl, ud_channel_t* linked_channel, uint16_t crds_per_channel, uint16_t num_channels, uint8_t channel_id, // Toggles uint8_t stats_on, uint8_t prints_on); void wings_setup_channel_qps_and_recvs(ud_channel_t** ud_c_array, uint16_t ud_c_num, dbit_vector_t* shared_rdy_var, uint16_t worker_lid); /// Main functions static inline uint16_t wings_poll_buff_and_post_recvs(ud_channel_t* ud_c, uint16_t max_pkts_to_poll, uint8_t* recv_buff_space); static inline uint8_t wings_issue_pkts( ud_channel_t* ud_c, bit_vector_t* membership, uint8_t* input_array_of_elems, uint16_t input_array_len, uint16_t size_of_input_elems, uint16_t* input_array_rolling_idx, skip_input_elem_or_get_dst_id_t skip_or_get_sender_id_func_ptr, modify_input_elem_after_send_t modify_elem_after_send, copy_and_modify_input_elem_t copy_and_modify_elem); static inline void wings_issue_credits( ud_channel_t* ud_c, bit_vector_t* membership, uint8_t* input_array_of_elems, uint16_t input_array_len, uint16_t size_of_input_elems, skip_input_elem_or_get_dst_id_t skip_or_get_sender_id_func_ptr, 
modify_input_elem_after_send_t modify_elem_after_send); #endif // WINGS_API_H ================================================ FILE: src/CR/crKV.c ================================================ // // Created by akatsarakis on 07/03/19. // #include #include ////////////////////////////////////////////////// //////////////////// Chain Replication / CRAQ KVS ////////////////////////////////////////////////// ////////////////////////////////////////////////// //////////// Helper functions //////////////////// static inline uint8_t head_id() { return 0; } static inline uint8_t tail_id() { return machine_num - 1; } //////////// Assertion functions static inline void cr_assertions_inv(spacetime_inv_t* inv_ptr) { assert(inv_ptr->op_meta.ts.version % 2 == 0); assert(inv_ptr->op_meta.opcode == ST_OP_INV || inv_ptr->op_meta.opcode == ST_OP_MEMBERSHIP_CHANGE); assert(inv_ptr->op_meta.val_len == ST_VALUE_SIZE); } //////////// Skip functions static inline uint8_t cr_skip_op(spacetime_op_t* op_ptr) { return (uint8_t)((op_ptr->op_meta.state == ST_PUT_SUCCESS || op_ptr->op_meta.state == ST_IN_PROGRESS_GET || op_ptr->op_meta.state == ST_IN_PROGRESS_PUT) ? 1 : 0); } static inline uint8_t cr_skip_inv(spacetime_inv_t* inv_ptr) { return (uint8_t)(inv_ptr->op_meta.opcode == ST_OP_MEMBERSHIP_CHANGE ? 1 : 0); } static inline uint8_t cr_skip_ack(spacetime_ack_t* ack_ptr) { return (uint8_t)(ack_ptr->opcode == ST_OP_MEMBERSHIP_CHANGE ? 1 : 0); } static inline uint8_t cr_skip_remote_reads(spacetime_op_t* op_ptr) { return (uint8_t)((op_ptr->op_meta.state == ST_EMPTY) ? 1 : 0); } static inline uint8_t cr_skip_remote_writes(spacetime_op_t* op_ptr) { return (uint8_t)((op_ptr->op_meta.state == ST_EMPTY || op_ptr->op_meta.state == ST_PUT_SUCCESS || op_ptr->op_meta.state == ST_IN_PROGRESS_PUT) ? 
1 : 0); } //////////// Exec functions static inline void cr_exec_write(spacetime_op_t* op_ptr, struct mica_op* kv_ptr) { spacetime_object_meta* curr_meta = (spacetime_object_meta*)kv_ptr->value; uint8_t* kv_value_ptr = (uint8_t*)&curr_meta[1]; if (ENABLE_ASSERTIONS) { assert(machine_id == head_id()); // Only head must exec writes assert(op_ptr->op_meta.opcode == ST_OP_PUT); assert(op_ptr->op_meta.val_len == ST_VALUE_SIZE); } op_ptr->op_meta.state = ST_EMPTY; cctrl_lock(&curr_meta->cctrl); switch (curr_meta->state) { case INVALID_STATE: // Do not initiate a new write until you get to valid state if (CR_ENABLE_BLOCKING_INVALID_WRITES_ON_HEAD) { cctrl_unlock_dec_version(&curr_meta->cctrl); op_ptr->op_meta.state = ST_PUT_STALL; break; } case VALID_STATE: curr_meta->state = INVALID_STATE; memcpy(kv_value_ptr, op_ptr->value, ST_VALUE_SIZE); kv_ptr->val_len = op_ptr->op_meta.val_len + sizeof(spacetime_object_meta); cctrl_unlock_inc_version(&curr_meta->cctrl, (uint8_t)machine_id, (uint32_t*)&(op_ptr->op_meta.ts.version)); op_ptr->op_meta.state = ST_PUT_SUCCESS; op_ptr->op_meta.ts.tie_breaker_id = (uint8_t)machine_id; break; default: assert(0); } } static inline void cr_exec_remote_reads(spacetime_op_t* op_ptr, struct mica_op* kv_ptr) { if (ENABLE_ASSERTIONS) { assert(machine_id == tail_id()); assert(op_ptr->op_meta.opcode == ST_OP_GET); } // the following variables used to validate atomicity between a lock-free read // of an object spacetime_object_meta prev_meta; spacetime_object_meta* curr_meta = (spacetime_object_meta*)kv_ptr->value; uint8_t* kv_value_ptr = (uint8_t*)&curr_meta[1]; do { prev_meta = *curr_meta; // switch template with all states switch (curr_meta->state) { case VALID_STATE: memcpy(op_ptr->value, kv_value_ptr, ST_VALUE_SIZE); op_ptr->op_meta.state = ST_GET_COMPLETE; op_ptr->op_meta.val_len = kv_ptr->val_len - sizeof(spacetime_object_meta); break; case INVALID_STATE: default: assert(0); } } while ( !cctrl_timestamp_is_same_and_valid(&prev_meta.cctrl, 
&curr_meta->cctrl)); } static inline void cr_exec_op(spacetime_op_t* op_ptr, struct mica_op* kv_ptr, uint8_t idx) { if (ENABLE_ASSERTIONS) assert(idx < max_batch_size); // the following variables used to validate atomicity between a lock-free read // of an object spacetime_object_meta prev_meta; spacetime_object_meta* curr_meta = (spacetime_object_meta*)kv_ptr->value; uint8_t* kv_value_ptr = (uint8_t*)&curr_meta[1]; if (op_ptr->op_meta.opcode == ST_OP_GET) { // Lock free reads through versioning (successful when version is even) op_ptr->op_meta.state = ST_EMPTY; do { prev_meta = *curr_meta; // switch template with all states switch (curr_meta->state) { case VALID_STATE: memcpy(op_ptr->value, kv_value_ptr, ST_VALUE_SIZE); op_ptr->op_meta.state = ST_GET_COMPLETE; op_ptr->op_meta.val_len = kv_ptr->val_len - sizeof(spacetime_object_meta); break; case INVALID_STATE: if (ENABLE_ASSERTIONS) assert(machine_id != tail_id()); // tail should always be valid op_ptr->op_meta.state = ST_GET_STALL; break; default: assert(0); } } while (!cctrl_timestamp_is_same_and_valid(&prev_meta.cctrl, &curr_meta->cctrl)); if (op_ptr->op_meta.state == ST_GET_STALL) op_ptr->buff_idx = idx; } else if (op_ptr->op_meta.opcode == ST_OP_PUT) { if (machine_id == head_id()) // if it is head cr_exec_write(op_ptr, kv_ptr); else op_ptr->op_meta.state = ST_PUT_SUCCESS; if (op_ptr->op_meta.state == ST_PUT_SUCCESS) // Set idx that we cannot set while dispatching the req op_ptr->buff_idx = idx; } } static inline void cr_complete_local_write(spacetime_op_t* read_write_op, uint8_t idx, const uint64_t* key) { /// completed read / write --> remove it from the ops buffer if (ENABLE_ASSERTIONS) { assert(read_write_op[idx].op_meta.state == ST_IN_PROGRESS_PUT); assert(((uint64_t*)&read_write_op[idx].op_meta.key)[0] == key[0]); } if (read_write_op[idx].op_meta.opcode == ST_OP_PUT) read_write_op[idx].op_meta.state = ST_PUT_COMPLETE; else assert(0); } static inline void cr_exec_inv(spacetime_inv_t* inv_ptr, struct 
mica_op* kv_ptr, spacetime_op_t* read_write_op) { // the following variables used to validate atomicity between a lock-free read // of an object spacetime_object_meta lock_free_meta; spacetime_object_meta* curr_meta = (spacetime_object_meta*)kv_ptr->value; uint8_t* kv_value_ptr = (uint8_t*)&curr_meta[1]; if (ENABLE_ASSERTIONS) assert(inv_ptr->op_meta.opcode == ST_OP_INV); uint32_t debug_cntr = 0; do { // Lock free read of keys meta if (ENABLE_ASSERTIONS) { debug_cntr++; if (debug_cntr == M_4) { printf("Worker stuck on a lock-free read (for INV)\n"); debug_cntr = 0; } } lock_free_meta = *curr_meta; } while (!cctrl_timestamp_is_same_and_valid(&lock_free_meta.cctrl, &curr_meta->cctrl)); // lock and proceed iff remote.TS >= local.TS // inv TS >= local timestamp if (!timestamp_is_smaller(inv_ptr->op_meta.ts.version, inv_ptr->op_meta.ts.tie_breaker_id, lock_free_meta.cctrl.ts.version, lock_free_meta.cctrl.ts.tie_breaker_id)) { // Lock and check again if inv TS > local timestamp cctrl_lock(&curr_meta->cctrl); /// Warning: use op.version + 1 bellow since optik_lock() increases /// curr_meta->version by 1 if (timestamp_is_smaller( curr_meta->cctrl.ts.version - 1, curr_meta->cctrl.ts.tie_breaker_id, inv_ptr->op_meta.ts.version, inv_ptr->op_meta.ts.tie_breaker_id)) { // printf("Received // an invalidation with >= timestamp\n"); /// Update Value, TS and last_writer_id // curr_meta->last_writer_id = // inv_ptr->op_meta.sender; kv_ptr->val_len = inv_ptr->op_meta.val_len + sizeof(spacetime_object_meta); if (ENABLE_ASSERTIONS) { // assert(kv_ptr->val_len == // KVS_VALUE_SIZE //>> SHIFT_BITS); assert(inv_ptr->op_meta.val_len == ST_VALUE_SIZE >> SHIFT_BITS); } memcpy(kv_value_ptr, inv_ptr->value, ST_VALUE_SIZE); /// Update state switch (curr_meta->state) { case VALID_STATE: if (machine_id != tail_id()) // Tail never gets invalid curr_meta->state = INVALID_STATE; break; case INVALID_STATE: break; default: assert(0); } cctrl_unlock_custom_version(&curr_meta->cctrl, 
inv_ptr->op_meta.ts.tie_breaker_id, inv_ptr->op_meta.ts.version); } else if (timestamp_is_equal(curr_meta->cctrl.ts.version - 1, curr_meta->cctrl.ts.tie_breaker_id, inv_ptr->op_meta.ts.version, inv_ptr->op_meta.ts.tie_breaker_id)) assert(0); else cctrl_unlock_dec_version(&curr_meta->cctrl); } inv_ptr->op_meta.opcode = ST_INV_SUCCESS; if (inv_ptr->op_meta.initiator == machine_id && machine_id == tail_id()) cr_complete_local_write(read_write_op, inv_ptr->buff_idx, (uint64_t*)&inv_ptr->op_meta.key); if (ENABLE_ASSERTIONS) assert(inv_ptr->op_meta.opcode == ST_INV_SUCCESS); } static inline void cr_exec_ack(spacetime_ack_t* ack_ptr, struct mica_op* kv_ptr, spacetime_op_t* read_write_op) { if (ENABLE_ASSERTIONS) assert(machine_id != tail_id()); // the following variables used to validate atomicity between a lock-free read // of an object spacetime_object_meta lock_free_read_meta; spacetime_object_meta* curr_meta = (spacetime_object_meta*)kv_ptr->value; if (ack_ptr->opcode != ST_OP_ACK) assert(0); uint32_t debug_cntr = 0; do { // Lock free read of keys meta if (ENABLE_ASSERTIONS) { debug_cntr++; if (debug_cntr == M_4) { printf("Worker stuck on a lock-free read (for ACK)\n"); debug_cntr = 0; } } lock_free_read_meta = *curr_meta; } while (!cctrl_timestamp_is_same_and_valid(&lock_free_read_meta.cctrl, &curr_meta->cctrl)); if (ENABLE_ASSERTIONS) assert(!timestamp_is_smaller(lock_free_read_meta.cctrl.ts.version, lock_free_read_meta.cctrl.ts.tie_breaker_id, ack_ptr->ts.version, ack_ptr->ts.tie_breaker_id)); if (timestamp_is_equal(ack_ptr->ts.version, ack_ptr->ts.tie_breaker_id, lock_free_read_meta.cctrl.ts.version, lock_free_read_meta.cctrl.ts.tie_breaker_id)) { /// Lock and check again if ack TS == last local write cctrl_lock(&curr_meta->cctrl); if (timestamp_is_equal(ack_ptr->ts.version, ack_ptr->ts.tie_breaker_id, curr_meta->cctrl.ts.version - 1, curr_meta->cctrl.ts.tie_breaker_id)) { switch (curr_meta->state) { case INVALID_STATE: curr_meta->state = VALID_STATE; 
ack_ptr->opcode = ST_LAST_ACK_SUCCESS; break; case VALID_STATE: default: assert(0); } } cctrl_unlock_dec_version(&curr_meta->cctrl); } if (machine_id == ack_ptr->initiator) cr_complete_local_write(read_write_op, ack_ptr->buff_idx, (uint64_t*)&ack_ptr->key); ack_ptr->opcode = ST_LAST_ACK_SUCCESS; } //////////// Dispatcher functions static inline uint8_t cr_skip_dispatcher(enum cr_type_t cr_type, void* ptr) { switch (cr_type) { case Local_ops: return cr_skip_op(ptr); case Invs: return cr_skip_inv(ptr); case Acks: return cr_skip_ack(ptr); case Remote_reads: return cr_skip_remote_reads(ptr); case Remote_writes: return cr_skip_remote_writes(ptr); default: assert(0); } } static inline void cr_assertions_dispatcher(enum cr_type_t cr_type, void* ptr) { if (ENABLE_ASSERTIONS) switch (cr_type) { case Invs: cr_assertions_inv(ptr); case Acks: case Remote_writes: case Local_ops: case Remote_reads: break; default: assert(0); } } static inline void cr_exec_dispatcher(enum cr_type_t cr_type, void* op_ptr, struct mica_op* kv_ptr, uint8_t idx, spacetime_op_t* read_write_op) { switch (cr_type) { case Invs: cr_exec_inv(op_ptr, kv_ptr, read_write_op); break; case Acks: cr_exec_ack(op_ptr, kv_ptr, read_write_op); break; case Remote_writes: cr_exec_write(op_ptr, kv_ptr); break; case Local_ops: cr_exec_op(op_ptr, kv_ptr, idx); break; case Remote_reads: cr_exec_remote_reads(op_ptr, kv_ptr); break; default: assert(0); } } ////////////////////////////////////////////////// //////////// Batch function ////////////////////// void cr_batch_ops_to_KVS(enum cr_type_t cr_type, uint8_t* op_array, int op_num, uint16_t sizeof_op_elem, spacetime_op_t* read_write_op) { #if SPACETIME_DEBUG == 1 // assert(kv.hash_table != NULL); assert(op_array != NULL); assert(op_num > 0 && op_num <= CACHE_BATCH_SIZE); assert(resp != NULL); #endif #if SPACETIME_DEBUG == 2 for (I = 0; I < op_num; I++) mica_print_op(&(*op_array)[I]); #endif int key_in_store[CR_MAX_BATCH_SIZE]; // Is this key in the datastore? 
unsigned int tag[CR_MAX_BATCH_SIZE]; // unsigned int bkt[CR_MAX_BATCH_SIZE]; uint64_t bkt[CR_MAX_BATCH_SIZE]; struct mica_bkt* bkt_ptr[CR_MAX_BATCH_SIZE]; struct mica_op* kv_ptr[CR_MAX_BATCH_SIZE]; // Ptr to KV item in log if (ENABLE_ASSERTIONS) assert(read_write_op != NULL || cr_type != Acks); // We first lookup the key in the datastore. // The first two @I loops work for both GETs and PUTs. for (int I = 0; I < op_num; I++) { spacetime_op_meta_t* op_ptr = (spacetime_op_meta_t*)&op_array[sizeof_op_elem * I]; cr_assertions_dispatcher(cr_type, op_ptr); if (cr_skip_dispatcher(cr_type, op_ptr)) continue; bkt[I] = op_ptr->key.bkt & kv.hash_table.bkt_mask; bkt_ptr[I] = &kv.hash_table.ht_index[bkt[I]]; __builtin_prefetch(bkt_ptr[I], 0, 0); tag[I] = op_ptr->key.tag; key_in_store[I] = 0; kv_ptr[I] = NULL; } for (int I = 0; I < op_num; I++) { spacetime_op_meta_t* op_ptr = (spacetime_op_meta_t*)&op_array[sizeof_op_elem * I]; if (cr_skip_dispatcher(cr_type, op_ptr)) continue; for (int j = 0; j < 8; j++) { if (bkt_ptr[I]->slots[j].in_use == 1 && bkt_ptr[I]->slots[j].tag == tag[I]) { uint64_t log_offset = bkt_ptr[I]->slots[j].offset & kv.hash_table.log_mask; // We can interpret the log entry as mica_op, even though it // may not contain the full MICA_MAX_VALUE value. kv_ptr[I] = (struct mica_op*)&kv.hash_table.ht_log[log_offset]; // Small values (1--64 bytes) can span 2 cache lines __builtin_prefetch(kv_ptr[I], 0, 0); __builtin_prefetch((uint8_t*)kv_ptr[I] + 64, 0, 0); // Detect if the head has wrapped around for this index entry if (kv.hash_table.log_head - bkt_ptr[I]->slots[j].offset >= kv.hash_table.log_cap) { kv_ptr[I] = NULL; // If so, we mark it "not found" } break; } } } for (int I = 0; I < op_num; I++) { spacetime_op_meta_t* op_ptr = (spacetime_op_meta_t*)&op_array[sizeof_op_elem * I]; if (cr_skip_dispatcher(cr_type, op_ptr)) continue; if (kv_ptr[I] != NULL) { // We had a tag match earlier. Now compare log entry. 
long long* key_ptr_log = (long long*)kv_ptr[I]; long long* key_ptr_req = (long long*)&op_ptr->key; if (key_ptr_log[1] == key_ptr_req[0]) { // Key Found 8 Byte keys key_in_store[I] = 1; cr_exec_dispatcher(cr_type, op_ptr, kv_ptr[I], (uint8_t)I, read_write_op); } } if (key_in_store[I] == 0) // KVS miss --> We get here if either tag or log key match failed op_ptr->state = ST_MISS; } if (ENABLE_ASSERTIONS) if (cr_type == Acks) for (int I = 0; I < max_batch_size; I++) assert(read_write_op[I].op_meta.opcode == ST_OP_GET || read_write_op[I].op_meta.state == ST_MISS || read_write_op[I].op_meta.state == ST_EMPTY || read_write_op[I].op_meta.state == ST_PUT_STALL || read_write_op[I].op_meta.state == ST_PUT_SUCCESS || read_write_op[I].op_meta.state == ST_PUT_COMPLETE || read_write_op[I].op_meta.state == ST_IN_PROGRESS_PUT || read_write_op[I].op_meta.state == ST_OP_MEMBERSHIP_CHANGE || /// TODO check this read_write_op[I].op_meta.state == ST_IN_PROGRESS_REPLAY); } ================================================ FILE: src/CR/cr_worker.c ================================================ #include #include #include "../../include/utils/concur_ctrl.h" #include "inline-util.h" #include "util.h" /// #include "../../include/utils/time_rdtsc.h" #include "../../include/wings/wings.h" /// static inline uint8_t head_id(void) { return (uint8_t)0; } static inline uint8_t tail_id(void) { return machine_num - 1; } static inline uint8_t next_node_in_chain(void) { return (uint8_t)((machine_id + 1) % machine_num); } static inline uint8_t prev_node_in_chain(void) { return (uint8_t)(machine_id == 0 ? tail_id() : machine_id - 1); } int inv_skip_or_fwd_to_next_node(uint8_t* req) { spacetime_inv_t* inv_req = (spacetime_inv_t*)req; return inv_req->op_meta.opcode == ST_INV_SUCCESS ? 
next_node_in_chain() : -1; // invs should only be fwded to next node } void inv_fwd_modify_elem_after_send(uint8_t* req) { spacetime_inv_t* inv_req = (spacetime_inv_t*)req; // empty inv buffer if (inv_req->op_meta.opcode == ST_INV_SUCCESS || inv_req->op_meta.opcode == ST_OP_MEMBERSHIP_CHANGE) inv_req->op_meta.opcode = ST_EMPTY; else assert(0); } void inv_fwd_copy_and_modify_elem(uint8_t* msg_to_send, uint8_t* triggering_req) { spacetime_inv_t* inv_recv = (spacetime_inv_t*)triggering_req; spacetime_inv_t* inv_to_send = (spacetime_inv_t*)msg_to_send; // Copy op to inv and set opcode memcpy(inv_to_send, inv_recv, sizeof(spacetime_inv_t)); inv_to_send->op_meta.opcode = ST_OP_INV; } int inv_skip_or_get_sender_id(uint8_t* req) { spacetime_op_t* op_req = (spacetime_op_t*)req; if (ENABLE_ASSERTIONS) { assert(is_input_code(op_req->op_meta.opcode)); assert(is_response_code(op_req->op_meta.state) || is_bucket_state_code(op_req->op_meta.state)); } return op_req->op_meta.state == ST_PUT_SUCCESS ? next_node_in_chain() : -1; // since invs should only be fwded to next node } void inv_modify_elem_after_send(uint8_t* req) { spacetime_op_t* op_req = (spacetime_op_t*)req; if (op_req->op_meta.state == ST_PUT_SUCCESS) op_req->op_meta.state = ST_IN_PROGRESS_PUT; else assert(0); } void inv_copy_and_modify_elem(uint8_t* msg_to_send, uint8_t* triggering_req) { if (ENABLE_ASSERTIONS) assert(machine_id == head_id()); spacetime_op_t* op = (spacetime_op_t*)triggering_req; spacetime_inv_t* inv_to_send = (spacetime_inv_t*)msg_to_send; // Copy op to inv, set sender and opcode memcpy(inv_to_send, op, sizeof(spacetime_inv_t)); inv_to_send->op_meta.opcode = ST_OP_INV; inv_to_send->op_meta.initiator = (uint8_t)machine_id; } int remote_write_skip_or_get_sender_id(uint8_t* req) { spacetime_op_t* op_req = (spacetime_op_t*)req; if (ENABLE_ASSERTIONS) { assert(is_input_code(op_req->op_meta.opcode)); assert(is_response_code(op_req->op_meta.state) || is_bucket_state_code(op_req->op_meta.state)); } return 
op_req->op_meta.state == ST_PUT_SUCCESS ? head_id() : -1; // send remote writes to head } void remote_write_copy_and_modify_elem(uint8_t* msg_to_send, uint8_t* triggering_req) { if (ENABLE_ASSERTIONS) assert(machine_id != head_id()); spacetime_op_t* op = (spacetime_op_t*)triggering_req; spacetime_inv_t* inv_to_send = (spacetime_inv_t*)msg_to_send; // Copy op to inv, set sender and opcode memcpy(inv_to_send, op, sizeof(spacetime_inv_t)); inv_to_send->op_meta.state = ST_NEW; inv_to_send->op_meta.opcode = ST_OP_PUT; inv_to_send->initiator = (uint8_t)machine_id; inv_to_send->op_meta.initiator = (uint8_t)machine_id; } int remote_write_head_skip_or_get_sender_id(uint8_t* req) { spacetime_op_t* op_req = (spacetime_op_t*)req; if (ENABLE_ASSERTIONS) { assert(machine_id == head_id()); assert(is_input_code(op_req->op_meta.opcode) || op_req->op_meta.opcode == ST_EMPTY); assert(is_response_code(op_req->op_meta.state) || is_bucket_state_code(op_req->op_meta.state)); } return op_req->op_meta.state == ST_PUT_SUCCESS ? 
next_node_in_chain() : -1; // remote writes must always be fwded to head } void remote_write_head_copy_and_modify_elem(uint8_t* msg_to_send, uint8_t* triggering_req) { spacetime_op_t* op = (spacetime_op_t*)triggering_req; spacetime_inv_t* inv_to_send = (spacetime_inv_t*)msg_to_send; // Copy op to inv, set sender and opcode memcpy(inv_to_send, op, sizeof(spacetime_inv_t)); inv_to_send->op_meta.opcode = ST_OP_INV; inv_to_send->op_meta.initiator = op->initiator; } void remote_write_head_modify_elem_after_send(uint8_t* req) { spacetime_op_t* op_req = (spacetime_op_t*)req; if (op_req->op_meta.state == ST_PUT_SUCCESS) op_req->op_meta.state = ST_SEND_CRD; else assert(0); } void ack_fwd_modify_elem_after_send(uint8_t* req) { spacetime_ack_t* ack_req = (spacetime_ack_t*)req; if (ENABLE_ASSERTIONS) assert(ack_req->opcode == ST_LAST_ACK_SUCCESS); ack_req->opcode = ST_EMPTY; } int ack_fwd_skip_or_get_sender_id(uint8_t* req) { spacetime_ack_t* ack_req = (spacetime_ack_t*)req; if (ack_req->opcode == ST_ACK_SUCCESS) { ack_req->opcode = ST_EMPTY; return -1; } else if (ack_req->opcode == ST_EMPTY) return -1; if (ENABLE_ASSERTIONS) assert(ack_req->opcode == ST_LAST_ACK_SUCCESS); return prev_node_in_chain(); } void ack_fwd_copy_and_modify_elem(uint8_t* msg_to_send, uint8_t* triggering_req) { spacetime_ack_t* ack_to_send = (spacetime_ack_t*)msg_to_send; memcpy(ack_to_send, triggering_req, sizeof(spacetime_ack_t)); // copy req to next_req_ptr ack_to_send->opcode = ST_OP_ACK; } int ack_skip_or_get_sender_id(uint8_t* req) { spacetime_inv_t* inv_req = (spacetime_inv_t*)req; if (ENABLE_ASSERTIONS) assert(inv_req->op_meta.opcode == ST_INV_SUCCESS || inv_req->op_meta.opcode == ST_EMPTY); return prev_node_in_chain(); } void ack_modify_elem_after_send(uint8_t* req) { spacetime_inv_t* inv_req = (spacetime_inv_t*)req; // empty inv buffer if (inv_req->op_meta.opcode == ST_INV_SUCCESS || inv_req->op_meta.opcode == ST_OP_MEMBERSHIP_CHANGE) inv_req->op_meta.opcode = ST_EMPTY; else assert(0); } void 
ack_copy_and_modify_elem(uint8_t* msg_to_send, uint8_t* triggering_req) { spacetime_ack_t* ack_to_send = (spacetime_ack_t*)msg_to_send; spacetime_inv_t* inv_ptr = (spacetime_inv_t*)triggering_req; memcpy(ack_to_send, inv_ptr, sizeof(spacetime_ack_t)); // copy req to next_req_ptr ack_to_send->opcode = ST_OP_ACK; ack_to_send->buff_idx = inv_ptr->buff_idx; } int rem_write_crd_skip_or_get_sender_id(uint8_t* req) { spacetime_op_t* op_ptr = (spacetime_op_t*)req; if (ENABLE_ASSERTIONS) assert(op_ptr->op_meta.state == ST_EMPTY || op_ptr->op_meta.state == ST_SEND_CRD || op_ptr->op_meta.state == ST_PUT_STALL || op_ptr->op_meta.state == ST_PUT_SUCCESS); return op_ptr->op_meta.state == ST_SEND_CRD ? op_ptr->initiator : -1; } void rem_write_crd_modify_elem_after_send(uint8_t* req) { spacetime_op_t* op = (spacetime_op_t*)req; // empty inv buffer if (op->op_meta.state == ST_SEND_CRD) op->op_meta.state = ST_EMPTY; else assert(0); } int inv_crd_skip_or_get_sender_id(uint8_t* req) { spacetime_inv_t* op_ptr = (spacetime_inv_t*)req; if (ENABLE_ASSERTIONS) assert(op_ptr->op_meta.opcode == ST_EMPTY || op_ptr->op_meta.opcode == ST_INV_SUCCESS); return op_ptr->op_meta.opcode == ST_INV_SUCCESS ? prev_node_in_chain() : -1; } void inv_crd_modify_elem_after_send(uint8_t* req) { if (ENABLE_ASSERTIONS) { spacetime_inv_t* op = (spacetime_inv_t*)req; assert(op->op_meta.opcode == ST_INV_SUCCESS); } } int remote_read_skip_or_get_sender_id(uint8_t* req) { spacetime_op_t* op_req = (spacetime_op_t*)req; if (ENABLE_ASSERTIONS) { assert(is_input_code(op_req->op_meta.opcode)); assert(is_response_code(op_req->op_meta.state) || is_bucket_state_code(op_req->op_meta.state)); } return op_req->op_meta.state == ST_GET_STALL ? 
tail_id() : -1; // send remote writes to head } void remote_read_modify_elem_after_send(uint8_t* req) { spacetime_op_t* op_req = (spacetime_op_t*)req; if (op_req->op_meta.state == ST_GET_STALL) op_req->op_meta.state = ST_IN_PROGRESS_GET; else assert(0); } void remote_read_copy_and_modify_elem(uint8_t* msg_to_send, uint8_t* triggering_req) { if (ENABLE_ASSERTIONS) assert(machine_id != tail_id()); spacetime_op_t* op = (spacetime_op_t*)triggering_req; spacetime_op_t* op_to_send = (spacetime_op_t*)msg_to_send; // Copy op to inv, set sender and opcode memcpy(op_to_send, op, sizeof(spacetime_op_t)); op_to_send->op_meta.state = ST_NEW; op_to_send->op_meta.opcode = ST_OP_GET; op_to_send->initiator = (uint8_t)machine_id; op_to_send->op_meta.initiator = (uint8_t)machine_id; } int remote_read_resp_skip_or_get_sender_id(uint8_t* req) { spacetime_op_t* op_req = (spacetime_op_t*)req; if (ENABLE_ASSERTIONS) { if (op_req->op_meta.opcode != ST_OP_GET) { printf("Opcode: %d, state: %d\n", op_req->op_meta.opcode, op_req->op_meta.state); printf("Opcode: %s, state: %s\n", code_to_str(op_req->op_meta.opcode), code_to_str(op_req->op_meta.state)); } assert(op_req->op_meta.opcode == ST_OP_GET); assert(op_req->op_meta.state == ST_GET_COMPLETE); } return op_req->initiator; // send remote writes to head } void remote_read_resp_modify_elem_after_send(uint8_t* req) { spacetime_op_t* op_req = (spacetime_op_t*)req; if (op_req->op_meta.state == ST_GET_COMPLETE) op_req->op_meta.state = ST_EMPTY; else { printf("St_opcode: %s\n", code_to_str(op_req->op_meta.state)); assert(0); } } void remote_read_resp_copy_and_modify_elem(uint8_t* msg_to_send, uint8_t* triggering_req) { if (ENABLE_ASSERTIONS) assert(machine_id == tail_id()); spacetime_op_t* op = (spacetime_op_t*)triggering_req; spacetime_op_t* op_to_send = (spacetime_op_t*)msg_to_send; // Copy op to inv, set sender and opcode memcpy(op_to_send, op, sizeof(spacetime_op_t)); } void print_ops_and_remote_write_ops(spacetime_op_t* ops, spacetime_op_t* 
remote_writes) { // for(int i = 0; i < MAX_BATCH_KVS_OPS_SIZE; ++i) for (int i = 0; i < max_batch_size; ++i) printf("ops[%d]: state-> %s, key-> %lu \n", i, code_to_str(ops[i].op_meta.state), *((uint64_t*)&ops[i].op_meta.key)); if (machine_id == head_id()) // for(int i = 0; i < MAX_BATCH_KVS_OPS_SIZE; ++i) for (int i = 0; i < max_batch_size; ++i) printf("remote_writes[%d]: state-> %s, key-> %lu \n", i, code_to_str(remote_writes[i].op_meta.state), *((uint64_t*)&remote_writes[i].op_meta.key)); } void print_total_stalls_due_to_credits(ud_channel_t* inv_ud_c, ud_channel_t* ack_ud_c, ud_channel_t* rem_writes_ud_c, ud_channel_t* rem_reads_ud_c) { // Stalls colored_printf(GREEN, "$$$ CRD STALLs : %s %d, %s %d, %s %d,", inv_ud_c->qp_name, inv_ud_c->stats.send_total_msgs, ack_ud_c->qp_name, ack_ud_c->stats.send_total_msgs, rem_writes_ud_c->qp_name, rem_writes_ud_c->stats.send_total_msgs); if (CR_ENABLE_REMOTE_READS) colored_printf(GREEN, ", %s %d\n", rem_reads_ud_c->qp_name, rem_reads_ud_c->stats.send_total_msgs); else printf("\n"); } void print_total_send_recv_msgs_n_credits( ud_channel_t* inv_ud_c, ud_channel_t* inv_crd_ud_c, ud_channel_t* ack_ud_c, ud_channel_t* rem_writes_ud_c, ud_channel_t* crd_ud_c, ud_channel_t* rem_reads_ud_c, ud_channel_t* rem_read_resp_ud_c) { // Sends colored_printf(GREEN, "--> Total Send: %s %d", inv_ud_c->qp_name, inv_ud_c->stats.send_total_msgs); if (CR_ENABLE_EARLY_INV_CRDS) colored_printf(GREEN, ", %s %d", inv_crd_ud_c->qp_name, inv_crd_ud_c->stats.send_total_msgs); colored_printf(GREEN, ", %s %d, %s %d, %s %d", ack_ud_c->qp_name, ack_ud_c->stats.send_total_msgs, rem_writes_ud_c->qp_name, rem_writes_ud_c->stats.send_total_msgs, crd_ud_c->qp_name, crd_ud_c->stats.send_total_msgs); if (CR_ENABLE_REMOTE_READS) colored_printf(GREEN, ", %s %d, %s %d\n", rem_reads_ud_c->qp_name, rem_reads_ud_c->stats.send_total_msgs, rem_read_resp_ud_c->qp_name, rem_read_resp_ud_c->stats.send_total_msgs); else printf("\n"); // Receives colored_printf(GREEN, "vvv 
Total Recv: %s %d", inv_ud_c->qp_name, inv_ud_c->stats.recv_total_msgs); if (CR_ENABLE_EARLY_INV_CRDS) colored_printf(GREEN, ", %s %d", inv_crd_ud_c->qp_name, inv_crd_ud_c->stats.recv_total_msgs); colored_printf(GREEN, ", %s %d, %s %d, %s %d", ack_ud_c->qp_name, ack_ud_c->stats.recv_total_msgs, rem_writes_ud_c->qp_name, rem_writes_ud_c->stats.recv_total_msgs, crd_ud_c->qp_name, crd_ud_c->stats.recv_total_msgs); if (CR_ENABLE_REMOTE_READS) colored_printf(GREEN, ", %s %d, %s %d\n", rem_reads_ud_c->qp_name, rem_reads_ud_c->stats.recv_total_msgs, rem_read_resp_ud_c->qp_name, rem_read_resp_ud_c->stats.recv_total_msgs); else printf("\n"); // Credits uint8_t remote_node = (uint8_t)(machine_id == head_id() ? next_node_in_chain() : head_id()); printf("Inv credits: %d, ack credits: %d, remote_write_crds: %d\n", inv_ud_c->credits_per_channels[remote_node], ack_ud_c->credits_per_channels[remote_node], rem_writes_ud_c->credits_per_channels[head_id()]); } static inline void cr_complete_local_reads(spacetime_op_t* remote_reads_resps, uint16_t remote_read_resps_polled, spacetime_op_t* ops) { for (int i = 0; i < remote_read_resps_polled; ++i) { uint16_t idx = remote_reads_resps[i].buff_idx; /// completed read / write --> remove it from the ops buffer if (ENABLE_ASSERTIONS) { assert(ops[idx].op_meta.state == ST_IN_PROGRESS_GET); assert(((uint64_t*)&ops[idx].op_meta.key)[0] == ((uint64_t*)&remote_reads_resps[i].op_meta.key)[0]); } if (ops[idx].op_meta.opcode == ST_OP_GET) ops[idx].op_meta.state = ST_GET_COMPLETE; else assert(0); } } // returns first free slot within a range [start_pos, end_pos) or -1 if all are // occupied static inline int get_first_free_slot(const uint8_t* free_slot_array, uint16_t start_pos, uint16_t end_pos) { if (ENABLE_ASSERTIONS) assert(end_pos > start_pos); for (int i = start_pos; i < end_pos; ++i) if (free_slot_array[i] == 1) return i; return -1; } static inline uint16_t cr_move_stalled_writes_to_top_n_return_free_space(spacetime_op_t* remote_writes) { 
uint8_t free_slot_array[MAX_BATCH_KVS_OPS_SIZE] = {0}; uint16_t free_slots = 0; uint16_t last_free_slot = 0; // used to avoid re-iterating already non-empty slots for (int i = 0; i < max_batch_size; ++i) { if (ENABLE_ASSERTIONS) assert(remote_writes[i].op_meta.state == ST_EMPTY || remote_writes[i].op_meta.state == ST_PUT_STALL || remote_writes[i].op_meta.state == ST_PUT_SUCCESS); if (remote_writes[i].op_meta.state == ST_EMPTY) { free_slots++; free_slot_array[i] = 1; } else if (free_slots > 0 && (remote_writes[i].op_meta.state == ST_PUT_STALL || remote_writes[i].op_meta.state == ST_PUT_SUCCESS)) { int next_free_slot = get_first_free_slot(free_slot_array, last_free_slot, (uint16_t)i); if (next_free_slot > -1) { free_slot_array[i] = 1; free_slot_array[next_free_slot] = 0; last_free_slot = (uint16_t)next_free_slot; // swap stalled request to the first free slot memcpy(&remote_writes[next_free_slot], &remote_writes[i], sizeof(spacetime_op_t)); // empty this slot remote_writes[i].op_meta.state = ST_EMPTY; remote_writes[i].op_meta.opcode = ST_EMPTY; } } } if (ENABLE_ASSERTIONS) for (int i = 0; i < max_batch_size; ++i) { if (i < max_batch_size - free_slots) assert(remote_writes[i].op_meta.state == ST_PUT_STALL || remote_writes[i].op_meta.state == ST_PUT_SUCCESS); else assert(remote_writes[i].op_meta.state == ST_EMPTY); } return free_slots; } static inline void debugg(spacetime_op_t* ops, uint16_t worker_lid, int line_no) { if (w_stats[worker_lid].total_loops > 0) for (int i = 0; i < max_batch_size; ++i) { if (!(ops[i].op_meta.opcode == ST_OP_PUT || ops[i].op_meta.opcode == ST_OP_GET)) printf("Line[%d]--> Op[%d]: %s, loop iter: %llu\n", line_no, i, code_to_str(ops[i].op_meta.opcode), w_stats[worker_lid].total_loops); assert(ops[i].op_meta.opcode == ST_OP_PUT || ops[i].op_meta.opcode == ST_OP_GET); } } void* run_worker(void* arg) { assert(rmw_ratio == 0); assert(is_CR == 1); assert(credits_num % machine_num == 0); // CR ONLY assert(ENABLE_COALESCE_OF_HOT_REQS == 0); /// 
WARNING: only defines (no dynamically passed cli arguments) work for cr /// worker assert(max_coalesce <= MAX_REQ_COALESCE); assert(num_workers <= MAX_WORKERS_PER_MACHINE); assert(max_batch_size <= MAX_BATCH_KVS_OPS_SIZE); assert(credits_num <= MAX_CREDITS_PER_REMOTE_WORKER_CR); const uint16_t credit_num = MAX_CREDITS_PER_REMOTE_WORKER_CR; struct thread_params params = *(struct thread_params*)arg; uint16_t worker_lid = (uint16_t)params.id; // Local ID of this worker thread uint16_t worker_gid = (uint16_t)(machine_id * num_workers + params.id); // Global ID of this worker thread // TODO check if the previous assignment (below is the correct one) // uint16_t worker_gid = (uint16_t) (machine_id * MAX_WORKERS_PER_MACHINE + // params.id); // Global ID of this worker thread /* -------------------------------------------------------- ------------------- RDMA WINGS DECLARATIONS--------------- ---------------------------------------------------------*/ ud_channel_t ud_channels[CR_TOTAL_WORKER_UD_QPs]; ud_channel_t* ud_channel_ptrs[CR_TOTAL_WORKER_UD_QPs]; for (int i = 0; i < CR_TOTAL_WORKER_UD_QPs; ++i) ud_channel_ptrs[i] = &ud_channels[i]; ud_channel_t* inv_ud_c = ud_channel_ptrs[CR_INV_UD_QP_ID]; ud_channel_t* inv_crd_ud_c = ud_channel_ptrs[CR_INV_CRD_UD_QP_ID]; ud_channel_t* ack_ud_c = ud_channel_ptrs[CR_ACK_UD_QP_ID]; ud_channel_t* rem_reads_ud_c = ud_channel_ptrs[CR_REMOTE_READS_UD_QP_ID]; ud_channel_t* rem_read_resp_ud_c = ud_channel_ptrs[CR_REMOTE_READS_RESP_UD_QP_ID]; ud_channel_t* rem_writes_ud_c = ud_channel_ptrs[CR_REMOTE_WRITES_UD_QP_ID]; ud_channel_t* rem_writes_crd_ud_c = ud_channel_ptrs[CR_REMOTE_WRITE_CRD_UD_QP_ID]; const uint8_t is_bcast = 0; const uint8_t stats_on = 1; const uint8_t prints_on = 1; const uint8_t is_hdr_only = 0; const uint8_t expl_crd_ctrl = 0; const uint8_t disable_crd_ctrl = 0; char inv_qp_name[200], ack_qp_name[200], rem_writes_qp_name[200], rem_reads_qp_name[200], rem_read_resps_qp_name[200]; sprintf(inv_qp_name, "%s[%d]", 
"\033[31mINV\033[0m", worker_lid); sprintf(ack_qp_name, "%s[%d]", "\033[33mACK\033[0m", worker_lid); sprintf(rem_writes_qp_name, "%s[%d]", "\033[1m\033[32mREMOTE_WRITES\033[0m", worker_lid); sprintf(rem_reads_qp_name, "%s[%d]", "\033[1m\033[32mREMOTE_READS\033[0m", worker_lid); sprintf(rem_read_resps_qp_name, "%s[%d]", "\033[1m\033[32mREMOTE_READ_RESPS\033[0m", worker_lid); uint8_t inv_inlining = (DISABLE_INLINING == 0 && max_coalesce * sizeof(spacetime_inv_t) < WINGS_MAX_SUPPORTED_INLINING) ? 1 : 0; uint8_t ack_inlining = (DISABLE_INLINING == 0 && max_coalesce * sizeof(spacetime_ack_t) < WINGS_MAX_SUPPORTED_INLINING) ? 1 : 0; uint8_t rem_writes_inlining = inv_inlining; uint8_t rem_reads_inlining = inv_inlining; if (CR_ENABLE_EARLY_INV_CRDS) { wings_ud_channel_init(inv_ud_c, inv_qp_name, REQ, MAX_REQ_COALESCE, sizeof(spacetime_inv_t), 0, inv_inlining, is_hdr_only, is_bcast, disable_crd_ctrl, 1, inv_crd_ud_c, credit_num, machine_num, (uint8_t)machine_id, stats_on, prints_on); wings_ud_channel_init( ack_ud_c, ack_qp_name, RESP, MAX_REQ_COALESCE, sizeof(spacetime_ack_t), 0, ack_inlining, is_hdr_only, is_bcast, 1, expl_crd_ctrl, NULL, CR_ACK_CREDITS, machine_num, (uint8_t)machine_id, stats_on, prints_on); } else { wings_ud_channel_init(inv_ud_c, inv_qp_name, REQ, MAX_REQ_COALESCE, sizeof(spacetime_inv_t), 0, inv_inlining, is_hdr_only, is_bcast, disable_crd_ctrl, expl_crd_ctrl, ack_ud_c, credit_num, machine_num, (uint8_t)machine_id, stats_on, prints_on); wings_ud_channel_init(ack_ud_c, ack_qp_name, RESP, MAX_REQ_COALESCE, sizeof(spacetime_ack_t), 0, ack_inlining, is_hdr_only, is_bcast, disable_crd_ctrl, expl_crd_ctrl, inv_ud_c, credit_num, machine_num, (uint8_t)machine_id, stats_on, prints_on); } const uint16_t cr_remote_write_credits = credit_num / machine_num; wings_ud_channel_init( rem_writes_ud_c, rem_writes_qp_name, REQ, MAX_REQ_COALESCE, sizeof(spacetime_op_t), 0, rem_writes_inlining, is_hdr_only, is_bcast, disable_crd_ctrl, 1, rem_writes_crd_ud_c, 
cr_remote_write_credits, machine_num, (uint8_t)machine_id, stats_on, prints_on); /////////////// ///< 4th stage> if (CR_ENABLE_REMOTE_READS) { wings_ud_channel_init(rem_reads_ud_c, rem_reads_qp_name, REQ, MAX_REQ_COALESCE, sizeof(spacetime_op_t), 0, rem_reads_inlining, is_hdr_only, is_bcast, disable_crd_ctrl, expl_crd_ctrl, rem_read_resp_ud_c, CR_REMOTE_READS_CREDITS, machine_num, (uint8_t)machine_id, stats_on, prints_on); wings_ud_channel_init(rem_read_resp_ud_c, rem_read_resps_qp_name, RESP, MAX_REQ_COALESCE, sizeof(spacetime_op_t), 0, rem_reads_inlining, is_hdr_only, is_bcast, disable_crd_ctrl, expl_crd_ctrl, rem_reads_ud_c, CR_REMOTE_READS_CREDITS, machine_num, (uint8_t)machine_id, stats_on, prints_on); } /// /////////////// wings_setup_channel_qps_and_recvs(ud_channel_ptrs, CR_TOTAL_WORKER_UD_QPs, g_share_qs_barrier, worker_lid); /* ------------------------------------------------------- ------------------- OTHER DECLARATIONS-------------------- ---------------------------------------------------------*/ // Intermediate buffs where reqs are copied from incoming_* buffs in order to // get passed to the KVS spacetime_op_t* ops; spacetime_inv_t* inv_recv_ops; spacetime_ack_t* ack_recv_ops; spacetime_val_t* val_recv_ops; // UNUSED! 
uint32_t coh_ops_len = (uint32_t)(credits_num * machine_num * max_coalesce); // credits * remote_machines * max_req_coalesce setup_kvs_buffs(&ops, &inv_recv_ops, &ack_recv_ops, &val_recv_ops); // Remote writes init spacetime_op_t* remote_writes = memalign(4096, max_batch_size * (sizeof(spacetime_op_t))); memset(remote_writes, 0, max_batch_size * (sizeof(spacetime_op_t))); for (int i = 0; i < max_batch_size; ++i) { remote_writes[i].op_meta.state = ST_EMPTY; remote_writes[i].op_meta.opcode = ST_EMPTY; } /////////////// ///< 4th stage> // Remote reads buffer: used for polling remote reads on tail & remote read // responses on the rest nodes spacetime_op_t* remote_reads = memalign(4096, max_batch_size * (sizeof(spacetime_op_t))); memset(remote_reads, 0, max_batch_size * (sizeof(spacetime_op_t))); for (int i = 0; i < max_batch_size; ++i) { remote_reads[i].op_meta.state = ST_EMPTY; remote_reads[i].op_meta.opcode = ST_EMPTY; } /// /////////////// struct spacetime_trace_command* trace; trace_init(&trace, worker_gid); //// spacetime_op_t* n_hottest_keys_in_ops_get[COALESCE_N_HOTTEST_KEYS]; spacetime_op_t* n_hottest_keys_in_ops_put[COALESCE_N_HOTTEST_KEYS]; for (int i = 0; i < COALESCE_N_HOTTEST_KEYS; ++i) { n_hottest_keys_in_ops_get[i] = NULL; n_hottest_keys_in_ops_put[i] = NULL; } //// uint8_t has_outstanding_invs = 0; uint8_t has_outstanding_rem_writes = 0; uint32_t trace_iter = 0; uint16_t rolling_idx = 0, remote_reads_rolling_idx = 0; uint16_t invs_polled = 0, acks_polled = 0, remote_writes_polled = 0; uint32_t num_of_iters_serving_op[MAX_BATCH_KVS_OPS_SIZE] = {0}; uint16_t free_rem_write_slots = max_batch_size; /// Spawn stats thread if (worker_lid == 0) if (spawn_stats_thread() != 0) colored_printf(RED, "Stats thread was not successfully spawned \n"); struct timespec stopwatch_for_req_latency; /* ----------------------------------------------------- ------------------------Main Loop-------------------- ----------------------------------------------------- */ while 
(true) { if (unlikely(w_stats[worker_lid].total_loops % M_16 == 0)) { // Check something periodically // print_total_stalls_due_to_credits(inv_ud_c, ack_ud_c, // rem_writes_ud_c, rem_reads_ud_c); // print_total_send_recv_msgs_n_credits(inv_ud_c, // inv_crd_ud_c, ack_ud_c, // rem_writes_ud_c, rem_writes_crd_ud_c, // rem_reads_ud_c, rem_read_resp_ud_c); // print_ops_and_remote_write_ops(ops, remote_writes); } /// DONE // 1st stage: head only initiate requests // [DONE] 2nd stage: + rest nodes initiate (local) reads // [DONE] 3rd stage: + rest nodes initiate (remote) writes via head [DONE] // 4th stage: + rest nodes initiate remote reads when invalid [DONE] // 5th stage: + add early INV credits to pipeline more reqs [DONE] // 6th stage: + poll for remote writes even though stalled exist [DONE] // 7th stage: + poll for messages instead of pkts (ie if you have // empty space buff slots < max_coalesce poll pkt // and buffer additional packets [DONE] // 8th stage: + Do not stall writes that found Invalid on head [DONE] if (!CR_ENABLE_ONLY_HEAD_REQS || machine_id == head_id()) { refill_ops(&trace_iter, worker_lid, trace, ops, num_of_iters_serving_op, &stopwatch_for_req_latency, n_hottest_keys_in_ops_get, n_hottest_keys_in_ops_put); cr_batch_ops_to_KVS(Local_ops, (uint8_t*)ops, max_batch_size, sizeof(spacetime_op_t), NULL); // TODO: moved stop_latency_of_completed_reads(ops, worker_lid, &stopwatch_for_req_latency); } if (update_ratio > 0) { if (machine_id == head_id()) { const uint16_t max_outstanding_writes = (machine_num - 1) * CR_ACK_CREDITS; if (!CR_ENABLE_EARLY_INV_CRDS || inv_ud_c->stats.send_total_msgs - ack_ud_c->stats.recv_total_msgs <= max_outstanding_writes) { /// Initiate INVs for head writes wings_issue_pkts( inv_ud_c, NULL, (uint8_t*)ops, max_batch_size, sizeof(spacetime_op_t), &rolling_idx, inv_skip_or_get_sender_id, inv_modify_elem_after_send, inv_copy_and_modify_elem); } /////////////// ///< 3rd stage> if (!CR_ENABLE_ONLY_HEAD_REQS) { 
wings_poll_buff_and_post_recvs( rem_writes_ud_c, free_rem_write_slots, (uint8_t*)&remote_writes[max_batch_size - free_rem_write_slots]); cr_batch_ops_to_KVS(Remote_writes, (uint8_t*)remote_writes, max_batch_size, sizeof(spacetime_op_t), NULL); if (!CR_ENABLE_EARLY_INV_CRDS || inv_ud_c->stats.send_total_msgs - ack_ud_c->stats.recv_total_msgs <= max_outstanding_writes) { /// Initiate INVs for remotes /// writes wings_issue_pkts(inv_ud_c, NULL, (uint8_t*)remote_writes, max_batch_size, sizeof(spacetime_op_t), NULL, remote_write_head_skip_or_get_sender_id, remote_write_head_modify_elem_after_send, remote_write_head_copy_and_modify_elem); /// Issue credits for remotes writes wings_issue_credits(rem_writes_crd_ud_c, NULL, (uint8_t*)remote_writes, max_batch_size, sizeof(spacetime_op_t), rem_write_crd_skip_or_get_sender_id, rem_write_crd_modify_elem_after_send); } free_rem_write_slots = cr_move_stalled_writes_to_top_n_return_free_space(remote_writes); } } else if (!CR_ENABLE_ONLY_HEAD_REQS) /// Initiate Remote writes wings_issue_pkts(rem_writes_ud_c, NULL, (uint8_t*)ops, max_batch_size, sizeof(spacetime_op_t), &rolling_idx, remote_write_skip_or_get_sender_id, inv_modify_elem_after_send, remote_write_copy_and_modify_elem); /// /////////////// /////////////// ///< 4th stage> if (CR_ENABLE_REMOTE_READS) { if (machine_id == tail_id()) { /// Poll Remote reads uint16_t remote_reads_polled = wings_poll_buff_and_post_recvs( rem_reads_ud_c, max_batch_size, (uint8_t*)remote_reads); /// Batch Remote reads to KVS cr_batch_ops_to_KVS(Remote_reads, (uint8_t*)remote_reads, remote_reads_polled, sizeof(spacetime_op_t), NULL); /// Issue responses of Remote reads wings_issue_pkts(rem_read_resp_ud_c, NULL, (uint8_t*)remote_reads, remote_reads_polled, sizeof(spacetime_op_t), NULL, remote_read_resp_skip_or_get_sender_id, remote_read_resp_modify_elem_after_send, remote_read_resp_copy_and_modify_elem); } else { /// Initiate Remote reads wings_issue_pkts(rem_reads_ud_c, NULL, (uint8_t*)ops, 
max_batch_size, sizeof(spacetime_op_t), &remote_reads_rolling_idx, remote_read_skip_or_get_sender_id, remote_read_modify_elem_after_send, remote_read_copy_and_modify_elem); for (int i = 0; i < max_batch_size; i++) assert(ops[i].op_meta.opcode == ST_OP_PUT || ops[i].op_meta.opcode == ST_OP_GET); /// Poll respsonses of Remote reads uint16_t remote_read_resps_polled = wings_poll_buff_and_post_recvs( rem_read_resp_ud_c, max_batch_size, (uint8_t*)remote_reads); /// Complete Remote reads cr_complete_local_reads(remote_reads, remote_read_resps_polled, ops); stop_latency_of_completed_reads(ops, worker_lid, &stopwatch_for_req_latency); for (int i = 0; i < max_batch_size; i++) assert(ops[i].op_meta.opcode == ST_OP_PUT || ops[i].op_meta.opcode == ST_OP_GET); } } /// /////////////// if (machine_id != head_id()) { /// Poll for INVs if (has_outstanding_invs == 0) { invs_polled = wings_poll_buff_and_post_recvs(inv_ud_c, coh_ops_len, (uint8_t*)inv_recv_ops); if (invs_polled > 0) { /// Batch INVs to KVS cr_batch_ops_to_KVS(Invs, (uint8_t*)inv_recv_ops, invs_polled, sizeof(spacetime_inv_t), ops); if (CR_ENABLE_EARLY_INV_CRDS) /// Issue credits for INVs to previous node in chain wings_issue_credits(inv_crd_ud_c, NULL, (uint8_t*)inv_recv_ops, invs_polled, sizeof(spacetime_inv_t), inv_crd_skip_or_get_sender_id, inv_crd_modify_elem_after_send); } } if (invs_polled > 0) { /// Batch INVs to KVS if (machine_id != tail_id() && machine_id != head_id()) /// Forward INVS to next node in chain has_outstanding_invs = wings_issue_pkts( inv_ud_c, NULL, (uint8_t*)inv_recv_ops, invs_polled, sizeof(spacetime_inv_t), NULL, inv_skip_or_fwd_to_next_node, inv_fwd_modify_elem_after_send, inv_fwd_copy_and_modify_elem); else if (machine_id == tail_id()) { /// Initiate ACKS (forward to prev) has_outstanding_invs = wings_issue_pkts( ack_ud_c, NULL, (uint8_t*)inv_recv_ops, invs_polled, sizeof(spacetime_inv_t), NULL, ack_skip_or_get_sender_id, ack_modify_elem_after_send, ack_copy_and_modify_elem); if 
(ENABLE_ASSERTIONS) assert(ack_ud_c->stats.send_total_msgs == inv_ud_c->stats.recv_total_msgs - inv_ud_c->num_overflow_msgs); } } } if (machine_id != tail_id()) { /// Poll for Acks acks_polled = wings_poll_buff_and_post_recvs(ack_ud_c, coh_ops_len, (uint8_t*)ack_recv_ops); if (acks_polled > 0) { /// Batch ACKs to KVS cr_batch_ops_to_KVS(Acks, (uint8_t*)ack_recv_ops, acks_polled, sizeof(spacetime_ack_t), ops); stop_latency_of_completed_writes(ops, worker_lid, &stopwatch_for_req_latency); } if (machine_id != head_id()) { /// FWD ACKs to previous node if not the Head wings_issue_pkts( ack_ud_c, NULL, (uint8_t*)ack_recv_ops, acks_polled, sizeof(spacetime_ack_t), NULL, ack_fwd_skip_or_get_sender_id, ack_fwd_modify_elem_after_send, ack_fwd_copy_and_modify_elem); if (ENABLE_ASSERTIONS) assert(ack_ud_c->stats.send_total_msgs == ack_ud_c->stats.recv_total_msgs - ack_ud_c->num_overflow_msgs); } else /// empty ack_rcv_ops in head node for (int i = 0; i < coh_ops_len; ++i) ack_recv_ops[i].opcode = ST_EMPTY; } } w_stats[worker_lid].total_loops++; } return NULL; } ================================================ FILE: src/hades/hades.c ================================================ // // Created by akatsarakis on 12/02/19. 
// #include "../../include/hades/hades.h" #include typedef struct { hades_view_t* ctx_last_local_view; uint8_t dst_id; } hades_view_wrapper_w_dst_id_t; int hades_skip_or_get_dst_id(uint8_t* req) { return ((hades_view_wrapper_w_dst_id_t*)req)->dst_id; } void hades_copy_and_modify_elem(uint8_t* msg_to_send, uint8_t* triggering_req) { hades_view_wrapper_w_dst_id_t* last_local_view = (hades_view_wrapper_w_dst_id_t*)triggering_req; hades_view_t* send_hbt = (hades_view_t*)(msg_to_send - 1); *send_hbt = *last_local_view->ctx_last_local_view; } int hades_crd_skip_or_get_sender_id(uint8_t* req) { hades_view_t* req_hbt = (hades_view_t*)req; return req_hbt->node_id; // always send crd } static inline void print_send_hbt(ud_channel_t* hbeat_c, hades_ctx_t* ctx) { colored_printf(YELLOW, "Send view[%lu]: {node %d, epoch_id %d} ", hbeat_c->stats.send_total_msgs, ctx->intermediate_local_view.node_id, ctx->intermediate_local_view.epoch_id); bv_print_enhanced(ctx->curr_g_membership); printf("\n"); } static inline void print_recved_hbts(ud_channel_t* hbeat_c, hades_view_t* hbt_array, uint16_t no_hbts) { for (int i = 0; i < no_hbts; ++i) { colored_printf(GREEN, "Recved view[%lu]: {node %d, epoch_id %d} ", hbeat_c->stats.recv_total_msgs, hbt_array[i].node_id, hbt_array[i].epoch_id); bv_print_enhanced(hbt_array[i].view); printf("\n"); } } static inline uint8_t majority_of_nodes(hades_ctx_t* ctx) { assert(ctx->max_num_nodes > 1); return (uint8_t)(ctx->max_num_nodes == 2 ? 2 : (ctx->max_num_nodes / 2) + 1); } static inline void check_if_majority_is_rechable(hades_ctx_t* h_ctx) { if (bv_no_setted_bits(h_ctx->last_local_view.view) >= majority_of_nodes(h_ctx) && bv_no_setted_bits(h_ctx->intermediate_local_view.view) < majority_of_nodes(h_ctx)) { colored_printf(RED, "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"); colored_printf(RED, "~ [HADES WARNING]: I cannot reach a majority ! 
~\n"); colored_printf(RED, "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"); colored_printf(YELLOW, "Last membership (epoch %d): ", h_ctx->intermediate_local_view.epoch_id); bv_print_enhanced(h_ctx->curr_g_membership); colored_printf(YELLOW, "My current view: "); bv_print_enhanced(h_ctx->intermediate_local_view.view); colored_printf(RED, "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"); } } static inline uint8_t skip_to_apply_fake_link_failure(uint8_t node_id) { static uint8_t ts_is_inited = 0; static uint8_t link_has_failed = 0; static struct timespec ts_fake_link_failure; if ((machine_id == FAKE_LINK_FAILURE_NODE_A && node_id == FAKE_LINK_FAILURE_NODE_B) || (!FAKE_ONE_WAY_LINK_FAILURE && node_id == FAKE_LINK_FAILURE_NODE_A && machine_id == FAKE_LINK_FAILURE_NODE_B)) { if (ts_is_inited == 0) { get_rdtsc_timespec(&ts_fake_link_failure); ts_is_inited = 1; } if (time_elapsed_in_sec(ts_fake_link_failure) > FAKE_LINK_FAILURE_AFTER_SEC && time_elapsed_in_sec(ts_fake_link_failure) < STOP_FAKE_LINK_FAILURE_AFTER_SEC) { if (link_has_failed == 0) { colored_printf(RED, "%sLink failure between node %d and %d\n", FAKE_ONE_WAY_LINK_FAILURE ? "One-way " : "", FAKE_LINK_FAILURE_NODE_A, FAKE_LINK_FAILURE_NODE_B); link_has_failed = 1; } return 1; } } return 0; } static inline uint8_t is_in_membership(hades_ctx_t* h_ctx, uint8_t node_id) { return bv_bit_get(h_ctx->curr_g_membership, node_id); } // Skip iterations for arbitration: static inline uint8_t skip_arbitration(hades_ctx_t* h_ctx, uint8_t i) { if (i == machine_id) return 1; // 1. my local machine id if (!h_ctx->recved_views_flag[i]) return 1; // 2. machine ids that I have not received a view // if(!is_in_membership(h_ctx, i)) return 1; // 3. machine ids that are // not currently in the group membership if (h_ctx->remote_recved_views[i].have_ostracised_for_dst_node == 1) return 1; // 3. this node has not already ostracise someone for me if (!bv_bit_get(h_ctx->remote_recved_views[i] .view, // 4. 
If my node id does not exist in their view machine_id)) return 1; return 0; } // In case of a link failure (either both or one way) between nodes A and B. // Rest of nodes would be able to detect such a failure using its received views // and resolve this deterministically by choosing the one with the highest node // id to be expelled from the group membership. Once a node is voted to be // expelled by the majority of nodes it gets removed from the membership, this // method is inspired by the "ostracism" procedure under the Athenian democracy // in which any citizen could be expelled from the city of Athens for ten years. // If a node has ostracised somebody for me I cannot ostracised somebody for him static inline void view_arbitration_via_ostracism(hades_ctx_t* h_ctx) { for (uint8_t i = 0; i < h_ctx->max_num_nodes; ++i) h_ctx->have_ostracized_for[i] = 0; for (uint8_t i = 0; i < h_ctx->max_num_nodes; ++i) { if (skip_arbitration(h_ctx, i)) continue; for (uint8_t j = 0; j < h_ctx->max_num_nodes; ++j) { if (i >= j) continue; // for efficiency we do not need to check those if (skip_arbitration(h_ctx, j)) continue; uint8_t i_view_of_j = bv_bit_get(h_ctx->remote_recved_views[i].view, j); uint8_t j_view_of_i = bv_bit_get(h_ctx->remote_recved_views[j].view, i); if (i_view_of_j == 0 || j_view_of_i == 0) { // by default always ostracise this to the Max(i, j) --> j is always > i // unless it's an one way failure from the opposite side where we have // to ostracise i uint8_t node_to_ostracise = i_view_of_j == 1 ? i : j; uint8_t node_to_ostracised_for = i_view_of_j == 1 ? 
j : i; h_ctx->recved_views_flag[node_to_ostracise] = 0; h_ctx->have_ostracized_for[node_to_ostracised_for] = 1; bv_bit_reset(&h_ctx->intermediate_local_view.view, node_to_ostracise); // yellow_printf("Ostracism: between nodes %d-%d --> %d // is ostracized\n", i, j, node_to_ostracise); printf("My // view: (epoch %d)\n", // h_ctx->intermediate_local_view.epoch_id); // bv_print_enhanced(h_ctx->intermediate_local_view.view); } } } } static inline uint8_t get_max_received_epoch_id(hades_ctx_t* h_ctx) { uint8_t max_epoch_id = 0; for (int i = 0; i < h_ctx->max_num_nodes; ++i) if (h_ctx->recved_views_flag[i] == 1 && h_ctx->remote_recved_views[i].epoch_id > max_epoch_id) max_epoch_id = h_ctx->remote_recved_views[i].epoch_id; return max_epoch_id; } static inline void update_view_n_membership(hades_ctx_t* h_ctx) { if (time_elapsed_in_ms(h_ctx->ts_last_view_change) > h_ctx->update_local_view_every_ms) { get_rdtsc_timespec(&h_ctx->ts_last_view_change); // Reset timer uint8_t views_aggreeing = 1; // (always agree with my local view) uint8_t same_w_local_membership = 0; uint16_t max_epoch_id = h_ctx->intermediate_local_view.epoch_id; if (ENABLE_ARBITRATION) view_arbitration_via_ostracism(h_ctx); // if view has changed update ctx if (!bv_are_equal(h_ctx->intermediate_local_view.view, h_ctx->curr_g_membership) || get_max_received_epoch_id(h_ctx) > h_ctx->intermediate_local_view.epoch_id) { for (int i = 0; i < h_ctx->max_num_nodes; ++i) { if (i == machine_id) continue; if (h_ctx->recved_views_flag[i] == 0) continue; if (bv_are_equal(h_ctx->intermediate_local_view.view, h_ctx->remote_recved_views[i].view)) { views_aggreeing++; if (max_epoch_id < h_ctx->remote_recved_views[i].epoch_id) { max_epoch_id = h_ctx->remote_recved_views[i].epoch_id; same_w_local_membership = h_ctx->remote_recved_views[i].same_w_local_membership; } } h_ctx->recved_views_flag[i] = 0; // reset the received flag } if (views_aggreeing >= majority_of_nodes(h_ctx)) { h_ctx->intermediate_local_view.epoch_id = 
(uint8_t)(max_epoch_id + (same_w_local_membership == 1 ? 0 : 1)); bv_copy(&h_ctx->curr_g_membership, h_ctx->intermediate_local_view.view); // printf("Max epoch id: %d, same_w_local_membership: // %d\n", // max_epoch_id, same_w_local_membership); colored_printf(YELLOW, "[HADES] MEMBERSHIP CHANGE --> [epoch %d], ", h_ctx->intermediate_local_view.epoch_id); bv_print(h_ctx->curr_g_membership); printf("\n"); // bv_print_enhanced(h_ctx->curr_g_membership); } } check_if_majority_is_rechable(h_ctx); // update last local view h_ctx->last_local_view = h_ctx->intermediate_local_view; h_ctx->last_local_view.same_w_local_membership = bv_are_equal(h_ctx->last_local_view.view, h_ctx->curr_g_membership); // Reset local view bv_reset_all(&h_ctx->intermediate_local_view.view); bv_bit_set(&h_ctx->intermediate_local_view.view, (uint8_t)machine_id); } } static inline void issue_heartbeats(hades_wings_ctx_t* hw_ctx) { hades_ctx_t* h_ctx = &hw_ctx->ctx; hades_view_wrapper_w_dst_id_t last_local_view; last_local_view.ctx_last_local_view = &h_ctx->last_local_view; for (uint8_t i = 0; i < h_ctx->max_num_nodes; ++i) { h_ctx->last_local_view.have_ostracised_for_dst_node = h_ctx->have_ostracized_for[i]; if (i == machine_id) continue; if (FAKE_LINK_FAILURE && skip_to_apply_fake_link_failure(i)) continue; last_local_view.dst_id = i; if (time_elapsed_in_us(h_ctx->ts_last_send[i]) > h_ctx->send_view_every_us) { // Reset a tmp timer in case the send fails due to not enough crds struct timespec ts_last_send_tmp; get_rdtsc_timespec(&ts_last_send_tmp); uint8_t send_failed = wings_issue_pkts( hw_ctx->hviews_c, NULL, (uint8_t*)&last_local_view, 1, sizeof(hades_view_wrapper_w_dst_id_t), NULL, hades_skip_or_get_dst_id, wings_NOP_modify_elem_after_send, hades_copy_and_modify_elem); if (!send_failed) h_ctx->ts_last_send[i] = ts_last_send_tmp; // print_send_hbt(hw_ctx->hviews_c, h_ctx); } } } // static inline void update_view_and_issue_hbs(hades_wings_ctx_t* hw_ctx) { update_view_n_membership(&hw_ctx->ctx); 
issue_heartbeats(hw_ctx); } // static inline uint16_t poll_for_remote_views(hades_wings_ctx_t* hw_ctx) { hades_ctx_t* h_ctx = &hw_ctx->ctx; // Poll for membership send uint16_t views_polled = wings_poll_buff_and_post_recvs( hw_ctx->hviews_c, h_ctx->max_views_to_poll, (uint8_t*)h_ctx->poll_buff); // print_recved_hbts(hw_ctx->hviews_c, h_ctx->poll_buff, views_polled); for (int i = 0; i < views_polled; ++i) { uint8_t sender_id = h_ctx->poll_buff[i].node_id; h_ctx->recved_views_flag[sender_id] = 1; h_ctx->remote_recved_views[sender_id] = h_ctx->poll_buff[i]; bv_bit_set(&h_ctx->intermediate_local_view.view, sender_id); // In case somebody tries to rejoin if (h_ctx->last_local_view.epoch_id > 1) if (h_ctx->poll_buff[i].epoch_id == 0 && hw_ctx->hviews_c->credits_per_channels[sender_id] == 0) { /// Need to reset its credits and reconfigure the qps to start sending /// views again Warning: currently we share qp info via memcache so if /// node storing memcache (e.g. houston) /// fails we cannot make him re-join (prev qp info are lost) printf("Resetting credits and reconfiguring ibv_qps for channel: %d\n", sender_id); wings_reset_credits(hw_ctx->hviews_c, sender_id); wings_reconfigure_wrs_ah(hw_ctx->hviews_c, sender_id); } } wings_issue_credits(hw_ctx->hviews_crd_c, NULL, (uint8_t*)h_ctx->poll_buff, views_polled, sizeof(hades_view_t), hades_crd_skip_or_get_sender_id, wings_NOP_modify_elem_after_send); return views_polled; } void* hades_loop_only_thread(void* hades_wings_ctx) { hades_wings_ctx_t* hw_ctx = hades_wings_ctx; uint64_t no_iters = 0; while (true) { /// Print every X iteration (Mainly for dbging) no_iters++; if (no_iters % M_32 == 0) { // printf("My view: (epoch %d)\n", // hw_ctx->ctx.intermediate_local_view.epoch_id); // bv_print_enhanced(hw_ctx->ctx.intermediate_local_view.view); } /// Main loop update_view_and_issue_hbs(hw_ctx); poll_for_remote_views(hw_ctx); } } void* hades_full_thread(void* node_id) { ////////////////////////////////// /// failure detector 
context init ////////////////////////////////// /// Wings (rdma communication) init ud_channel_t* ud_c_ptrs[2]; ud_channel_t ud_channels[2]; for (int i = 0; i < 2; ++i) ud_c_ptrs[i] = &ud_channels[i]; ud_channel_t* hviews_c = ud_c_ptrs[0]; ud_channel_t* hviews_crd_c = ud_c_ptrs[1]; // other Vars uint8_t machine_num = 3; uint16_t worker_lid = 0; uint16_t max_views_to_poll = 10; uint32_t send_view_every_us = 100; uint32_t update_local_view_ms = 10; uint8_t _node_id = *((uint8_t*)node_id); hades_wings_ctx_t w_ctx; hades_wings_ctx_init(&w_ctx, _node_id, machine_num, max_views_to_poll, send_view_every_us, update_local_view_ms, hviews_c, hviews_crd_c, worker_lid); wings_setup_channel_qps_and_recvs(ud_c_ptrs, 2, NULL, 0); hades_loop_only_thread(&w_ctx); return NULL; } ================================================ FILE: src/hades/test.c ================================================ // // Created by akatsarakis on 21/05/19. // #include #include "../../include/hades/hades.h" int main(int argc, char* argv[]) { machine_id = -1; static struct option opts[] = { {.name = "machine-id", .has_arg = 1, .val = 'm'}, {.name = "dev-name", .has_arg = 1, .val = 'd'}, {0}}; /* Parse and check arguments */ while (1) { int c = getopt_long(argc, argv, "m:d:", opts, NULL); if (c == -1) { break; } switch (c) { case 'm': machine_id = atoi(optarg); break; case 'd': memcpy(dev_name, optarg, strlen(optarg)); break; default: printf("Invalid argument %d\n", c); assert(false); } } hades_full_thread(&machine_id); } ================================================ FILE: src/hermes/hermesKV.c ================================================ // // Created by akatsarakis on 07/03/19. 
// #include #include ////////////////////////////////////////////////// /////////////////////// HERMES KVS (SPACETIME) ////////////////////////////////////////////////// //////////// Assertion functions static inline void hermes_assertions_begin_inv(spacetime_inv_t* inv_ptr) { assert(inv_ptr->op_meta.ts.version % 2 == 0); assert(inv_ptr->op_meta.opcode == ST_OP_INV || inv_ptr->op_meta.opcode == ST_OP_MEMBERSHIP_CHANGE); assert(inv_ptr->op_meta.val_len == (ST_VALUE_SIZE >> SHIFT_BITS)); assert(remote_machine_num != 1 || inv_ptr->op_meta.sender == remote_machine_num - machine_id); assert(remote_machine_num != 1 || inv_ptr->op_meta.ts.tie_breaker_id == remote_machine_num - machine_id); // red_printf("INVs: Ops[%d]vvv hash(1st 8B):%" PRIu64 " // version: %d, tie: %d\n", I, // ((uint64_t *) &(*op)[I].key)[0], //(*op)[I].version, //(*op)[I].tie_breaker_id); } static inline void hermes_assertions_begin_ack(spacetime_ack_t* ack_ptr) { assert(ack_ptr->ts.version % 2 == 0); assert(remote_machine_num != 1 || ack_ptr->sender == remote_machine_num - machine_id); assert(ack_ptr->opcode == ST_OP_ACK || ack_ptr->opcode == ST_OP_INV_ABORT || ack_ptr->opcode == ST_OP_MEMBERSHIP_CHANGE); /// WARNING the following assertion is incorrect for write replays // assert(group_membership.num_of_alive_remotes != MAX_REMOTE_MACHINES || // ack_ptr->opcode == ST_OP_INV_ABORT || // ack_ptr->ts.tie_breaker_id == machine_id || // (ENABLE_VIRTUAL_NODE_IDS && ack_ptr->ts.tie_breaker_id % // MAX_MACHINE_NUM == machine_id)); // yellow_printf("ACKS: Ops[%d]vvv hash(1st 8B):%" PRIu64 " // version: %d, tie: %d\n", I, // ((uint64_t *) &(*op)[I].key)[0], //(*op)[I].version, //(*op)[I].tie_breaker_id); } static inline void hermes_assertions_begin_val(spacetime_val_t* val_ptr) { assert(val_ptr->ts.version % 2 == 0); assert(val_ptr->opcode == ST_OP_VAL); assert(remote_machine_num != 1 || val_ptr->sender == remote_machine_num - machine_id); assert(remote_machine_num != 1 || val_ptr->ts.tie_breaker_id == 
remote_machine_num - machine_id); // green_printf("VALS: Ops[%d]vvv hash(1st 8B):%" PRIu64 " // version: %d, tie: %d\n", I, // ((uint64_t *) &(*op)[I].key)[0], //(*op)[I].version, //(*op)[I].tie_breaker_id); } static inline void hermes_assertions_end_read_write_ops(spacetime_op_t* read_write_op) { for (int i = 0; i < max_batch_size; ++i) assert(read_write_op[i].op_meta.opcode == ST_OP_GET || read_write_op[i].op_meta.state == ST_MISS || read_write_op[i].op_meta.state == ST_PUT_STALL || read_write_op[i].op_meta.state == ST_PUT_SUCCESS || read_write_op[i].op_meta.state == ST_PUT_COMPLETE || read_write_op[i].op_meta.state == ST_IN_PROGRESS_PUT || read_write_op[i].op_meta.state == ST_RMW_STALL || read_write_op[i].op_meta.state == ST_RMW_ABORT || read_write_op[i].op_meta.state == ST_RMW_SUCCESS || read_write_op[i].op_meta.state == ST_RMW_COMPLETE || read_write_op[i].op_meta.state == ST_IN_PROGRESS_RMW || read_write_op[i].op_meta.state == ST_OP_MEMBERSHIP_CHANGE || /// TODO check this read_write_op[i].op_meta.state == ST_IN_PROGRESS_REPLAY); } /// Helper functions // TODO inlining this function by hand can give higher xPut ~5% on 20% write // rate static inline __attribute__((always_inline)) void hermes_lock_free_read_obj_meta(spacetime_object_meta* lock_free_read_meta, spacetime_object_meta* curr_meta) { uint32_t debug_cntr = 0; do { // Lock free read of keys meta if (ENABLE_ASSERTIONS) { debug_cntr++; if (debug_cntr == M_4) { printf("Worker stuck on a lock-free read (for ACK)\n"); debug_cntr = 0; } } *lock_free_read_meta = *curr_meta; } while (!cctrl_timestamp_is_same_and_valid(&lock_free_read_meta->cctrl, &curr_meta->cctrl)); } static uint64_t g_seed = 0xdeadbeef; static inline void hermes_update_actions_n_unlock(spacetime_op_t* op_ptr, struct mica_op* kv_ptr, spacetime_object_meta* curr_meta, uint8_t idx, spacetime_group_membership curr_membership, uint8_t RMW_flag) { if (ENABLE_ASSERTIONS) { assert(RMW_flag == 0 || ENABLE_RMWs); assert(idx < 
ST_OP_BUFFER_INDEX_EMPTY); } /// Copy value and update len uint8_t* kv_value_ptr = (uint8_t*)&curr_meta[1]; memcpy(kv_value_ptr, op_ptr->value, ST_VALUE_SIZE); kv_ptr->val_len = set_val_len(&op_ptr->op_meta); /// update keys metadata and unlock curr_meta->RMW_flag = RMW_flag; curr_meta->state = WRITE_STATE; curr_meta->op_buffer_index = (uint8_t)idx; curr_meta->last_local_write_ts.version = curr_meta->cctrl.ts.version + (!ENABLE_RMWs || RMW_flag == 1 ? 1 : 3); // update group membership mask bv_copy((bit_vector_t*)&curr_meta->ack_bv, curr_membership.w_ack_init); uint8_t v_node_id = (uint8_t)(!ENABLE_VIRTUAL_NODE_IDS ? machine_id : machine_id + machine_num * (hrd_fastrand(&g_seed) % VIRTUAL_NODE_IDS_PER_NODE)); curr_meta->last_local_write_ts.tie_breaker_id = v_node_id; if (!ENABLE_RMWs || RMW_flag == 1) cctrl_unlock_inc_version(&curr_meta->cctrl, v_node_id, (uint32_t*)&(op_ptr->op_meta.ts.version)); else cctrl_unlock_inc_version_by_three(&curr_meta->cctrl, v_node_id, (uint32_t*)&(op_ptr->op_meta.ts.version)); /// update op_ptr metadata op_ptr->RMW_flag = RMW_flag; op_ptr->op_meta.state = RMW_flag == 1 ? 
ST_RMW_SUCCESS : ST_PUT_SUCCESS; op_ptr->op_meta.ts.tie_breaker_id = v_node_id; } static inline void hermes_local_state_to_op(spacetime_op_t* op_ptr, spacetime_object_meta* keys_meta) { uint8_t* kv_value_ptr = (uint8_t*)&keys_meta[1]; op_ptr->RMW_flag = keys_meta->RMW_flag; op_ptr->op_meta.state = ST_REPLAY_SUCCESS; op_ptr->op_meta.ts.version = keys_meta->cctrl.ts.version - 1; op_ptr->op_meta.ts.tie_breaker_id = keys_meta->cctrl.ts.tie_breaker_id; op_ptr->op_meta.val_len = ST_VALUE_SIZE >> SHIFT_BITS; memcpy(op_ptr->value, kv_value_ptr, ST_VALUE_SIZE); } static inline void hermes_write_replay_actions(spacetime_op_t* op_ptr, uint8_t idx, spacetime_object_meta* keys_meta, spacetime_group_membership curr_membership) { if (ENABLE_ASSERTIONS) assert(idx < ST_OP_BUFFER_INDEX_EMPTY); colored_printf(YELLOW, "Write replay for i: %d\n", idx); /// update keys metadata and unlock keys_meta->state = REPLAY_STATE; keys_meta->op_buffer_index = (uint8_t)idx; keys_meta->last_local_write_ts.version = keys_meta->cctrl.ts.version - 1; keys_meta->last_local_write_ts.tie_breaker_id = keys_meta->cctrl.ts.tie_breaker_id; // update group membership mask for replay acks bv_copy((bit_vector_t*)&keys_meta->ack_bv, curr_membership.w_ack_init); /// update op_ptr metadata hermes_local_state_to_op(op_ptr, keys_meta); } static inline void hermes_check_membership_n_write_replay_actions( spacetime_op_t* op_ptr, uint8_t idx, spacetime_object_meta* keys_meta, spacetime_group_membership curr_membership) { uint8_t node_id = (uint8_t)(!ENABLE_VIRTUAL_NODE_IDS ? 
keys_meta->last_writer_id : keys_meta->last_writer_id % machine_num); if (node_is_in_membership(curr_membership, node_id)) op_ptr->op_meta.state = ST_GET_STALL; else if (keys_meta->op_buffer_index == ST_OP_BUFFER_INDEX_EMPTY) /// stall replay: until all acks from last write arrive /// on multiple threads we can't complete writes / replays on VAL hermes_write_replay_actions(op_ptr, idx, keys_meta, curr_membership); } static inline void hermes_marshal_write_coalesce_optimization(spacetime_op_t* op_ptr, uint16_t curr_ts_version) { if (ENABLE_ASSERTIONS) assert(op_ptr->op_meta.opcode == ST_OP_PUT); if (ENABLE_WRITE_COALESCE_TO_THE_SAME_KEY_IN_SAME_NODE && op_ptr->op_meta.ts.version == 0) { // if its the first time we stall on this read store the timestamp op_ptr->op_meta.ts.version = curr_ts_version; op_ptr->op_meta.state = ST_IN_PROGRESS_PUT; } } static inline void hermes_complete_coalesced_write(spacetime_op_t* op_ptr, uint16_t curr_ts) { if (ENABLE_ASSERTIONS) assert(op_ptr->op_meta.opcode == ST_OP_PUT); if (ENABLE_WRITE_COALESCE_TO_THE_SAME_KEY_IN_SAME_NODE && op_ptr->op_meta.state == ST_PUT_STALL) if (op_ptr->op_meta.ts.version > 0 && op_ptr->op_meta.ts.version + 1 < curr_ts) { // if the timestamp we saw initially has smaller than 2 versions it means // that the local write we coalesced with is completed op_ptr->op_meta.state = ST_PUT_COMPLETE; } } static inline void hermes_complete_hot_read_optimization(spacetime_op_t* op_ptr, timestamp_t ts) { if (ENABLE_READ_COMPLETE_AFTER_VAL_RECV_OF_HOT_REQS && op_ptr->op_meta.state == ST_GET_STALL) { if (op_ptr->op_meta.ts.version == 0 && op_ptr->op_meta.ts.tie_breaker_id == 0) { // if its the first time we stall on this read store the timestamp op_ptr->op_meta.ts.version = ts.version; op_ptr->op_meta.ts.tie_breaker_id = ts.tie_breaker_id; } else if (op_ptr->op_meta.ts.version + 1 < ts.version) { // if the timestamp we saw initially has smaller than 2 versions complete // the read; // TODO we also need to get the value here 
op_ptr->op_meta.state = ST_GET_COMPLETE; } } } static inline void hermes_read_actions(spacetime_op_t* op_ptr, struct mica_op* kv_ptr, uint8_t* kv_value_ptr) { memcpy(op_ptr->value, kv_value_ptr, ST_VALUE_SIZE); op_ptr->op_meta.state = ST_GET_COMPLETE; op_ptr->op_meta.val_len = get_val_len(kv_ptr); } //////////// Exec op functions static inline void hermes_exec_read(spacetime_op_t* op_ptr, struct mica_op* kv_ptr, uint8_t idx, spacetime_group_membership curr_membership) { if (ENABLE_ASSERTIONS) assert(op_ptr->op_meta.opcode == ST_OP_GET); timestamp_t curr_ts; spacetime_object_meta prev_meta; spacetime_object_meta* keys_meta = (spacetime_object_meta*)kv_ptr->value; uint8_t* kv_value_ptr = (uint8_t*)&keys_meta[1]; // Lock free reads through versioning (successful when version is even) uint8_t was_locked_read = 0; op_ptr->op_meta.state = ST_EMPTY; do { prev_meta = *keys_meta; curr_ts = keys_meta->cctrl.ts; // switch template with all states switch (keys_meta->state) { case VALID_STATE: hermes_read_actions(op_ptr, kv_ptr, kv_value_ptr); break; case INVALID_WRITE_STATE: case WRITE_STATE: case REPLAY_STATE: op_ptr->op_meta.state = ST_GET_STALL; break; default: was_locked_read = 1; cctrl_lock(&keys_meta->cctrl); curr_ts = keys_meta->cctrl.ts; curr_ts.version -= 1; // WARNING: when locking we do version++ switch (keys_meta->state) { case VALID_STATE: hermes_read_actions(op_ptr, kv_ptr, kv_value_ptr); break; case INVALID_WRITE_STATE: case WRITE_STATE: case REPLAY_STATE: op_ptr->op_meta.state = ST_GET_STALL; break; case INVALID_STATE: hermes_check_membership_n_write_replay_actions( op_ptr, idx, keys_meta, curr_membership); break; default: assert(0); } cctrl_unlock_dec_version(&keys_meta->cctrl); break; } } while ( !cctrl_timestamp_is_same_and_valid(&prev_meta.cctrl, &keys_meta->cctrl) && was_locked_read == 0); hermes_complete_hot_read_optimization(op_ptr, curr_ts); } static inline void hermes_exec_write(spacetime_op_t* op_ptr, struct mica_op* kv_ptr, uint8_t idx, 
spacetime_group_membership curr_membership) { if (ENABLE_ASSERTIONS) { assert(op_ptr->op_meta.opcode == ST_OP_PUT); assert(op_ptr->op_meta.val_len == (ST_VALUE_SIZE >> SHIFT_BITS)); } spacetime_object_meta* keys_meta = (spacetime_object_meta*)kv_ptr->value; op_ptr->op_meta.state = ST_EMPTY; cctrl_lock(&keys_meta->cctrl); uint16_t curr_version = (uint16_t)(keys_meta->cctrl.ts.version - 1); switch (keys_meta->state) { case VALID_STATE: case INVALID_STATE: if (keys_meta->op_buffer_index != ST_OP_BUFFER_INDEX_EMPTY) { /// stall write: until all acks from last write arrive /// on multiple threads we can't complete writes / replays on VAL cctrl_unlock_dec_version(&keys_meta->cctrl); hermes_marshal_write_coalesce_optimization(op_ptr, curr_version); } else hermes_update_actions_n_unlock(op_ptr, kv_ptr, keys_meta, idx, curr_membership, 0); break; case INVALID_WRITE_STATE: case WRITE_STATE: hermes_marshal_write_coalesce_optimization(op_ptr, curr_version); case REPLAY_STATE: cctrl_unlock_dec_version(&keys_meta->cctrl); break; default: assert(0); } // Fill this deterministic stuff after releasing the lock if (op_ptr->op_meta.state != ST_PUT_SUCCESS) op_ptr->op_meta.state = ST_PUT_STALL; hermes_complete_coalesced_write(op_ptr, curr_version); } static inline void hermes_exec_rmw(spacetime_op_t* op_ptr, struct mica_op* kv_ptr, uint8_t idx, spacetime_group_membership curr_membership) { spacetime_object_meta* keys_meta = (spacetime_object_meta*)kv_ptr->value; if (ENABLE_ASSERTIONS) { assert(op_ptr->op_meta.opcode == ST_OP_RMW); assert(op_ptr->op_meta.state == ST_NEW || op_ptr->op_meta.state == ST_RMW_STALL || op_ptr->op_meta.state == ST_IN_PROGRESS_RMW); assert(op_ptr->op_meta.val_len == (ST_VALUE_SIZE >> SHIFT_BITS)); } if (op_ptr->op_meta.state == ST_IN_PROGRESS_RMW) { spacetime_object_meta* curr_meta = (spacetime_object_meta*)kv_ptr->value; spacetime_object_meta lock_free_meta; hermes_lock_free_read_obj_meta(&lock_free_meta, curr_meta); if 
(timestamp_is_smaller(op_ptr->op_meta.ts.version, op_ptr->op_meta.ts.tie_breaker_id, lock_free_meta.cctrl.ts.version, lock_free_meta.cctrl.ts.tie_breaker_id)) { // Abort RMW --> we saw higher TS before gathering all of its acks op_ptr->op_meta.state = ST_RMW_ABORT; cctrl_lock(&keys_meta->cctrl); if (timestamp_is_equal( op_ptr->op_meta.ts.version, op_ptr->op_meta.ts.tie_breaker_id, lock_free_meta.last_local_write_ts.version, lock_free_meta.last_local_write_ts.tie_breaker_id)) { if (ENABLE_ASSERTIONS) assert(idx == curr_meta->op_buffer_index); curr_meta->op_buffer_index = ST_OP_BUFFER_INDEX_EMPTY; } cctrl_unlock_dec_version(&keys_meta->cctrl); } } else { op_ptr->op_meta.state = ST_EMPTY; cctrl_lock(&keys_meta->cctrl); switch (keys_meta->state) { case VALID_STATE: if (keys_meta->op_buffer_index != ST_OP_BUFFER_INDEX_EMPTY) /// stall write: until all acks from last write arrive /// on multiple threads we can't complete writes / replays on VAL cctrl_unlock_dec_version(&keys_meta->cctrl); else hermes_update_actions_n_unlock(op_ptr, kv_ptr, keys_meta, idx, curr_membership, 1); break; case INVALID_STATE: hermes_check_membership_n_write_replay_actions(op_ptr, idx, keys_meta, curr_membership); // Warning: Do not break case INVALID_WRITE_STATE: case WRITE_STATE: case REPLAY_STATE: cctrl_unlock_dec_version(&keys_meta->cctrl); break; default: assert(0); break; } // Fill this deterministic stuff after releasing the lock if (op_ptr->op_meta.state != ST_RMW_SUCCESS && op_ptr->op_meta.state != ST_REPLAY_SUCCESS) op_ptr->op_meta.state = ST_RMW_STALL; } } static inline void hermes_exec_check_update_completion(spacetime_op_t* op_ptr, struct mica_op* kv_ptr, uint8_t idx, spacetime_group_membership curr_membership) { spacetime_object_meta lock_free_read_meta; spacetime_object_meta* curr_meta = (spacetime_object_meta*)kv_ptr->value; hermes_lock_free_read_obj_meta(&lock_free_read_meta, curr_meta); if (ENABLE_ASSERTIONS) { assert(op_ptr->op_meta.opcode == ST_OP_PUT || 
op_ptr->op_meta.opcode == ST_OP_RMW || op_ptr->op_meta.state == ST_IN_PROGRESS_REPLAY); assert(!timestamp_is_smaller(lock_free_read_meta.cctrl.ts.version, lock_free_read_meta.cctrl.ts.tie_breaker_id, op_ptr->op_meta.ts.version, op_ptr->op_meta.ts.tie_breaker_id)); } if (is_last_ack(lock_free_read_meta.ack_bv, curr_membership)) { // if last local write completed cctrl_lock(&curr_meta->cctrl); if (is_last_ack(curr_meta->ack_bv, curr_membership)) { if (ENABLE_ASSERTIONS) assert(curr_meta->op_buffer_index == idx); curr_meta->op_buffer_index = ST_OP_BUFFER_INDEX_EMPTY; // reset the write buff index switch (curr_meta->state) { case INVALID_WRITE_STATE: curr_meta->state = INVALID_STATE; /// Warning break omitted intentionally case VALID_STATE: case INVALID_STATE: op_ptr->op_meta.state = op_ptr->op_meta.opcode == ST_OP_PUT ? ST_PUT_COMPLETE : ST_RMW_COMPLETE; break; case WRITE_STATE: case REPLAY_STATE: op_ptr->op_meta.ts.version = curr_meta->cctrl.ts.version - 1; // -1 because of seqlock does version + 1 op_ptr->op_meta.ts.tie_breaker_id = curr_meta->cctrl.ts.tie_breaker_id; if (curr_meta->state == WRITE_STATE) { op_ptr->op_meta.state = op_ptr->op_meta.opcode == ST_OP_PUT ? ST_PUT_COMPLETE_SEND_VALS : ST_RMW_COMPLETE_SEND_VALS; } else { if (ENABLE_ASSERTIONS) assert(op_ptr->op_meta.state == ST_IN_PROGRESS_REPLAY); op_ptr->op_meta.state = DISABLE_VALS_FOR_DEBUGGING == 1 ? 
ST_GET_COMPLETE : ST_REPLAY_COMPLETE_SEND_VALS;
        }
        curr_meta->state = VALID_STATE;
        break;
      default:
        assert(0);
    }
  }
  cctrl_unlock_dec_version(&curr_meta->cctrl);
}
}

//////////// Exec protocol action functions

/// Applies a received invalidation (INV) to the local copy of an object.
///
/// @param inv_ptr        the received INV; its opcode is rewritten in place to
///                       ST_INV_SUCCESS, ST_OP_INV_ABORT or ST_INV_OUT_OF_GROUP
/// @param kv_ptr         KVS entry; its value starts with the object's
///                       spacetime_object_meta followed by the value bytes
/// @param read_write_op  not referenced by the live code (only by the
///                       disabled REPLAY_STATE handling kept below)
static inline void
hermes_exec_inv(spacetime_inv_t* inv_ptr,
                struct mica_op* kv_ptr,
                spacetime_op_t* read_write_op)
{
  if (ENABLE_ASSERTIONS)
    assert(inv_ptr->op_meta.opcode == ST_OP_INV ||
           inv_ptr->op_meta.opcode == ST_OP_INV_ABORT);

  spacetime_object_meta* curr_meta = (spacetime_object_meta*)kv_ptr->value;
  // The value bytes are stored right after the metadata header.
  uint8_t* kv_value_ptr = (uint8_t*)&curr_meta[1];

  spacetime_object_meta lock_free_meta;
  hermes_lock_free_read_obj_meta(&lock_free_meta, curr_meta);

  // Proceed iff remote.TS >= local.TS, or the inv is for an RMW (which may
  // have to be answered with an INV-abort).
  if (!timestamp_is_smaller(inv_ptr->op_meta.ts.version,
                            inv_ptr->op_meta.ts.tie_breaker_id,
                            lock_free_meta.cctrl.ts.version,
                            lock_free_meta.cctrl.ts.tie_breaker_id) ||
      (ENABLE_RMWs && inv_ptr->RMW_flag == 1)) {
    // Lock and check again if inv TS > local timestamp
    cctrl_lock(&curr_meta->cctrl);
    /// Warning: use curr_meta->ts.version - 1 below since the seqlock
    /// increases curr_meta->ts.version by 1
    if (timestamp_is_smaller(curr_meta->cctrl.ts.version - 1,
                             curr_meta->cctrl.ts.tie_breaker_id,
                             inv_ptr->op_meta.ts.version,
                             inv_ptr->op_meta.ts.tie_breaker_id)) {
      // printf("Received an invalidation with >= timestamp\n");
      /// Update state
      switch (curr_meta->state) {
        case VALID_STATE:
          curr_meta->state = INVALID_STATE;
          // no break here: control continues into the shared break below
        case INVALID_STATE:
        case INVALID_WRITE_STATE:
          break;
        case WRITE_STATE:
        case REPLAY_STATE:
          curr_meta->state = ENABLE_RMWs && curr_meta->RMW_flag == 1
                                 ? INVALID_STATE
                                 : INVALID_WRITE_STATE;
          break;
        // Disabled alternative handling of REPLAY_STATE:
        // case REPLAY_STATE:
        //   curr_meta->state = INVALID_WRITE_STATE;
        //   curr_meta->state = INVALID_STATE;
        //   // recover the read
        //   if (ENABLE_ASSERTIONS) {
        //     assert(curr_meta->op_buffer_index != ST_OP_BUFFER_INDEX_EMPTY);
        //     assert(read_write_op[curr_meta->op_buffer_index].state ==
        //            ST_IN_PROGRESS_REPLAY);
        //     assert(((uint64_t*)&read_write_op[curr_meta->op_buffer_index].key)[0] ==
        //            ((uint64_t*)&(*op)[I].key)[0]);
        //   }
        //   read_write_op[curr_meta->op_buffer_index].state = ST_NEW;
        //   curr_meta->op_buffer_index = ST_OP_BUFFER_INDEX_EMPTY;
        //   break;
        default:
          assert(0);
      }
      if (ENABLE_ASSERTIONS)
        assert(inv_ptr->op_meta.val_len == (ST_VALUE_SIZE >> SHIFT_BITS));
      /// Update Value, TS, RMW_flag and last_writer_id
      kv_ptr->val_len = KVS_VALUE_SIZE;
      curr_meta->RMW_flag = inv_ptr->RMW_flag;
      curr_meta->last_writer_id = inv_ptr->op_meta.sender;
      memcpy(kv_value_ptr, inv_ptr->value, ST_VALUE_SIZE);
      // Unlock and install the inv's timestamp as the object's timestamp.
      cctrl_unlock_custom_version(&curr_meta->cctrl,
                                  inv_ptr->op_meta.ts.tie_breaker_id,
                                  inv_ptr->op_meta.ts.version);
    } else if (timestamp_is_equal(curr_meta->cctrl.ts.version - 1,
                                  curr_meta->cctrl.ts.tie_breaker_id,
                                  inv_ptr->op_meta.ts.version,
                                  inv_ptr->op_meta.ts.tie_breaker_id)) {
      // Same timestamp as the local one: only the last writer is refreshed;
      // an object in WRITE_STATE flags the inv as ST_INV_OUT_OF_GROUP.
      if (curr_meta->state == WRITE_STATE)
        inv_ptr->op_meta.opcode = ST_INV_OUT_OF_GROUP;
      curr_meta->last_writer_id = inv_ptr->op_meta.sender;
      cctrl_unlock_custom_version(&curr_meta->cctrl,
                                  inv_ptr->op_meta.ts.tie_breaker_id,
                                  inv_ptr->op_meta.ts.version);
    } else { // TS is Smaller
      /// Respond with an inv-abort if it is an RMW
      if (ENABLE_RMWs && inv_ptr->RMW_flag == 1) {
        // hermes_local_state_to_op() overwrites the inv with local state, so
        // the original sender is saved and restored around it.
        uint8_t sender_id = inv_ptr->op_meta.sender;
        hermes_local_state_to_op(inv_ptr, curr_meta);
        inv_ptr->op_meta.sender = sender_id;
        inv_ptr->op_meta.opcode = ST_OP_INV_ABORT;
        colored_printf(RED, "Sending OP_INV_ABORT\n");
      }
      cctrl_unlock_dec_version(&curr_meta->cctrl);
    }
  }

  // Anything that was not turned into an abort / out-of-group is a success.
  if (inv_ptr->op_meta.opcode != ST_OP_INV_ABORT &&
      inv_ptr->op_meta.opcode != ST_INV_OUT_OF_GROUP)
    inv_ptr->op_meta.opcode = ST_INV_SUCCESS;

  if (ENABLE_ASSERTIONS)
    assert(inv_ptr->op_meta.opcode == ST_OP_INV_ABORT ||
           inv_ptr->op_meta.opcode == ST_INV_SUCCESS ||
           inv_ptr->op_meta.opcode == ST_INV_OUT_OF_GROUP);
}

/// Applies a received acknowledgment (ACK) to the object it refers to.
///
/// The sender is recorded in the object's ack bit-vector when the ACK's
/// timestamp equals the object's last local write timestamp and an op is
/// still attached to the object. When is_last_ack() reports completion, the
/// object's state is advanced and the attached entry of read_write_op is
/// moved to its completed state.
///
/// @param ack_ptr          the received ACK; its opcode is rewritten in place
///                         (ST_LAST_ACK_SUCCESS or ST_ACK_SUCCESS on return)
/// @param kv_ptr           KVS entry holding the object's metadata
/// @param curr_membership  membership handed to is_last_ack()
/// @param read_write_op    buffer of local ops indexed by op_buffer_index
static inline void
hermes_exec_ack(spacetime_ack_t* ack_ptr,
                struct mica_op* kv_ptr,
                spacetime_group_membership curr_membership,
                spacetime_op_t* read_write_op)
{
  int op_buff_indx = ST_OP_BUFFER_INDEX_EMPTY;
  spacetime_object_meta lock_free_read_meta;
  spacetime_object_meta* curr_meta = (spacetime_object_meta*)kv_ptr->value;
  hermes_lock_free_read_obj_meta(&lock_free_read_meta, curr_meta);

  // The local timestamp must not be smaller than the ACK's timestamp.
  if (ENABLE_ASSERTIONS)
    assert(!timestamp_is_smaller(lock_free_read_meta.cctrl.ts.version,
                                 lock_free_read_meta.cctrl.ts.tie_breaker_id,
                                 ack_ptr->ts.version,
                                 ack_ptr->ts.tie_breaker_id));

  if (timestamp_is_equal(
          ack_ptr->ts.version, ack_ptr->ts.tie_breaker_id,
          lock_free_read_meta.last_local_write_ts.version,
          lock_free_read_meta.last_local_write_ts.tie_breaker_id)) {
    /// Lock and check again if ack TS == last local write
    cctrl_lock(&curr_meta->cctrl);
    if (curr_meta->op_buffer_index != ST_OP_BUFFER_INDEX_EMPTY &&
        timestamp_is_equal(ack_ptr->ts.version, ack_ptr->ts.tie_breaker_id,
                           curr_meta->last_local_write_ts.version,
                           curr_meta->last_local_write_ts.tie_breaker_id)) {
      bv_bit_set((bit_vector_t*)&curr_meta->ack_bv, ack_ptr->sender);
      if (is_last_ack(curr_meta->ack_bv, curr_membership)) {
        // if last local write completed
        op_buff_indx = curr_meta->op_buffer_index;
        switch (curr_meta->state) {
          case VALID_STATE:
          case INVALID_STATE:
            ack_ptr->opcode = ST_LAST_ACK_NO_BCAST_SUCCESS;
            // reset the write buff index
            curr_meta->op_buffer_index = ST_OP_BUFFER_INDEX_EMPTY;
            break;
          case INVALID_WRITE_STATE:
            curr_meta->state = INVALID_STATE;
            ack_ptr->opcode = ST_LAST_ACK_NO_BCAST_SUCCESS;
            // reset the write buff index
            curr_meta->op_buffer_index = ST_OP_BUFFER_INDEX_EMPTY;
            break;
          case WRITE_STATE:
          case REPLAY_STATE:
            curr_meta->state = VALID_STATE;
            ack_ptr->opcode = ST_LAST_ACK_SUCCESS;
            // reset the write buff index
            curr_meta->op_buffer_index = ST_OP_BUFFER_INDEX_EMPTY;
            break;
          default:
            assert(0);
        }
      }
    }
    cctrl_unlock_dec_version(&curr_meta->cctrl);
  }

  if (ack_ptr->opcode == ST_LAST_ACK_SUCCESS ||
      ack_ptr->opcode == ST_LAST_ACK_NO_BCAST_SUCCESS) {
    /// completed read / write --> remove it from the ops buffer
    if (ENABLE_ASSERTIONS) {
      assert(op_buff_indx != ST_OP_BUFFER_INDEX_EMPTY);
      assert(read_write_op[op_buff_indx].op_meta.state == ST_IN_PROGRESS_PUT ||
             read_write_op[op_buff_indx].op_meta.state == ST_IN_PROGRESS_RMW ||
             read_write_op[op_buff_indx].op_meta.state ==
                 ST_OP_MEMBERSHIP_CHANGE ||
             read_write_op[op_buff_indx].op_meta.state ==
                 ST_IN_PROGRESS_REPLAY);
      assert(((uint64_t*)&read_write_op[op_buff_indx].op_meta.key)[0] ==
             ((uint64_t*)&ack_ptr->key)[0]);
    }
    switch (read_write_op[op_buff_indx].op_meta.opcode) {
      case ST_OP_GET:
        read_write_op[op_buff_indx].op_meta.state = ST_NEW;
        break;
      case ST_OP_PUT:
        read_write_op[op_buff_indx].op_meta.state = ST_PUT_COMPLETE;
        break;
      case ST_OP_RMW:
        // TODO: add an OP to differentiate between RMW-replay and RMW complete
        read_write_op[op_buff_indx].op_meta.state = ST_RMW_COMPLETE;
        break;
      default:
        assert(0);
    }
  }

  // Only ST_LAST_ACK_SUCCESS is preserved for the caller; every other
  // outcome (including ST_LAST_ACK_NO_BCAST_SUCCESS) becomes ST_ACK_SUCCESS.
  if (ack_ptr->opcode != ST_LAST_ACK_SUCCESS)
    ack_ptr->opcode = ST_ACK_SUCCESS;
}

/// Applies a received validation (VAL): if the object's timestamp equals the
/// VAL's timestamp the object is moved to VALID_STATE. The VAL's opcode is
/// always set to ST_VAL_SUCCESS.
///
/// @param val_ptr  the received VAL (opcode rewritten in place)
/// @param kv_ptr   KVS entry holding the object's metadata
static inline void
hermes_exec_val(spacetime_val_t* val_ptr, struct mica_op* kv_ptr)
{
  spacetime_object_meta lock_free_read_meta;
  spacetime_object_meta* curr_meta = (spacetime_object_meta*)kv_ptr->value;
  hermes_lock_free_read_obj_meta(&lock_free_read_meta, curr_meta);

  /// lock and proceed iff remote.TS == local.TS
  if (timestamp_is_equal(lock_free_read_meta.cctrl.ts.version,
                         lock_free_read_meta.cctrl.ts.tie_breaker_id,
                         val_ptr->ts.version, val_ptr->ts.tie_breaker_id)) {
    /// Lock and check again if still TS == local timestamp
    cctrl_lock(&curr_meta->cctrl);
    /// Warning: compare against curr_meta->cctrl.ts.version - 1 below; the
    /// same adjustment is used in hermes_exec_inv because taking the lock
    /// increases the version by 1
    if (timestamp_is_equal(curr_meta->cctrl.ts.version - 1,
                           curr_meta->cctrl.ts.tie_breaker_id,
                           val_ptr->ts.version, val_ptr->ts.tie_breaker_id)) {
      /// WARNING: this should not happen w/o this node
      /// removed from the group
      if (ENABLE_ASSERTIONS)
        assert(curr_meta->state != WRITE_STATE);
      curr_meta->state = VALID_STATE;
    }
    cctrl_unlock_dec_version(&curr_meta->cctrl);
  }
  val_ptr->opcode = ST_VAL_SUCCESS;
}

//////////// Skip functions

/// Returns 1 when a local op must not be handed to the KVS in the regular
/// local_ops pass (its state is one of the *_SUCCESS / in-progress /
/// membership-change / send-vals states listed below), 0 otherwise.
static inline uint8_t
hermes_skip_op(spacetime_op_t* op_ptr)
{
  return (uint8_t)((op_ptr->op_meta.state == ST_PUT_SUCCESS ||
                    op_ptr->op_meta.state == ST_RMW_SUCCESS ||
                    op_ptr->op_meta.state == ST_REPLAY_SUCCESS ||
                    op_ptr->op_meta.state == ST_IN_PROGRESS_PUT ||
                    // op_ptr->op_meta.state == ST_IN_PROGRESS_RMW ||
                    op_ptr->op_meta.state == ST_IN_PROGRESS_REPLAY ||
                    op_ptr->op_meta.state == ST_OP_MEMBERSHIP_CHANGE ||
                    op_ptr->op_meta.state == ST_PUT_COMPLETE_SEND_VALS)
                       ? 1
                       : 0);
}

/// Returns 0 (do not skip) only for ops that are in progress
/// (PUT / RMW / REPLAY); every other op is skipped (1).
static inline uint8_t
hermes_skip_op_after_membship_change(spacetime_op_t* op_ptr)
{
  return (uint8_t)((op_ptr->op_meta.state == ST_IN_PROGRESS_PUT ||
                    op_ptr->op_meta.state == ST_IN_PROGRESS_RMW ||
                    op_ptr->op_meta.state == ST_IN_PROGRESS_REPLAY)
                       ? 0
                       : 1);
}

/// Returns 1 for an inv whose opcode is ST_OP_MEMBERSHIP_CHANGE, after
/// storing the first value byte in *node_suspected and printing it;
/// returns 0 for every other inv.
static inline uint8_t
hermes_skip_inv(spacetime_inv_t* inv_ptr, int* node_suspected)
{
  if (inv_ptr->op_meta.opcode == ST_OP_MEMBERSHIP_CHANGE) {
    // TODO we need to do this only on the first skip
    *node_suspected = inv_ptr->value[0];
    printf("RECEIVED NODE SUSPICION: %d\n", *node_suspected);
    return 1;
  }
  return 0;
}

/// Returns 1 when the ack's state is ST_OP_MEMBERSHIP_CHANGE, 0 otherwise.
static inline uint8_t
hermes_skip_ack(spacetime_ack_t* ack_ptr)
{
  return (uint8_t)((ack_ptr->state == ST_OP_MEMBERSHIP_CHANGE) ?
1 : 0); } //////////// Dispatcher functions static inline uint8_t hermes_skip_dispatcher(enum hermes_batch_type_t type, void* ptr, int* node_suspected) { switch (type) { case local_ops: return hermes_skip_op(ptr); case local_ops_after_membership_change: return hermes_skip_op_after_membship_change(ptr); case invs: return hermes_skip_inv(ptr, node_suspected); case acks: return hermes_skip_ack(ptr); case vals: return 0; default: assert(0); } } static inline void hermes_assertions_begin_dispatcher(enum hermes_batch_type_t type, void* ptr) { if (ENABLE_ASSERTIONS) switch (type) { case local_ops: case local_ops_after_membership_change: break; case invs: hermes_assertions_begin_inv(ptr); break; case acks: if (ENABLE_RMWs == 0) hermes_assertions_begin_ack(ptr); else { spacetime_ack_t* ack_ptr = ptr; if (ack_ptr->opcode == ST_OP_ACK) hermes_assertions_begin_ack(ptr); else if (ack_ptr->opcode == ST_OP_INV_ABORT) { printf("RECVED: inv abort\n"); hermes_assertions_begin_inv(ptr); } else { printf("RECVED: %s\n", code_to_str(ack_ptr->opcode)); assert(0); } } break; case vals: hermes_assertions_begin_val(ptr); break; default: assert(0); } } static inline void hermes_print_dispatcher(enum hermes_batch_type_t type, int op_num, uint8_t thread_id) { if (ENABLE_BATCH_OP_PRINTS) switch (type) { case local_ops: case local_ops_after_membership_change: break; case invs: if (ENABLE_INV_PRINTS && thread_id < MAX_THREADS_TO_PRINT) colored_printf(RED, "[W] Batch INVs (op num: %d)!\n", thread_id, op_num); break; case acks: if (ENABLE_ACK_PRINTS && thread_id < MAX_THREADS_TO_PRINT) colored_printf(RED, "[W%d] Batch ACKs (op num: %d)!\n", thread_id, op_num); break; case vals: if (ENABLE_VAL_PRINTS && thread_id < MAX_THREADS_TO_PRINT) colored_printf(RED, "[W%d] Batch VALs (op num: %d)!\n", thread_id, op_num); break; default: assert(0); } } static inline void hermes_assertions_end_dispatcher(enum hermes_batch_type_t type, spacetime_op_t* read_write_ops) { if (ENABLE_ASSERTIONS) switch (type) { case 
local_ops: case local_ops_after_membership_change: case invs: break; case acks: hermes_assertions_end_read_write_ops(read_write_ops); break; case vals: break; default: assert(0); } } static inline void hermes_exec_dispatcher(enum hermes_batch_type_t type, void* op_ptr, struct mica_op* kv_ptr, spacetime_group_membership curr_membership, uint8_t idx, spacetime_op_t* read_write_op) { switch (type) { case local_ops: if (((spacetime_op_t*)op_ptr)->op_meta.opcode == ST_OP_GET) hermes_exec_read(op_ptr, kv_ptr, idx, curr_membership); else if (((spacetime_op_t*)op_ptr)->op_meta.opcode == ST_OP_PUT) hermes_exec_write(op_ptr, kv_ptr, idx, curr_membership); else if (ENABLE_RMWs && ((spacetime_op_t*)op_ptr)->op_meta.opcode == ST_OP_RMW) hermes_exec_rmw(op_ptr, kv_ptr, idx, curr_membership); else { printf("Ops[%d]: %s\n", idx, code_to_str(((spacetime_op_t*)op_ptr)->op_meta.opcode)); assert(0); } break; case local_ops_after_membership_change: if (((spacetime_op_t*)op_ptr)->op_meta.opcode == ST_OP_PUT || ((spacetime_op_t*)op_ptr)->op_meta.opcode == ST_OP_RMW || ((spacetime_op_t*)op_ptr)->op_meta.state == ST_IN_PROGRESS_REPLAY) { hermes_exec_check_update_completion(op_ptr, kv_ptr, idx, curr_membership); } else assert(0); break; case invs: hermes_exec_inv(op_ptr, kv_ptr, read_write_op); break; case acks: if (ENABLE_RMWs == 0) hermes_exec_ack(op_ptr, kv_ptr, curr_membership, read_write_op); else { spacetime_ack_t* ack_ptr = op_ptr; if (ack_ptr->opcode == ST_OP_ACK) hermes_exec_ack(op_ptr, kv_ptr, curr_membership, read_write_op); else if (ack_ptr->opcode == ST_OP_INV_ABORT) { /// TODO RMW debugging printf("RECVED: inv abort\n"); hermes_exec_inv(op_ptr, kv_ptr, read_write_op); ack_ptr->opcode = ST_ACK_SUCCESS; } else assert(0); } break; case vals: hermes_exec_val(op_ptr, kv_ptr); break; default: assert(0); } } ////////////////////////////////////////////// //////////// Main HermesKV function ////////////////////////////////////////////// void hermes_batch_ops_to_KVS(enum 
hermes_batch_type_t type, uint8_t* op_array, int op_num, uint16_t sizeof_op_elem, spacetime_group_membership curr_membership, int* node_suspected, spacetime_op_t* read_write_ops, uint8_t thread_id) { #if SPACETIME_DEBUG == 1 // assert(kv.hash_table != NULL); assert(op_array != NULL); assert(op_num > 0 && op_num <= CACHE_BATCH_SIZE); assert(resp != NULL); #endif #if SPACETIME_DEBUG == 2 for (I = 0; I < op_num; I++) mica_print_op(&(*op_array)[I]); #endif int key_in_store[HERMES_MAX_BATCH_SIZE]; // Is this key in the datastore? unsigned int tag[HERMES_MAX_BATCH_SIZE]; uint64_t bkt[HERMES_MAX_BATCH_SIZE]; struct mica_bkt* bkt_ptr[HERMES_MAX_BATCH_SIZE]; struct mica_op* kv_ptr[HERMES_MAX_BATCH_SIZE]; // Ptr to KV item in log if (ENABLE_ASSERTIONS) { assert(op_num <= HERMES_MAX_BATCH_SIZE); assert(read_write_ops != NULL || type != acks); assert(node_suspected != NULL || type != invs); } hermes_print_dispatcher(type, op_num, thread_id); // We first lookup the key in the datastore. // The first two @I loops work for both GETs and PUTs. for (int I = 0; I < op_num; I++) { spacetime_op_meta_t* op_ptr = (spacetime_op_meta_t*)&op_array[sizeof_op_elem * I]; hermes_assertions_begin_dispatcher(type, op_ptr); if (hermes_skip_dispatcher(type, op_ptr, node_suspected)) continue; bkt[I] = op_ptr->key.bkt & kv.hash_table.bkt_mask; bkt_ptr[I] = &kv.hash_table.ht_index[bkt[I]]; __builtin_prefetch(bkt_ptr[I], 0, 0); tag[I] = op_ptr->key.tag; key_in_store[I] = 0; kv_ptr[I] = NULL; } for (int I = 0; I < op_num; I++) { spacetime_op_meta_t* op_ptr = (spacetime_op_meta_t*)&op_array[sizeof_op_elem * I]; if (hermes_skip_dispatcher(type, op_ptr, node_suspected)) continue; for (int j = 0; j < 8; j++) { if (bkt_ptr[I]->slots[j].in_use == 1 && bkt_ptr[I]->slots[j].tag == tag[I]) { uint64_t log_offset = bkt_ptr[I]->slots[j].offset & kv.hash_table.log_mask; // We can interpret the log entry as mica_op, even though it // may not contain the full MICA_MAX_VALUE value. 
kv_ptr[I] = (struct mica_op*)&kv.hash_table.ht_log[log_offset]; // Small values (1--64 bytes) can span 2 cache lines __builtin_prefetch(kv_ptr[I], 0, 0); __builtin_prefetch((uint8_t*)kv_ptr[I] + 64, 0, 0); // Detect if the head has wrapped around for this index entry if (kv.hash_table.log_head - bkt_ptr[I]->slots[j].offset >= kv.hash_table.log_cap) kv_ptr[I] = NULL; // If so, we mark it "not found" break; } } } for (int I = 0; I < op_num; I++) { spacetime_op_meta_t* op_ptr = (spacetime_op_meta_t*)&op_array[sizeof_op_elem * I]; if (hermes_skip_dispatcher(type, op_ptr, node_suspected)) continue; if (kv_ptr[I] != NULL) { // We had a tag match earlier. Now compare log entry. long long* key_ptr_log = (long long*)kv_ptr[I]; long long* key_ptr_req = (long long*)&op_ptr->key; if (key_ptr_log[1] == key_ptr_req[0]) { // Key Found 8 Byte keys key_in_store[I] = 1; hermes_exec_dispatcher(type, op_ptr, kv_ptr[I], curr_membership, (uint8_t)I, read_write_ops); } } if (key_in_store[I] == 0) // KVS miss --> We get here if either tag or log key match failed op_ptr->state = ST_MISS; } hermes_assertions_end_dispatcher(type, read_write_ops); } ================================================ FILE: src/hermes/hermes_worker.c ================================================ #include #include #include "../../include/utils/concur_ctrl.h" #include "inline-util.h" #include "util.h" /// #include "../../include/hades/hades.h" #include "../../include/wings/wings.h" /// int inv_skip_or_get_sender_id(uint8_t* req) { spacetime_op_t* op_req = (spacetime_op_t*)req; if (ENABLE_ASSERTIONS) { assert(is_response_code(op_req->op_meta.state) || is_bucket_state_code(op_req->op_meta.state)); assert(is_input_code(op_req->op_meta.opcode)); } if (op_req->op_meta.state != ST_PUT_SUCCESS && op_req->op_meta.state != ST_RMW_SUCCESS && op_req->op_meta.state != ST_REPLAY_SUCCESS && op_req->op_meta.state != ST_OP_MEMBERSHIP_CHANGE) return -1; return 0; // since inv is a bcast we can return any int other than -1 } void 
inv_modify_elem_after_send(uint8_t* req) { spacetime_op_t* op_req = (spacetime_op_t*)req; switch (op_req->op_meta.state) { case ST_PUT_SUCCESS: op_req->op_meta.state = ST_IN_PROGRESS_PUT; break; case ST_RMW_SUCCESS: op_req->op_meta.state = ST_IN_PROGRESS_RMW; break; case ST_REPLAY_SUCCESS: op_req->op_meta.state = ST_IN_PROGRESS_REPLAY; break; case ST_OP_MEMBERSHIP_CHANGE: op_req->op_meta.state = ST_OP_MEMBERSHIP_COMPLETE; break; default: assert(0); } } void inv_copy_and_modify_elem(uint8_t* msg_to_send, uint8_t* triggering_req) { spacetime_op_t* op = (spacetime_op_t*)triggering_req; spacetime_inv_t* inv_to_send = (spacetime_inv_t*)msg_to_send; // Copy op to inv, set sender and opcode memcpy(inv_to_send, op, sizeof(spacetime_inv_t)); inv_to_send->op_meta.sender = (uint8_t)machine_id; inv_to_send->op_meta.opcode = ST_OP_INV; // //TODO change to include membership change // inv_to_send->op_meta.opcode = (uint8_t) (op->op_meta.state == // ST_OP_MEMBERSHIP_CHANGE ? // ST_OP_MEMBERSHIP_CHANGE : ST_OP_INV); } int ack_skip_or_get_sender_id(uint8_t* req) { spacetime_inv_t* inv_req = (spacetime_inv_t*)req; if (ENABLE_ASSERTIONS) assert(inv_req->op_meta.opcode == ST_INV_SUCCESS || inv_req->op_meta.opcode == ST_OP_INV_ABORT || inv_req->op_meta.opcode == ST_EMPTY); uint8_t is_small_msg = inv_req->op_meta.opcode == ST_INV_SUCCESS ? 1 : 0; return inv_req->op_meta.opcode == ST_EMPTY ? 
-1 : wings_set_sender_id_n_msg_type(inv_req->op_meta.sender, is_small_msg); } void ack_modify_elem_after_send(uint8_t* req) { spacetime_inv_t* inv_req = (spacetime_inv_t*)req; // empty inv buffer if (inv_req->op_meta.opcode == ST_INV_SUCCESS || inv_req->op_meta.opcode == ST_OP_INV_ABORT || inv_req->op_meta.opcode == ST_OP_MEMBERSHIP_CHANGE) inv_req->op_meta.opcode = ST_EMPTY; else assert(0); } void ack_copy_and_modify_elem(uint8_t* msg_to_send, uint8_t* triggering_req) { spacetime_inv_t* inv_req = (spacetime_inv_t*)triggering_req; spacetime_ack_t* ack_to_send = (spacetime_ack_t*)msg_to_send; spacetime_inv_t* inv_to_send = (spacetime_inv_t*)msg_to_send; switch (inv_req->op_meta.opcode) { case ST_INV_SUCCESS: memcpy(ack_to_send, triggering_req, sizeof(spacetime_ack_t)); // copy req to next_req_ptr ack_to_send->sender = (uint8_t)machine_id; ack_to_send->opcode = ST_OP_ACK; break; case ST_OP_INV_ABORT: memcpy(inv_to_send, triggering_req, sizeof(spacetime_inv_t)); inv_to_send->op_meta.sender = (uint8_t)machine_id; inv_to_send->op_meta.opcode = ST_OP_INV_ABORT; break; default: assert(0); } } int val_skip_or_get_sender_id(uint8_t* req) { spacetime_ack_t* ack_req = (spacetime_ack_t*)req; if (ack_req->opcode == ST_ACK_SUCCESS || ack_req->opcode == ST_OP_MEMBERSHIP_CHANGE) { ack_req->opcode = ST_EMPTY; return -1; } else if (ack_req->opcode == ST_EMPTY) return -1; if (ENABLE_ASSERTIONS) assert(ack_req->opcode == ST_LAST_ACK_SUCCESS); return ack_req->sender; } void val_modify_elem_after_send(uint8_t* req) { spacetime_ack_t* ack_req = (spacetime_ack_t*)req; if (ENABLE_ASSERTIONS) assert(ack_req->opcode == ST_LAST_ACK_SUCCESS); ack_req->opcode = ST_EMPTY; } void val_copy_and_modify_elem(uint8_t* msg_to_send, uint8_t* triggering_req) { spacetime_val_t* val_to_send = (spacetime_val_t*)msg_to_send; memcpy(val_to_send, triggering_req, sizeof(spacetime_val_t)); // copy req to next_req_ptr val_to_send->opcode = ST_OP_VAL; val_to_send->sender = (uint8_t)machine_id; } int 
memb_change_skip_or_get_sender_id(uint8_t* req)
{
  // Skip / sender-id callback used when VALs are issued straight from the
  // local ops buffer after a membership change: only ops in one of the
  // *_COMPLETE_SEND_VALS states produce a VAL.
  spacetime_op_t* op_req = (spacetime_op_t*)req;
  if (op_req->op_meta.state != ST_PUT_COMPLETE_SEND_VALS &&
      op_req->op_meta.state != ST_RMW_COMPLETE_SEND_VALS &&
      op_req->op_meta.state != ST_REPLAY_COMPLETE_SEND_VALS) {
    return -1;
  }
  return 1; // it is bcast so just return something greater than zero
}

/// Post-send callback for VALs issued after a membership change: moves the
/// op out of its *_COMPLETE_SEND_VALS state (a replay goes back to ST_NEW).
void
memb_change_modify_elem_after_send(uint8_t* req)
{
  spacetime_op_t* op_req = (spacetime_op_t*)req;
  switch (op_req->op_meta.state) {
    case ST_PUT_COMPLETE_SEND_VALS:
      op_req->op_meta.state = ST_PUT_COMPLETE;
      break;
    case ST_RMW_COMPLETE_SEND_VALS:
      op_req->op_meta.state = ST_RMW_COMPLETE;
      break;
    case ST_REPLAY_COMPLETE_SEND_VALS:
      op_req->op_meta.state = ST_NEW; // ST_REPLAY_COMPLETE;
      break;
    default:
      assert(0);
  }
}

/// Builds a VAL from a local op: sets opcode, sender and the op's timestamp
/// (no other field of the outgoing message is written here).
void
memb_change_copy_and_modify_elem(uint8_t* msg_to_send, uint8_t* triggering_req)
{
  spacetime_op_t* op_req = (spacetime_op_t*)triggering_req;
  spacetime_val_t* val_to_send = (spacetime_val_t*)msg_to_send;

  val_to_send->opcode = ST_OP_VAL;
  val_to_send->sender = (uint8_t)machine_id;
  val_to_send->ts = op_req->op_meta.ts;
}

/// Skip / sender-id callback for credits sent in response to VALs: returns
/// -1 for an emptied slot, otherwise the sender of the processed VAL.
int
rem_write_crd_skip_or_get_sender_id(uint8_t* req)
{
  spacetime_val_t* val_ptr = (spacetime_val_t*)req;

  if (ENABLE_ASSERTIONS)
    assert(val_ptr->opcode == ST_VAL_SUCCESS || val_ptr->opcode == ST_EMPTY);

  return val_ptr->opcode == ST_EMPTY ? -1 : val_ptr->sender;
}

/// Post-send callback for credits: releases the processed VAL slot.
void
rem_write_crd_modify_elem_after_send(uint8_t* req)
{
  spacetime_val_t* val_req = (spacetime_val_t*)req;

  // empty the slot of the VAL that has been credited
  if (val_req->opcode == ST_VAL_SUCCESS)
    val_req->opcode = ST_EMPTY;
  else
    assert(0);
}

/// Prints the total number of sent and received messages of the four
/// channels (invs, acks, vals, credits).
void
print_total_send_recv_msgs(ud_channel_t* inv_ud_c,
                           ud_channel_t* ack_ud_c,
                           ud_channel_t* val_ud_c,
                           ud_channel_t* crd_ud_c)
{
  colored_printf(
      GREEN, "Total Send: invs %d, acks %d, vals %d, crds %d\n",
      inv_ud_c->stats.send_total_msgs, ack_ud_c->stats.send_total_msgs,
      val_ud_c->stats.send_total_msgs, crd_ud_c->stats.send_total_msgs);
  colored_printf(
      GREEN, "Total Recv: invs %d, acks %d, vals %d, crds %d\n",
      inv_ud_c->stats.recv_total_msgs, ack_ud_c->stats.recv_total_msgs,
      val_ud_c->stats.recv_total_msgs, crd_ud_c->stats.recv_total_msgs);
}

/// Busy-waits until the membership bit-vector of *last_group_membership has
/// at least machine_num bits set. Only the worker whose id is
/// WORKER_WITH_FAILURE_DETECTOR drives the failure detector while waiting;
/// every worker re-reads the global group_membership on each iteration.
void
spin_until_all_nodes_are_in_membership(
    spacetime_group_membership* last_group_membership,
    hades_wings_ctx_t* hw_ctx,
    uint16_t worker_lid)
{
  bit_vector_t* membership_ptr =
      (bit_vector_t*)&last_group_membership->g_membership;

  bv_reset_all(membership_ptr);
  while (bv_no_setted_bits(*membership_ptr) < machine_num) {
    if (worker_lid == WORKER_WITH_FAILURE_DETECTOR) {
      update_view_and_issue_hbs(hw_ctx);

      if (!bv_are_equal(*membership_ptr, hw_ctx->ctx.curr_g_membership))
        group_membership_update(hw_ctx->ctx);

      poll_for_remote_views(hw_ctx);
    }

    *last_group_membership = group_membership;
  }
}

/// One failure-detection step, executed only by the worker whose id is
/// WORKER_WITH_FAILURE_DETECTOR: issues heartbeats, resets credits and
/// reconfigures the channels towards every node whose bit went from 0 (last
/// local view) to 1 (intermediate local view), updates the group membership
/// and polls for remote views.
///
/// @param ud_channel_ptrs  channels to reset; the first TOTAL_WORKER_UD_QPs
///                         entries are touched
/// @param last_membership  referenced only by the disabled check below
/// @param hw_ctx           failure-detector context
/// @param worker_lid       local id of the calling worker
static inline void
failure_detection_n_membership(ud_channel_t** ud_channel_ptrs,
                               bit_vector_t* last_membership,
                               hades_wings_ctx_t* hw_ctx,
                               uint16_t worker_lid)
{
  if (worker_lid == WORKER_WITH_FAILURE_DETECTOR) {
    update_view_and_issue_hbs(hw_ctx);

    /// TODO: We need to fix recovery (RDMA side of wings)!! The following is
    /// not fully correct.
    /// Additionally, this handles only the WORKER_WITH_FAILURE_DETECTOR
    /// thread instead of every thread.
    if (!bv_are_equal(hw_ctx->ctx.last_local_view.view,
                      hw_ctx->ctx.intermediate_local_view.view)) {
      for (int j = 0; j < 8; ++j)
        if (bv_bit_get(hw_ctx->ctx.last_local_view.view, j) == 0 &&
            bv_bit_get(hw_ctx->ctx.intermediate_local_view.view, j) == 1) {
          printf("W[%d]: updates %d endpoint channels\n", worker_lid, j);
          for (int i = 0; i < TOTAL_WORKER_UD_QPs; ++i) {
            wings_reset_credits(ud_channel_ptrs[i], j);
            wings_reconfigure_wrs_ah(ud_channel_ptrs[i], j);
          }
        }
    }

    // Disabled guard; the membership update below runs unconditionally:
    // if (!bv_are_equal(*last_membership, hw_ctx->ctx.curr_g_membership))
    {
      group_membership_update(hw_ctx->ctx);
    }

    poll_for_remote_views(hw_ctx);
  }
}

/// Worker thread entry point for Hermes (asserts that CR mode is off).
///
/// Sets up the INV / ACK / VAL channels (plus the failure-detector channels
/// for the worker that runs it), allocates the op buffers and the trace, and
/// then loops forever: refill and execute local ops, broadcast INVs, poll
/// INVs and answer them with ACKs, poll ACKs, issue and poll VALs, return
/// credits, and finally run failure detection / membership handling.
///
/// @param arg  pointer to a struct thread_params; its id field is the
///             worker's local id
/// @return never returns (the main loop has no exit)
void*
run_worker(void* arg)
{
  assert(is_CR == 0);

  struct thread_params params = *(struct thread_params*)arg;
  uint16_t worker_lid = (uint16_t)params.id; // Local ID of this worker thread
  uint16_t worker_gid = (uint16_t)(machine_id * num_workers +
                                   params.id); // Global ID of this worker thread

  /* --------------------------------------------------------
  ------------------- RDMA WINGS DECLARATIONS---------------
  ---------------------------------------------------------*/
  ud_channel_t ud_channels[TOTAL_WORKER_N_FAILURE_DETECTION_UD_QPs];
  ud_channel_t* ud_channel_ptrs[TOTAL_WORKER_N_FAILURE_DETECTION_UD_QPs];
  ud_channel_t* inv_ud_c = &ud_channels[INV_UD_QP_ID];
  ud_channel_t* ack_ud_c = &ud_channels[ACK_UD_QP_ID];
  ud_channel_t* val_ud_c = &ud_channels[VAL_UD_QP_ID];
  ud_channel_t* crd_ud_c = &ud_channels[CRD_UD_QP_ID];

  for (int i = 0; i < TOTAL_WORKER_N_FAILURE_DETECTION_UD_QPs; ++i)
    ud_channel_ptrs[i] = &ud_channels[i];

  const uint8_t is_bcast = 1;
  const uint8_t stats_on = 1;
  const uint8_t prints_on = 1;
  const uint8_t is_hdr_only = 0;
  const uint8_t expl_crd_ctrl = 0;
  const uint8_t disable_crd_ctrl = 0;

  // Channel names carry ANSI color escapes and the worker's local id.
  char inv_qp_name[200], ack_qp_name[200], val_qp_name[200];
  sprintf(inv_qp_name, "%s%d", "\033[31mINV\033[0m", worker_lid);
  sprintf(ack_qp_name, "%s%d", "\033[33mACK\033[0m", worker_lid);
  sprintf(val_qp_name, "%s%d", "\033[1m\033[32mVAL\033[0m", worker_lid);

  // WARNING: We use the ack channel to send/recv both acks and rmw-invs if
  // RMWs are enabled
  uint16_t ack_size =
      ENABLE_RMWs ? sizeof(spacetime_inv_t) : sizeof(spacetime_ack_t);

  // Inlining is used only when a fully coalesced packet fits below
  // WINGS_MAX_SUPPORTED_INLINING.
  uint8_t inv_inlining =
      (DISABLE_INLINING == 0 &&
       max_coalesce * sizeof(spacetime_inv_t) < WINGS_MAX_SUPPORTED_INLINING)
          ? 1
          : 0;
  uint8_t ack_inlining =
      (DISABLE_INLINING == 0 &&
       max_coalesce * ack_size < WINGS_MAX_SUPPORTED_INLINING)
          ? 1
          : 0;
  uint8_t val_inlining =
      (DISABLE_INLINING == 0 &&
       max_coalesce * sizeof(spacetime_val_t) < WINGS_MAX_SUPPORTED_INLINING)
          ? 1
          : 0;

  wings_ud_channel_init(inv_ud_c, inv_qp_name, REQ, (uint8_t)max_coalesce,
                        sizeof(spacetime_inv_t), 0, inv_inlining, is_hdr_only,
                        is_bcast, disable_crd_ctrl, expl_crd_ctrl, ack_ud_c,
                        (uint8_t)credits_num, machine_num, (uint8_t)machine_id,
                        stats_on, prints_on);
  wings_ud_channel_init(ack_ud_c, ack_qp_name, RESP, (uint8_t)max_coalesce,
                        ack_size, sizeof(spacetime_ack_t), ack_inlining,
                        is_hdr_only, 0, disable_crd_ctrl, expl_crd_ctrl,
                        inv_ud_c, (uint8_t)credits_num, machine_num,
                        (uint8_t)machine_id, stats_on, prints_on);
  wings_ud_channel_init(val_ud_c, val_qp_name, REQ, (uint8_t)max_coalesce,
                        sizeof(spacetime_val_t), 0, val_inlining, is_hdr_only,
                        is_bcast, disable_crd_ctrl, 1, crd_ud_c,
                        (uint8_t)credits_num, machine_num, (uint8_t)machine_id,
                        stats_on, prints_on);

  /// <HADES> Failure Detector Init
  // NOTE(review): hw_ctx is initialized only inside the branch below but its
  // address is passed on later by every worker; the callees visible in this
  // file guard on WORKER_WITH_FAILURE_DETECTOR before using it -- confirm no
  // other path reads it.
  hades_wings_ctx_t hw_ctx;
  uint16_t total_ud_qps = TOTAL_WORKER_UD_QPs;
  if (ENABLE_HADES_FAILURE_DETECTION &&
      worker_lid == WORKER_WITH_FAILURE_DETECTOR) {
    total_ud_qps = TOTAL_WORKER_N_FAILURE_DETECTION_UD_QPs;
    ud_channel_t* hviews_c = &ud_channels[TOTAL_WORKER_UD_QPs];
    ud_channel_t* hviews_crd_c = &ud_channels[TOTAL_WORKER_UD_QPs + 1];

    const uint16_t max_views_to_poll = 10;
    const uint32_t send_view_every_us = 100;
    const uint32_t update_local_view_ms = 10;

    hades_wings_ctx_init(&hw_ctx, machine_id, machine_num, max_views_to_poll,
                         send_view_every_us, update_local_view_ms, hviews_c,
                         hviews_crd_c, worker_lid);
  }
  ///

  wings_setup_channel_qps_and_recvs(ud_channel_ptrs, total_ud_qps,
                                    g_share_qs_barrier, worker_lid);

  uint16_t ops_len = (uint16_t)(credits_num * remote_machine_num *
                                max_coalesce); // credits * remote_machines *
                                               // max_req_coalesce
  assert(ops_len >= inv_ud_c->recv_pkt_buff_len);
  assert(ops_len >= ack_ud_c->recv_pkt_buff_len);
  assert(ops_len >= val_ud_c->recv_pkt_buff_len);

  /* -------------------------------------------------------
  ------------------- OTHER DECLARATIONS--------------------
  ---------------------------------------------------------*/
  // Intermediate buffs where reqs are copied from incoming_* buffs in order to
  // get passed to the KVS
  spacetime_op_t* ops;
  spacetime_inv_t* inv_recv_ops;
  spacetime_ack_t* ack_recv_ops; // WARNING!! This can be spacetime_ack_t /
                                 // spacetime_inv_t * depends if RMWs are
                                 // disabled or not
  spacetime_val_t* val_recv_ops;

  setup_kvs_buffs(&ops, &inv_recv_ops, &ack_recv_ops, &val_recv_ops);

  struct spacetime_trace_command* trace;
  trace_init(&trace, worker_gid);

  ////
  spacetime_op_t* n_hottest_keys_in_ops_get[COALESCE_N_HOTTEST_KEYS];
  spacetime_op_t* n_hottest_keys_in_ops_put[COALESCE_N_HOTTEST_KEYS];
  for (int i = 0; i < COALESCE_N_HOTTEST_KEYS; ++i) {
    n_hottest_keys_in_ops_get[i] = NULL;
    n_hottest_keys_in_ops_put[i] = NULL;
  }
  ////

  int node_suspected = -1;
  uint32_t trace_iter = 0;
  uint16_t rolling_inv_index = 0;
  uint16_t invs_polled = 0, acks_polled = 0, vals_polled = 0;
  uint8_t has_outstanding_vals = 0, has_remaining_vals_from_memb_change = 0;
  // One counter per batch slot, zeroed here and handed to refill_ops().
  uint32_t* num_of_iters_serving_op =
      malloc(max_batch_size * sizeof(uint32_t));
  for (int i = 0; i < max_batch_size; ++i)
    num_of_iters_serving_op[i] = 0;

  /// Spawn stats thread
  if (worker_lid == 0) {
    if (spawn_stats_thread() != 0)
      colored_printf(RED, "Stats thread was not successfully spawned \n");
  }

  struct timespec stopwatch_for_req_latency;

  // Membership init
  bit_vector_t* membership_ptr =
      ENABLE_HADES_FAILURE_DETECTION
          ? (bit_vector_t*)&group_membership.g_membership
          : NULL;

  if (ENABLE_HADES_FAILURE_DETECTION) {
    spin_until_all_nodes_are_in_membership(&group_membership, &hw_ctx,
                                           worker_lid);
    printf("~~~~~~~~~ Starting while ! ~~~~~~~~~\n");
  }

  /* -----------------------------------------------------
  ------------------------Main Loop--------------------
  ----------------------------------------------------- */
  // With the failure detector enabled, request execution starts only after
  // more than 2 seconds have elapsed on this stopwatch.
  struct timespec stopwatch_for_fd_warmup;
  get_rdtsc_timespec(&stopwatch_for_fd_warmup);
  uint8_t fd_warmup_time_has_passed = 0;

  while (true) {
    // Check something periodically (e.g., stats)
    if (unlikely(w_stats[worker_lid].total_loops % M_16 == 0)) {
      // print_total_send_recv_msgs_n_credits(&inv_ud_c, &ack_ud_c,
      //                                      &val_ud_c, &crd_ud_c);
    }

    if (!ENABLE_HADES_FAILURE_DETECTION || fd_warmup_time_has_passed == 1) {
      node_suspected = refill_ops(&trace_iter, worker_lid, trace, ops,
                                  num_of_iters_serving_op,
                                  &stopwatch_for_req_latency,
                                  n_hottest_keys_in_ops_get,
                                  n_hottest_keys_in_ops_put);

      hermes_batch_ops_to_KVS(local_ops, (uint8_t*)ops, max_batch_size,
                              sizeof(spacetime_op_t), group_membership, NULL,
                              NULL, (uint8_t)worker_lid);

      stop_latency_of_completed_reads(ops, worker_lid,
                                      &stopwatch_for_req_latency);

      if (update_ratio > 0) {
        ///~~~~~~~~~~~~~~~~~~~~~~INVS~~~~~~~~~~~~~~~~~~~~~~~~~~~
        wings_issue_pkts(inv_ud_c, membership_ptr, (uint8_t*)ops,
                         (uint16_t)max_batch_size, sizeof(spacetime_op_t),
                         &rolling_inv_index, inv_skip_or_get_sender_id,
                         inv_modify_elem_after_send, inv_copy_and_modify_elem);

        /// Poll for INVs
        invs_polled = wings_poll_buff_and_post_recvs(inv_ud_c, ops_len,
                                                     (uint8_t*)inv_recv_ops);

        if (invs_polled > 0) {
          hermes_batch_ops_to_KVS(invs, (uint8_t*)inv_recv_ops, invs_polled,
                                  sizeof(spacetime_inv_t), group_membership,
                                  &node_suspected, ops, (uint8_t)worker_lid);

          ///~~~~~~~~~~~~~~~~~~~~~~ACKS~~~~~~~~~~~~~~~~~~~~~~~~~~~
          wings_issue_pkts(ack_ud_c, membership_ptr, (uint8_t*)inv_recv_ops,
                           invs_polled, sizeof(spacetime_inv_t), NULL,
                           ack_skip_or_get_sender_id,
                           ack_modify_elem_after_send,
                           ack_copy_and_modify_elem);

          // Every received INV must have been answered by now.
          if (ENABLE_ASSERTIONS)
            assert(inv_ud_c->stats.recv_total_msgs ==
                   ack_ud_c->stats.send_total_msgs);
        }

        // New ACKs are polled only when no VALs are still pending, since the
        // pending VALs are issued from the ack receive buffer below.
        if (has_outstanding_vals == 0 &&
            has_remaining_vals_from_memb_change == 0) {
          /// Poll for Acks
          acks_polled = wings_poll_buff_and_post_recvs(
              ack_ud_c, ops_len, (uint8_t*)ack_recv_ops);

          if (acks_polled > 0) {
            hermes_batch_ops_to_KVS(acks, (uint8_t*)ack_recv_ops, acks_polled,
                                    ack_size, group_membership, NULL, ops,
                                    (uint8_t)worker_lid);

            stop_latency_of_completed_writes(ops, worker_lid,
                                             &stopwatch_for_req_latency);
          }
        }

        if (!DISABLE_VALS_FOR_DEBUGGING) {
          ///~~~~~~~~~~~~~~~~~~~~~~ VALs ~~~~~~~~~~~~~~~~~~~~~~~~~~~
          // VALs left over from a membership change (issued from the local
          // ops buffer) take precedence over VALs for freshly polled ACKs.
          if (has_remaining_vals_from_memb_change > 0)
            has_remaining_vals_from_memb_change = wings_issue_pkts(
                val_ud_c, membership_ptr, (uint8_t*)ops, max_batch_size,
                sizeof(spacetime_op_t), NULL,
                memb_change_skip_or_get_sender_id,
                memb_change_modify_elem_after_send,
                memb_change_copy_and_modify_elem);
          else
            has_outstanding_vals = wings_issue_pkts(
                val_ud_c, membership_ptr, (uint8_t*)ack_recv_ops,
                ack_ud_c->recv_pkt_buff_len, ack_size, NULL,
                val_skip_or_get_sender_id, val_modify_elem_after_send,
                val_copy_and_modify_elem);

          /// Poll for Vals
          vals_polled = wings_poll_buff_and_post_recvs(val_ud_c, ops_len,
                                                       (uint8_t*)val_recv_ops);

          if (vals_polled > 0) {
            hermes_batch_ops_to_KVS(vals, (uint8_t*)val_recv_ops, vals_polled,
                                    sizeof(spacetime_val_t), group_membership,
                                    NULL, NULL, (uint8_t)worker_lid);

            ///~~~~~~~~~~~~~~~~~~~~~~CREDITS~~~~~~~~~~~~~~~~~~~~~~~~~~~
            wings_issue_credits(crd_ud_c, membership_ptr,
                                (uint8_t*)val_recv_ops, ops_len,
                                sizeof(spacetime_val_t),
                                rem_write_crd_skip_or_get_sender_id,
                                rem_write_crd_modify_elem_after_send);
          }
        }
      }
    } else if (ENABLE_HADES_FAILURE_DETECTION &&
               time_elapsed_in_sec(stopwatch_for_fd_warmup) > 2) {
      fd_warmup_time_has_passed = 1;
      printf("~~~~~~~~~ Starting execution! ~~~~~~~~~\n");
    }

    // Failure Detection and Membership
    if (ENABLE_HADES_FAILURE_DETECTION) {
      failure_detection_n_membership(ud_channel_ptrs, membership_ptr, &hw_ctx,
                                     worker_lid);

      if (group_membership_has_changed(&group_membership, worker_lid)) {
        /// Complete in-progress updates/replays waiting for ACKs only from
        /// failed nodes
        hermes_batch_ops_to_KVS(local_ops_after_membership_change,
                                (uint8_t*)ops, max_batch_size,
                                sizeof(spacetime_op_t), group_membership, NULL,
                                NULL, (uint8_t)worker_lid);

        stop_latency_of_completed_writes(ops, worker_lid,
                                         &stopwatch_for_req_latency);

        if (!DISABLE_VALS_FOR_DEBUGGING)
          /// Bcast VAL msgs for those completed update/replays
          has_remaining_vals_from_memb_change = wings_issue_pkts(
              val_ud_c, membership_ptr, (uint8_t*)ops, max_batch_size,
              sizeof(spacetime_op_t), NULL, memb_change_skip_or_get_sender_id,
              memb_change_modify_elem_after_send,
              memb_change_copy_and_modify_elem);
      }
    }
    w_stats[worker_lid].total_loops++;
  }
}

================================================
FILE: src/hermes/main.c
================================================
#define _GNU_SOURCE
#include
#include
#include
#include
#include
#include "../../include/utils/bit_vector.h"
#include "../../include/utils/concur_ctrl.h"
#include "../../include/wings/wings_api.h"
#include "config.h"
#include "hrd.h"
#include "spacetime.h"
#include "util.h"

// Global vars
struct latency_counters latency_count;
volatile struct worker_stats w_stats[MAX_WORKERS_PER_MACHINE];
dbit_vector_t* g_share_qs_barrier;
spacetime_group_membership group_membership;

// Global config vars
uint8_t is_CR;
int num_workers;
int update_ratio;
int rmw_ratio;
int credits_num;
int max_coalesce;
int max_batch_size; // for batches to KVS
int machine_num;
int remote_machine_num;
int worker_measuring_latency;

// This is required only when Hades failure detection is disabled.
// Statically initializes the global group_membership: all machine_num nodes
// are marked as members, and w_ack_init is set to the inverted membership
// with this machine's own bit set.
void
group_membership_init(void)
{
  group_membership.num_of_alive_remotes = remote_machine_num;
  seqlock_init(&group_membership.lock);
bv_init((bit_vector_t*)&group_membership.g_membership); for (uint8_t i = 0; i < machine_num; ++i) bv_bit_set((bit_vector_t*)&group_membership.g_membership, i); bv_copy((bit_vector_t*)&group_membership.w_ack_init, group_membership.g_membership); bv_reverse((bit_vector_t*)&group_membership.w_ack_init); bv_bit_set((bit_vector_t*)&group_membership.w_ack_init, (uint8_t)machine_id); } int main(int argc, char* argv[]) { int i, c; is_roce = -1; machine_id = -1; // config vars is_CR = 1; num_workers = -1; update_ratio = -1; rmw_ratio = -1; credits_num = -1; max_coalesce = -1; max_batch_size = -1; remote_IP = (char*)malloc(16 * sizeof(char)); machine_num = -1; remote_machine_num = -1; worker_measuring_latency = -1; // green_printf("UD size: %d ibv_grh + crd size: %d \n", // sizeof(ud_req_crd_t), sizeof(struct ibv_grh) + sizeof(spacetime_crd_t)); // static_assert(sizeof(ud_req_crd_t) == sizeof(struct ibv_grh) + // sizeof(spacetime_crd_t), ""); ///CRD --> 48 Bytes instead of 43 struct thread_params* param_arr; pthread_t* thread_arr; static struct option opts[] = { {.name = "machine-id", .has_arg = 1, .val = 'm'}, {.name = "lat-worker", .has_arg = 1, .val = 'l'}, {.name = "is-roce", .has_arg = 1, .val = 'r'}, {.name = "rmw-ratio", .has_arg = 1, .val = 'R'}, {.name = "dev-name", .has_arg = 1, .val = 'd'}, {.name = "write-ratio", .has_arg = 1, .val = 'w'}, {.name = "num-workers", .has_arg = 1, .val = 'W'}, {.name = "num-machines", .has_arg = 1, .val = 'M'}, {.name = "credits", .has_arg = 1, .val = 'c'}, {.name = "max-coalesce", .has_arg = 1, .val = 'C'}, {.name = "max-batch-size", .has_arg = 1, .val = 'b'}, {.name = "hermes", .has_arg = 0, .val = 'H'}, {0}}; /* Parse and check arguments */ while (1) { c = getopt_long(argc, argv, "m:r:l:R:d:w:c:C:W:M:H", opts, NULL); if (c == -1) break; switch (c) { case 'm': machine_id = atoi(optarg); break; case 'r': is_roce = atoi(optarg); break; case 'l': worker_measuring_latency = atoi(optarg); break; case 'd': memcpy(dev_name, optarg, 
strlen(optarg)); break; // Config vars case 'w': update_ratio = atoi(optarg); break; case 'R': rmw_ratio = atoi(optarg); break; case 'W': num_workers = atoi(optarg); break; case 'c': credits_num = atoi(optarg); break; case 'C': max_coalesce = atoi(optarg); break; case 'b': max_batch_size = atoi(optarg); break; case 'H': is_CR = 0; break; case 'M': machine_num = atoi(optarg); remote_machine_num = machine_num - 1; break; default: printf("Invalid argument %d\n", c); assert(false); } } // If arguments not passed use the default values from header file if (update_ratio == -1) update_ratio = DEFAULT_UPDATE_RATIO; if (rmw_ratio == -1) rmw_ratio = ENABLE_RMWs ? DEFAULT_RMW_RATIO : 0; if (num_workers == -1) num_workers = DEFAULT_WORKERS_PER_MACHINE; if (max_coalesce == -1) max_coalesce = MAX_REQ_COALESCE; if (max_batch_size == -1) max_batch_size = MAX_BATCH_KVS_OPS_SIZE; if (credits_num == -1) credits_num = is_CR ? MAX_CREDITS_PER_REMOTE_WORKER_CR : MAX_CREDITS_PER_REMOTE_WORKER; if (worker_measuring_latency == -1 && DEFAULT_MEASURE_LATENCY) worker_measuring_latency = DEFAULT_WORKER_MEASURING_LATENCY; if (machine_num == -1) { machine_num = MAX_MACHINE_NUM; remote_machine_num = MAX_REMOTE_MACHINES; } assert(ENABLE_RMWs || rmw_ratio == 0); assert(rmw_ratio != 0 || ENABLE_RMWs == 0); // WARNING: Some structs are statically allocated using // MAX_WORKERS_PER_MACHINE / MAX_BATCH_KVS_OPS_SIZE assert(num_workers <= MAX_WORKERS_PER_MACHINE); assert(max_batch_size <= MAX_BATCH_KVS_OPS_SIZE); assert(machine_num > 1 && machine_num <= MAX_MACHINE_NUM); assert(worker_measuring_latency == -1 || worker_measuring_latency < num_workers); assert(!ENABLE_VIRTUAL_NODE_IDS || VIRTUAL_NODE_IDS_PER_NODE > machine_num); assert(!ENABLE_VIRTUAL_NODE_IDS || machine_num * VIRTUAL_NODE_IDS_PER_NODE < 255); if (num_workers > 1) dbv_init(&g_share_qs_barrier, (uint8_t)num_workers); else g_share_qs_barrier = NULL; printf( "update rate: %d (RMW rate %d) | workers %d | batch size %d| CREDITS %d " "| coalesce 
%d |\n", update_ratio, rmw_ratio, num_workers, max_batch_size, credits_num, max_coalesce); thread_arr = malloc(num_workers * sizeof(pthread_t)); param_arr = malloc(num_workers * sizeof(struct thread_params)); pthread_attr_t attr; cpu_set_t cpus_w; group_membership_init(); init_stats((void*)w_stats); spacetime_init(machine_id); pthread_attr_init(&attr); int w_core, init_core = SOCKET_TO_START_SPAWNING_THREADS; for (i = 0; i < num_workers; i++) { if (USE_ALL_SOCKETS && ENABLE_HYPERTHREADING) w_core = init_core + i; else w_core = 2 * i + init_core; assert(w_core < TOTAL_HW_CORES); assert(ENABLE_HYPERTHREADING || w_core < TOTAL_NUMBER_OF_SOCKETS * TOTAL_CORES_PER_SOCKET); param_arr[i].id = i; CPU_ZERO(&cpus_w); CPU_SET(w_core, &cpus_w); pthread_attr_setaffinity_np(&attr, sizeof(cpu_set_t), &cpus_w); pthread_create(&thread_arr[i], &attr, run_worker, ¶m_arr[i]); } colored_printf(YELLOW, "Sizes: {Op: %d, Object Meta %d, Value %d},\n", sizeof(spacetime_op_t), sizeof(spacetime_object_meta), ST_VALUE_SIZE); colored_printf(YELLOW, "Coherence msg Sizes: {Inv: %d, Ack: %d, Val: %d}\n", sizeof(spacetime_inv_t), sizeof(spacetime_ack_t), sizeof(spacetime_val_t)); colored_printf( YELLOW, "Max Coalesce Packet Sizes: {Inv: %d, Ack: %d, Val: %d}\n", sizeof(wings_ud_send_pkt_t) + max_coalesce * sizeof(spacetime_inv_t), sizeof(wings_ud_send_pkt_t) + max_coalesce * sizeof(spacetime_ack_t), sizeof(wings_ud_send_pkt_t) + max_coalesce * sizeof(spacetime_val_t)); for (i = 0; i < num_workers; i++) pthread_join(thread_arr[i], NULL); return 0; } ////////////////////////////////////////////////////////////////////////////////// /// Static asserts to ensure only correct configs ////////////////////////////////////////////////////////////////////////////////// static_assert(MICA_MAX_VALUE >= ST_VALUE_SIZE, ""); static_assert(MAX_MACHINE_NUM <= 8, ""); // TODO haven't test bit vectors with more than 8 nodes static_assert(MAX_MACHINE_NUM <= GROUP_MEMBERSHIP_ARRAY_SIZE * 8, ""); // bit vector for 
acks / group membership static_assert(MAX_MACHINE_NUM <= 255, ""); static_assert(KV_SOCKET < TOTAL_NUMBER_OF_SOCKETS && SOCKET_TO_START_SPAWNING_THREADS < TOTAL_NUMBER_OF_SOCKETS, ""); static_assert((ENABLE_HYPERTHREADING == 1 && USE_ALL_SOCKETS == 1) || MAX_WORKERS_PER_MACHINE <= TOTAL_CORES_PER_SOCKET, ""); static_assert(MAX_WORKERS_PER_MACHINE <= TOTAL_HW_CORES, ""); /// Assertions for failures static_assert(FAKE_FAILURE == 0 || NODE_TO_FAIL < MAX_MACHINE_NUM, ""); static_assert(FAKE_FAILURE == 0 || ROUNDS_BEFORE_FAILURE < PRINT_NUM_STATS_BEFORE_EXITING, ""); static_assert(FAKE_FAILURE == 0 || WORKER_WITH_FAILURE_DETECTOR < MAX_WORKERS_PER_MACHINE, ""); static_assert(MAX_MACHINE_NUM < TIE_BREAKER_ID_EMPTY, ""); static_assert(MAX_MACHINE_NUM < LAST_WRITER_ID_EMPTY, ""); static_assert(MAX_BATCH_KVS_OPS_SIZE < ST_OP_BUFFER_INDEX_EMPTY, ""); /// 1B write_buffer_index and 255 is used as "empty" value /// Make sure that assigned numbers to States are monotonically increasing with /// the following order static_assert(VALID_STATE < INVALID_STATE, ""); static_assert(INVALID_STATE < INVALID_WRITE_STATE, ""); static_assert(INVALID_WRITE_STATE < WRITE_STATE, ""); static_assert(WRITE_STATE < REPLAY_STATE, ""); static_assert(ENABLE_RMWs == 0 || ENABLE_RMWs == 1, ""); ================================================ FILE: src/hermes/spacetime.c ================================================ // // Created by akatsarakis on 04/05/18. 
// #include #include #include #include #include "../../include/utils/concur_ctrl.h" /* * Initialize the spacetime using a Mica instances and adding the timestamps * and locks to the keys of mica-herd-herd structure */ struct spacetime_kv kv; void spacetime_object_meta_init(spacetime_object_meta* ol) { cctrl_init(&ol->cctrl); ol->state = VALID_STATE; ol->last_writer_id = LAST_WRITER_ID_EMPTY; ol->op_buffer_index = ST_OP_BUFFER_INDEX_EMPTY; } void spacetime_init(int instance_id) { // TODO may add kvs stats mica_init(&kv.hash_table, instance_id, KV_SOCKET, SPACETIME_NUM_BKTS, SPACETIME_LOG_CAP); spacetime_populate_fixed_len(&kv, SPACETIME_NUM_KEYS, KVS_VALUE_SIZE); } void spacetime_populate_fixed_len(struct spacetime_kv* _kv, int n, int val_len) { assert(n > 0); assert(val_len > 0 && val_len <= KVS_VALUE_SIZE); /* This is needed for the eviction message below to make sense */ assert(_kv->hash_table.num_insert_op == 0 && _kv->hash_table.num_index_evictions == 0); struct mica_op op; struct mica_resp resp; unsigned long long* op_key = (unsigned long long*)&op.key; spacetime_object_meta initial_meta; spacetime_object_meta_init(&initial_meta); /* Generate the keys to insert */ uint128* key_arr = mica_gen_keys(n); op.val_len = (uint8_t)(val_len >> SHIFT_BITS); op.opcode = ST_OP_PUT; spacetime_object_meta* value_ptr = (spacetime_object_meta*)op.value; memcpy((void*)value_ptr, (void*)&initial_meta, sizeof(spacetime_object_meta)); for (int i = n - 1; i >= 0; i--) { op_key[0] = key_arr[i].first; op_key[1] = key_arr[i].second; /// printf("Key Metadata: Lock(%u), State(%u), Counter(%u:%u)\n", /// op.key.meta.lock, /// op.key.meta.state, op.key.meta.version, op.key.meta.cid); uint8_t val = (uint8_t)('a' + (i % 20)); memset((void*)&value_ptr[1], val, ST_VALUE_SIZE); mica_insert_one(&_kv->hash_table, &op, &resp); } assert(_kv->hash_table.num_insert_op == n); colored_printf(YELLOW, "Spacetime: Populated instance %d with %d keys, length = %d. 
" "Index eviction fraction = %.4f.\n", _kv->hash_table.instance_id, n, val_len, (double)_kv->hash_table.num_index_evictions / _kv->hash_table.num_insert_op); } ================================================ FILE: src/hermes/stats.c ================================================ #include "util.h" static inline void xput_file_name(char* filename) { char* path = "./results/xput/per-node"; sprintf(filename, "%s/%s_xPut_m_%d_wr_%.1f_rmw_%.1f_wk_%d_b_%d_c_%d%s-%d.txt", path, is_CR == 1 ? "CR" : "Hermes", machine_num, update_ratio / 10.0, rmw_ratio / 10.0, num_workers, max_batch_size, credits_num, FEED_FROM_TRACE == 1 ? "_a_0.99" : "_uni", machine_id); } // assuming microsecond latency void dump_xput_stats(double xput_in_miops) { static uint8_t no_func_calls = 0; /// WARNING this is not thread safe. assert(no_func_calls < 250); FILE* xput_stats_fd; char filename[128]; xput_file_name(filename); const char* open_mode = no_func_calls == 0 ? "w" : "a"; xput_stats_fd = fopen(filename, open_mode); fprintf(xput_stats_fd, "node%d_miops-%d: %.2f\n", machine_id, no_func_calls, xput_in_miops); fclose(xput_stats_fd); no_func_calls++; // printf("xPut stats saved at %s\n", filename); } // assuming microsecond latency void dump_latency_stats(void) { FILE* latency_stats_fd; char filename[128]; char* path = "./results/latency"; sprintf(filename, "%s/%s_latency_m_%d_w_%d_b_%d_wr_%d_rmw_%d_c_%d%s.csv", path, is_CR == 1 ? "CR" : "Hermes", machine_num, num_workers, max_batch_size, update_ratio, rmw_ratio, credits_num, FEED_FROM_TRACE == 1 ? 
"_a_0.99" : ""); latency_stats_fd = fopen(filename, "w"); fprintf(latency_stats_fd, "#---------------- Read Reqs --------------\n"); for (int i = 0; i < LATENCY_BUCKETS; ++i) fprintf(latency_stats_fd, "reads: %d, %d\n", i * LATENCY_PRECISION, latency_count.read_reqs[i]); fprintf(latency_stats_fd, "reads: -1, %d\n", latency_count.read_reqs[LATENCY_BUCKETS]); // print outliers fprintf(latency_stats_fd, "reads-hl: %d\n", latency_count.max_read_latency); // print max read latency fprintf(latency_stats_fd, "#---------------- Write Reqs ---------------\n"); for (int i = 0; i < LATENCY_BUCKETS; ++i) fprintf(latency_stats_fd, "writes: %d, %d\n", i * LATENCY_PRECISION, latency_count.write_reqs[i]); fprintf(latency_stats_fd, "writes: -1, %d\n", latency_count.write_reqs[LATENCY_BUCKETS]); // print outliers fprintf(latency_stats_fd, "writes-hl: %d\n", latency_count.max_write_latency); // print max write latency fclose(latency_stats_fd); printf("Latency stats saved at %s\n", filename); } static inline double safe_division(double a, double b) { return b == 0 ? 
0 : a / b; } void* print_stats_thread(void* no_arg) { uint16_t i, print_count = 0; long long all_worker_xput = 0; long long all_worker_wrs = 0; long long all_worker_rmws = 0; long long all_worker_aborted_rmws = 0; double total_throughput = 0; double total_rd_throughput = 0; double total_rmw_aborts = 0; double total_wr_throughput = 0; double total_rmw_throughput = 0; // int sleep_time = 20; struct worker_stats curr_w_stats[MAX_WORKERS_PER_MACHINE], prev_w_stats[MAX_WORKERS_PER_MACHINE]; struct stats all_stats; sleep(4); memcpy(prev_w_stats, (void*)w_stats, MAX_WORKERS_PER_MACHINE * (sizeof(struct worker_stats))); struct timespec start, end; clock_gettime(CLOCK_REALTIME, &start); while (true) { usleep(PRINT_STATS_EVERY_MSECS * 1000); clock_gettime(CLOCK_REALTIME, &end); double seconds = (end.tv_sec - start.tv_sec) + (double)(end.tv_nsec - start.tv_nsec) / 1000000001; start = end; memcpy(curr_w_stats, (void*)w_stats, MAX_WORKERS_PER_MACHINE * (sizeof(struct worker_stats))); all_worker_xput = 0; all_worker_wrs = 0; all_worker_rmws = 0; all_worker_aborted_rmws = 0; print_count++; if (FAKE_FAILURE == 1 && machine_id == NODE_TO_FAIL && print_count == ROUNDS_BEFORE_FAILURE) { colored_printf(RED, "---------------------------------------\n"); colored_printf(RED, "------------ NODE FAILED ------------\n"); colored_printf(RED, "---------------------------------------\n"); exit(0); } if (EXIT_ON_STATS_PRINT == 1 && print_count == PRINT_NUM_STATS_BEFORE_EXITING) { if (worker_measuring_latency != -1 && machine_id == 0) dump_latency_stats(); if (DUMP_XPUT_STATS_TO_FILE) { char filename[128]; xput_file_name(filename); printf("xPut stats (of this node) saved at %s\n", filename); } printf("---------------------------------------\n"); printf("------------ RUN FINISHED -------------\n"); printf("---------------------------------------\n"); exit(0); } seconds *= MILLION; // compute only MIOPS for (i = 0; i < num_workers; i++) { all_worker_xput += curr_w_stats[i].completed_ops_per_worker 
- prev_w_stats[i].completed_ops_per_worker; all_worker_wrs += curr_w_stats[i].completed_wrs_per_worker - prev_w_stats[i].completed_wrs_per_worker; all_worker_rmws += curr_w_stats[i].completed_rmws_per_worker - prev_w_stats[i].completed_rmws_per_worker; all_worker_aborted_rmws += curr_w_stats[i].aborted_rmws_per_worker - prev_w_stats[i].aborted_rmws_per_worker; all_stats.xput_per_worker[i] = (curr_w_stats[i].completed_ops_per_worker - prev_w_stats[i].completed_ops_per_worker) / seconds; all_stats.rmw_xput_per_worker[i] = (curr_w_stats[i].completed_rmws_per_worker - prev_w_stats[i].completed_rmws_per_worker) / seconds; all_stats.rmw_abort_rate_per_worker[i] = safe_division((curr_w_stats[i].aborted_rmws_per_worker - prev_w_stats[i].aborted_rmws_per_worker), (curr_w_stats[i].completed_rmws_per_worker - prev_w_stats[i].completed_rmws_per_worker)); } memcpy(prev_w_stats, curr_w_stats, MAX_WORKERS_PER_MACHINE * (sizeof(struct worker_stats))); total_throughput = all_worker_xput / seconds; total_wr_throughput = all_worker_wrs / seconds; total_rmw_throughput = all_worker_rmws / seconds; total_rmw_aborts = safe_division(all_worker_aborted_rmws, all_worker_rmws); total_rd_throughput = total_throughput - total_wr_throughput - total_rmw_throughput; printf("---------------PRINT %d time elapsed %.2f---------------\n", print_count, seconds / MILLION); colored_printf(GREEN, "NODE MReqs/s: %.2f \n(Rd|Wr|RMW: %.2f|%.2f|%.2f) | RMW " "aborts: %.2f%%)\n", total_throughput, total_rd_throughput, total_wr_throughput, total_rmw_throughput, 100 * total_rmw_aborts); if (PRINT_WORKER_STATS) { for (i = 0; i < num_workers; i++) { // yellow_printf("W%d: %.2f MIOPS-Batch %.2f(%.2f) -H %.2f -W // %llu -E %.2f -AC %.2f \n", i, // all_stats.xput_per_worker[i], // all_stats.batch_size_per_worker[i], // all_stats.stalled_time_per_worker[i], // trace_ratio, curr_w_stats[i].wasted_loops, // all_stats.empty_reqs_per_worker[i], // all_stats.average_coalescing_per_worker[i]); 
all_stats.issued_invs_avg_coalesing[i] = w_stats[i].issued_invs_per_worker / (double)w_stats[i].issued_packet_invs_per_worker; all_stats.issued_acks_avg_coalesing[i] = w_stats[i].issued_acks_per_worker / (double)w_stats[i].issued_packet_acks_per_worker; all_stats.issued_vals_avg_coalesing[i] = w_stats[i].issued_vals_per_worker / (double)w_stats[i].issued_packet_vals_per_worker; all_stats.issued_crds_avg_coalesing[i] = w_stats[i].issued_crds_per_worker / (double)w_stats[i].issued_packet_crds_per_worker; all_stats.received_invs_avg_coalesing[i] = w_stats[i].received_invs_per_worker / (double)w_stats[i].received_packet_invs_per_worker; all_stats.received_acks_avg_coalesing[i] = w_stats[i].received_acks_per_worker / (double)w_stats[i].received_packet_acks_per_worker; all_stats.received_vals_avg_coalesing[i] = w_stats[i].received_vals_per_worker / (double)w_stats[i].received_packet_vals_per_worker; all_stats.received_crds_avg_coalesing[i] = w_stats[i].received_crds_per_worker / (double)w_stats[i].received_packet_crds_per_worker; all_stats.percentage_of_wasted_loops[i] = w_stats[i].wasted_loops / (double)w_stats[i].total_loops * 100; all_stats.completed_reqs_per_loop[i] = curr_w_stats[i].completed_ops_per_worker / (double)w_stats[i].total_loops; colored_printf(CYAN, "W%d: ", i); colored_printf(YELLOW, "%.2f MIOPS, Coalescing{Inv: %.2f, Ack: %.2f, Val: " "%.2f, Crd: %.2f}\n", all_stats.xput_per_worker[i], all_stats.issued_invs_avg_coalesing[i], all_stats.issued_acks_avg_coalesing[i], all_stats.issued_vals_avg_coalesing[i], all_stats.issued_crds_avg_coalesing[i]); colored_printf(YELLOW, "\t wasted_loops: %.2f%, reqs per loop: %.2f, total " "reqs %d, reqs missed: %d\n", all_stats.percentage_of_wasted_loops[i], all_stats.completed_reqs_per_loop[i], curr_w_stats[i].completed_ops_per_worker, curr_w_stats[i].reqs_missed_in_kvs); } colored_printf(GREEN, "NODE MReqs/s: %.2f \n", total_throughput); printf("---------------------------------------\n"); } if (DUMP_XPUT_STATS_TO_FILE) 
dump_xput_stats(total_throughput); } } ================================================ FILE: src/hermes/util.c ================================================ // // Created by akatsarakis on 15/03/18. // #define _GNU_SOURCE #include "util.h" #include "hrd.h" #include "inline-util.h" #include "spacetime.h" int spawn_stats_thread(void) { pthread_t* thread_arr = malloc(sizeof(pthread_t)); pthread_attr_t attr; cpu_set_t cpus_stats; pthread_attr_init(&attr); CPU_ZERO(&cpus_stats); if (DEFAULT_THREAD_OF_STAT_THREAD != -1) { CPU_SET(DEFAULT_THREAD_OF_STAT_THREAD, &cpus_stats); } else { if (MAX_WORKERS_PER_MACHINE > 17) CPU_SET(39, &cpus_stats); else CPU_SET(2 * MAX_WORKERS_PER_MACHINE + 2, &cpus_stats); } pthread_attr_setaffinity_np(&attr, sizeof(cpu_set_t), &cpus_stats); return pthread_create(&thread_arr[0], &attr, print_stats_thread, NULL); } uint8_t is_state_code(uint8_t code) { switch (code) { // Object States case VALID_STATE: case WRITE_STATE: case REPLAY_STATE: case INVALID_STATE: case INVALID_WRITE_STATE: return 1; default: return 0; } } uint8_t is_input_code(uint8_t code) { switch (code) { // Input opcodes case ST_OP_GET: case ST_OP_PUT: case ST_OP_RMW: case ST_OP_INV: case ST_OP_ACK: case ST_OP_VAL: case ST_OP_CRD: case ST_OP_MEMBERSHIP_CHANGE: case ST_OP_MEMBERSHIP_COMPLETE: return 1; default: return 0; } } uint8_t is_response_code(uint8_t code) { switch (code) { case ST_GET_COMPLETE: case ST_PUT_SUCCESS: case ST_PUT_COMPLETE: case ST_REPLAY_SUCCESS: case ST_REPLAY_COMPLETE: case ST_INV_SUCCESS: case ST_ACK_SUCCESS: case ST_VAL_SUCCESS: case ST_LAST_ACK_SUCCESS: case ST_LAST_ACK_NO_BCAST_SUCCESS: case ST_MISS: case ST_GET_STALL: case ST_PUT_STALL: case ST_PUT_COMPLETE_SEND_VALS: case ST_INV_OUT_OF_GROUP: // RMW case ST_RMW_ABORT: case ST_RMW_STALL: case ST_RMW_SUCCESS: case ST_RMW_COMPLETE: return 1; default: return 0; } } uint8_t is_bucket_state_code(uint8_t code) { switch (code) { case ST_NEW: case ST_EMPTY: case ST_COMPLETE: case ST_IN_PROGRESS_GET: case 
ST_IN_PROGRESS_PUT: case ST_IN_PROGRESS_RMW: case ST_IN_PROGRESS_REPLAY: return 1; default: return 0; } } char* code_to_str(uint8_t code) { switch (code) { // Object States case VALID_STATE: return "VALID_STATE"; case INVALID_STATE: return "INVALID_STATE"; case INVALID_WRITE_STATE: return "INVALID_WRITE_STATE"; case WRITE_STATE: return "WRITE_STATE"; case REPLAY_STATE: return "REPLAY_STATE"; // Input opcodes case ST_OP_GET: return "ST_OP_GET"; case ST_OP_PUT: return "ST_OP_PUT"; case ST_OP_RMW: return "ST_OP_RMW"; case ST_OP_INV: return "ST_OP_INV"; case ST_OP_INV_ABORT: return "ST_OP_INV_ABORT"; case ST_OP_ACK: return "ST_OP_ACK"; case ST_OP_VAL: return "ST_OP_VAL"; case ST_OP_CRD: return "ST_OP_CRD"; case ST_OP_MEMBERSHIP_CHANGE: return "ST_OP_MEMBERSHIP_CHANGE"; case ST_OP_MEMBERSHIP_COMPLETE: return "ST_OP_MEMBERSHIP_COMPLETE"; // Response opcodes case ST_GET_COMPLETE: return "ST_GET_COMPLETE"; case ST_PUT_SUCCESS: return "ST_PUT_SUCCESS"; case ST_PUT_COMPLETE: return "ST_PUT_COMPLETE"; case ST_RMW_SUCCESS: return "ST_RMW_SUCCESS"; case ST_RMW_COMPLETE: return "ST_RMW_COMPLETE"; case ST_REPLAY_SUCCESS: return "ST_REPLAY_SUCCESS"; case ST_REPLAY_COMPLETE: return "ST_REPLAY_COMPLETE"; case ST_INV_SUCCESS: return "ST_INV_SUCCESS"; case ST_ACK_SUCCESS: return "ST_ACK_SUCCESS"; case ST_VAL_SUCCESS: return "ST_VAL_SUCCESS"; case ST_LAST_ACK_SUCCESS: return "ST_LAST_ACK_SUCCESS"; case ST_LAST_ACK_NO_BCAST_SUCCESS: return "ST_LAST_ACK_NO_BCAST_SUCCESS"; case ST_MISS: return "\033[31mST_MISS\033[0m"; case ST_GET_STALL: return "ST_GET_STALL"; case ST_PUT_STALL: return "ST_PUT_STALL"; case ST_RMW_STALL: return "ST_RMW_STALL"; case ST_RMW_ABORT: return "ST_RMW_ABORT"; case ST_PUT_COMPLETE_SEND_VALS: return "ST_PUT_COMPLETE_SEND_VALS"; case ST_RMW_COMPLETE_SEND_VALS: return "ST_RMW_COMPLETE_SEND_VALS"; case ST_REPLAY_COMPLETE_SEND_VALS: return "ST_REPLAY_COMPLETE_SEND_VALS"; case ST_INV_OUT_OF_GROUP: return "ST_INV_OUT_OF_GROUP"; case ST_SEND_CRD: return "ST_SEND_CRD"; // 
Ops bucket states case ST_EMPTY: return "ST_EMPTY"; case ST_NEW: return "ST_NEW"; case ST_IN_PROGRESS_PUT: return "ST_IN_PROGRESS_PUT"; case ST_IN_PROGRESS_RMW: return "ST_IN_PROGRESS_RMW"; case ST_IN_PROGRESS_REPLAY: return "ST_IN_PROGRESS_REPLAY"; case ST_COMPLETE: return "ST_COMPLETE"; // Buffer Types case ST_INV_BUFF: return "ST_INV_BUFF"; case ST_ACK_BUFF: return "ST_ACK_BUFF"; case ST_VAL_BUFF: return "ST_VAL_BUFF"; case ST_CRD_BUFF: return "ST_CRD_BUFF"; case NOP: return "NOP"; // Failure related case ST_OP_HEARTBEAT: return "ST_OP_HEARTBEAT"; case ST_OP_SUSPICION: return "ST_OP_SUSPICION"; default: { printf("Wrong code (%d)\n", code); assert(0); } } } // Creates a trace with a uniform distribution without a backing file void create_uni_trace(struct spacetime_trace_command** cmds, int worker_gid) { srand(time(NULL) + worker_gid * 7); *cmds = malloc((NUM_OF_REP_REQS + 1) * sizeof(struct spacetime_trace_command)); int rmws = 0; uint32_t i, writes = 0; // parse file line by line and insert trace to cmd. for (i = 0; i < NUM_OF_REP_REQS; i++) { // Before reading the request deside if it's gone be read or write (*cmds)[i].opcode = (uint8_t)(update_ratio == 1000 || ((rand() % 1000 < update_ratio)) ? ST_OP_PUT : ST_OP_GET); if (ENABLE_RMWs && (*cmds)[i].opcode == ST_OP_PUT) (*cmds)[i].opcode = (uint8_t)(rmw_ratio == 1000 || ((rand() % 1000 < rmw_ratio)) ? ST_OP_RMW : ST_OP_PUT); if ((*cmds)[i].opcode == ST_OP_RMW) rmws++; if ((*cmds)[i].opcode == ST_OP_PUT) writes++; //--- KEY ID---------- uint32 key_id = KEY_NUM != 0 ? (uint32)rand() % KEY_NUM : (uint32)rand() % SPACETIME_NUM_KEYS; if (USE_A_SINGLE_KEY == 1) key_id = 0; uint128 key_hash = CityHash128((char*)&(key_id), 4); // memcpy(&(*cmds)[i].key_hash, &key_hash, 16); // this is for 16B // keys memcpy(&(*cmds)[i].key_hash, &((uint64_t*)&key_hash)[1], 8); (*cmds)[i].key_id = (uint8_t)(key_id < 255 ? 
key_id : ST_KEY_ID_255_OR_HIGHER); } if (worker_gid % num_workers == 0) printf( "Update Ratio: %.2f%% (Writes|RMWs: %.2f%%|%.2f%%)\n" "Trace w_size %d \n", (double)((writes + rmws) * 100) / NUM_OF_REP_REQS, (double)(writes * 100) / NUM_OF_REP_REQS, (double)(rmws * 100) / NUM_OF_REP_REQS, NUM_OF_REP_REQS); (*cmds)[NUM_OF_REP_REQS].opcode = NOP; // printf("CLient %d Trace w_size: %d, debug counter %d hot keys %d, cold keys // %d \n",l_id, cmd_count, debug_cnt, // t_stats[l_id].hot_keys_per_trace, t_stats[l_id].cold_keys_per_trace // ); } // Parse a trace, use this only for skewed workloads as uniform trace can be // created (see create_uni_trace) int parse_trace(char* path, struct spacetime_trace_command** cmds, int worker_gid) { FILE* fp; ssize_t read; size_t len = 0; char* ptr; char* word; char* saveptr; char* line = NULL; int rmws = 0; int writes = 0; int cmd_count = 0; uint32_t hottest_key_counter = 0; uint32_t ten_hottest_keys_counter = 0; uint32_t twenty_hottest_keys_counter = 0; fp = fopen(path, "r"); if (fp == NULL) { printf("ERROR: Cannot open file: %s\n", path); exit(EXIT_FAILURE); } while ((read = getline(&line, &len, fp)) != -1) cmd_count++; // printf("File %s has %d lines \n", path, cmd_count); fclose(fp); if (line) free(line); len = 0; line = NULL; fp = fopen(path, "r"); if (fp == NULL) { printf("ERROR: Cannot open file: %s\n", path); exit(EXIT_FAILURE); } (*cmds) = malloc((cmd_count + 1) * sizeof(struct spacetime_trace_command)); // Initialize random with a seed based on local time and a worker / machine id srand((unsigned int)(time(NULL) + worker_gid * 7)); int debug_cnt = 0; // parse file line by line and insert trace to cmd. 
for (int i = 0; i < cmd_count; i++) { if ((read = getline(&line, &len, fp)) == -1) { printf("ERROR: Problem while reading the trace!\n"); exit(1); } int word_count = 0; assert(word_count == 0); word = strtok_r(line, " ", &saveptr); // Before reading the request deside if it's gone be read or write (*cmds)[i].opcode = (uint8_t)(update_ratio == 1000 || ((rand() % 1000 < update_ratio)) ? ST_OP_PUT : ST_OP_GET); if (ENABLE_RMWs && (*cmds)[i].opcode == ST_OP_PUT) (*cmds)[i].opcode = (uint8_t)(rmw_ratio == 1000 || ((rand() % 1000 < rmw_ratio)) ? ST_OP_RMW : ST_OP_PUT); if ((*cmds)[i].opcode == ST_OP_PUT) writes++; if ((*cmds)[i].opcode == ST_OP_RMW) rmws++; while (word != NULL) { if (word[strlen(word) - 1] == '\n') word[strlen(word) - 1] = 0; if (word_count == 0) { uint32_t key_id = (uint32_t)strtoul(word, &ptr, 10); if (key_id == 0) hottest_key_counter++; if (key_id < 10) ten_hottest_keys_counter++; if (key_id < 20) twenty_hottest_keys_counter++; uint128 key_hash = CityHash128((char*)&(key_id), 4); // memcpy(&(*cmds)[i].key_hash, &key_hash, 16); // this is // for 16B keys memcpy(&(*cmds)[i].key_hash, &((uint64_t*)&key_hash)[1], 8); // this is for 8B keys (*cmds)[i].key_id = (uint8_t)(key_id < 255 ? 
key_id : ST_KEY_ID_255_OR_HIGHER); debug_cnt++; } word_count++; word = strtok_r(NULL, " ", &saveptr); if (word == NULL && word_count != 1) { printf("Client %d Error: Reached word %d in line %d : %s \n", worker_gid, word_count, i, line); assert(0); } } } if (worker_gid % num_workers == 0) { printf( "Trace size: %d | Hottest key (10 | 20 keys): %.2f%% (%.2f | %.2f " "%%)\n", cmd_count, (100 * hottest_key_counter / (double)cmd_count), (100 * ten_hottest_keys_counter / (double)cmd_count), (100 * twenty_hottest_keys_counter / (double)cmd_count)); printf("Update Ratio: %.2f%% (Writes|RMWs: %.2f%%|%.2f%%)\n", (double)((writes + rmws) * 100) / cmd_count, (double)(writes * 100) / cmd_count, (double)(rmws * 100) / cmd_count); } (*cmds)[cmd_count].opcode = NOP; // printf("Thread %d Trace w_size: %d, debug counter %d hot keys %d, cold keys // %d \n",l_id, cmd_count, debug_cnt, // t_stats[l_id].hot_keys_per_trace, t_stats[l_id].cold_keys_per_trace // ); assert(cmd_count == debug_cnt); fclose(fp); if (line) free(line); return cmd_count; } void trace_init(struct spacetime_trace_command** trace, uint16_t worker_gid) { // create the trace path path if (FEED_FROM_TRACE == 1) { char local_client_id[6]; char machine_num[4]; // get / create path for the trace sprintf(local_client_id, "%d", worker_gid % num_workers); sprintf(machine_num, "%d", machine_id); char path[2048]; char cwd[1024]; char* was_successful = getcwd(cwd, sizeof(cwd)); if (!was_successful) { printf("ERROR: getcwd failed!\n"); exit(EXIT_FAILURE); } double zipf_exponent = ZIPF_EXPONENT_OF_TRACE / 100.0; snprintf(path, sizeof(path), "%s%s%04d%s%.2f%s", cwd, "/../../traces/current-splitted-traces/t_", worker_gid, "_a_", zipf_exponent, ".txt"); // initialize the command array from the trace file parse_trace(path, trace, worker_gid); } else create_uni_trace(trace, worker_gid); } // set up the OPS buffers void setup_kvs_buffs(spacetime_op_t** ops, spacetime_inv_t** inv_recv_ops, spacetime_ack_t** ack_recv_ops, 
spacetime_val_t** val_recv_ops) { *ops = memalign(4096, MAX_BATCH_KVS_OPS_SIZE * (sizeof(spacetime_op_t))); memset(*ops, 0, MAX_BATCH_KVS_OPS_SIZE * (sizeof(spacetime_op_t))); assert(ops != NULL); // Dirty way to support ACKs that might be as big as INVs uint16_t ack_size = ENABLE_RMWs ? sizeof(spacetime_inv_t) : sizeof(spacetime_ack_t); spacetime_inv_t** rmw_ack_r_ops = (spacetime_inv_t**)ack_recv_ops; /// Network ops /// TODO should we memalign aswell? uint32_t no_ops = (uint32_t)(credits_num * MAX_REMOTE_MACHINES * max_coalesce); // credits * remote_machines * max_req_coalesce // uint32_t no_ops = (uint32_t) (credits_num * remote_machine_num * // max_coalesce); //credits * remote_machines * max_req_coalesce *inv_recv_ops = (spacetime_inv_t*)malloc(no_ops * sizeof(spacetime_inv_t)); *ack_recv_ops = (spacetime_ack_t*)malloc(no_ops * ack_size); *val_recv_ops = (spacetime_val_t*)malloc( no_ops * sizeof(spacetime_val_t)); /* Batch of incoming broadcasts for the Cache*/ assert(*inv_recv_ops != NULL && *ack_recv_ops != NULL && *val_recv_ops != NULL); memset(*inv_recv_ops, 0, no_ops * sizeof(spacetime_inv_t)); memset(*ack_recv_ops, 0, no_ops * sizeof(spacetime_ack_t)); memset(*val_recv_ops, 0, no_ops * sizeof(spacetime_val_t)); for (int i = 0; i < no_ops; ++i) { (*val_recv_ops)[i].opcode = ST_EMPTY; (*inv_recv_ops)[i].op_meta.opcode = ST_EMPTY; if (ENABLE_RMWs == 0) (*ack_recv_ops)[i].opcode = ST_EMPTY; else (*rmw_ack_r_ops)[i].op_meta.opcode = ST_EMPTY; } for (int i = 0; i < MAX_BATCH_KVS_OPS_SIZE; ++i) { (*ops)[i].op_meta.opcode = ST_EMPTY; (*ops)[i].op_meta.state = ST_EMPTY; } } ================================================ FILE: src/mica-herd/city.c ================================================ // city.c - cityhash-c // CityHash on C // Copyright (c) 2011-2012, Alexander Nusov // // - original copyright notice - // Copyright (c) 2011 Google, Inc. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. // // CityHash, by Geoff Pike and Jyrki Alakuijala // // This file provides CityHash64() and related functions. // // It's probably possible to create even faster hash functions by // writing a program that systematically explores some of the space of // possible hash functions, by using SIMD instructions, or by // compromising on hash quality. 
#include "city.h"
#include <string.h>

/* Loads 8 bytes from p via memcpy, so p has no alignment requirement. */
static uint64 UNALIGNED_LOAD64(const char* p) {
  uint64 result;
  memcpy(&result, p, sizeof(result));
  return result;
}

/* Loads 4 bytes from p via memcpy, so p has no alignment requirement. */
static uint32 UNALIGNED_LOAD32(const char* p) {
  uint32 result;
  memcpy(&result, p, sizeof(result));
  return result;
}

/* On big-endian builds (WORDS_BIGENDIAN) loaded words are byte-swapped. */
#if !defined(WORDS_BIGENDIAN)

#define uint32_in_expected_order(x) (x)
#define uint64_in_expected_order(x) (x)

#else

#ifdef _MSC_VER
#include <stdlib.h>
#define bswap_32(x) _byteswap_ulong(x)
#define bswap_64(x) _byteswap_uint64(x)

#elif defined(__APPLE__)
// Mac OS X / Darwin features
#include <libkern/OSByteOrder.h>
#define bswap_32(x) OSSwapInt32(x)
#define bswap_64(x) OSSwapInt64(x)

#else
#include <byteswap.h>
#endif

#define uint32_in_expected_order(x) (bswap_32(x))
#define uint64_in_expected_order(x) (bswap_64(x))

#endif  // WORDS_BIGENDIAN

#if !defined(LIKELY)
#if HAVE_BUILTIN_EXPECT
#define LIKELY(x) (__builtin_expect(!!(x), 1))
#else
#define LIKELY(x) (x)
#endif
#endif

/* Fetches a 64-bit word from p in the byte order the hash expects. */
static uint64 Fetch64(const char* p) {
  return uint64_in_expected_order(UNALIGNED_LOAD64(p));
}

/* Fetches a 32-bit word from p in the byte order the hash expects. */
static uint32 Fetch32(const char* p) {
  return uint32_in_expected_order(UNALIGNED_LOAD32(p));
}

// Some primes between 2^63 and 2^64 for various uses.
static const uint64 k0 = 0xc3a5c85c97cb3127ULL;
static const uint64 k1 = 0xb492b66fbe98f273ULL;
static const uint64 k2 = 0x9ae16a3b2f90404fULL;
static const uint64 k3 = 0xc949d7c7509e6557ULL;

// Hash 128 input bits down to 64 bits of output.
// This is intended to be a reasonably good hash function.
static inline uint64 Hash128to64(const uint128 x) {
  // Murmur-inspired hashing.
  const uint64 kMul = 0x9ddfea08eb382d69ULL;
  uint64 a = (Uint128Low64(x) ^ Uint128High64(x)) * kMul;
  a ^= (a >> 47);
  uint64 b = (Uint128High64(x) ^ a) * kMul;
  b ^= (b >> 47);
  b *= kMul;
  return b;
}

// Bitwise right rotate.  Normally this will compile to a single
// instruction, especially if the shift is a manifest constant.
static uint64 Rotate(uint64 val, int shift) {
  // Avoid shifting by 64: doing so yields an undefined result.
  return shift == 0 ? val : ((val >> shift) | (val << (64 - shift)));
}

// Equivalent to Rotate(), but requires the second arg to be non-zero.
// On x86-64, and probably others, it's possible for this to compile
// to a single instruction if both args are already in registers.
static uint64 RotateByAtLeast1(uint64 val, int shift) {
  return (val >> shift) | (val << (64 - shift));
}

/* Folds the high bits of val into its low bits. */
static uint64 ShiftMix(uint64 val) {
  return val ^ (val >> 47);
}

/* Hashes the pair (u, v) down to 64 bits via Hash128to64(). */
static uint64 HashLen16(uint64 u, uint64 v) {
  uint128 result;
  result.first = u;
  result.second = v;
  return Hash128to64(result);
}

/* Hashes inputs of 0 to 16 bytes; each length range reads a different
 * (possibly overlapping) set of words from the start and end of s. */
static uint64 HashLen0to16(const char* s, size_t len) {
  if (len > 8) {
    uint64 a = Fetch64(s);
    uint64 b = Fetch64(s + len - 8);
    return HashLen16(a, RotateByAtLeast1(b + len, (int)len)) ^ b;
  }
  if (len >= 4) {
    uint64 a = Fetch32(s);
    return HashLen16(len + (a << 3), Fetch32(s + len - 4));
  }
  if (len > 0) {
    uint8 a = (uint8)s[0];
    uint8 b = (uint8)s[len >> 1];
    uint8 c = (uint8)s[len - 1];
    uint32 y = (uint32)(a) + ((uint32)(b) << 8);
    uint32 z = (uint32)len + ((uint32)(c) << 2);
    return ShiftMix(y * k2 ^ z * k3) * k2;
  }
  return k2;
}

// This probably works well for 16-byte strings as well, but it may be overkill
// in that case.
static uint64 HashLen17to32(const char* s, size_t len) {
  uint64 a = Fetch64(s) * k1;
  uint64 b = Fetch64(s + 8);
  uint64 c = Fetch64(s + len - 8) * k2;
  uint64 d = Fetch64(s + len - 16) * k0;
  return HashLen16(Rotate(a - b, 43) + Rotate(c, 30) + d,
                   a + Rotate(b ^ k3, 20) - c + len);
}

// Return a 16-byte hash for 48 bytes.  Quick and dirty.
// Callers do best to use "random-looking" values for a and b.
// (C++ original: static pair WeakHashLen32WithSeeds(...))
uint128 WeakHashLen32WithSeeds6(uint64 w, uint64 x, uint64 y, uint64 z,
                                uint64 a, uint64 b) {
  a += w;
  b = Rotate(b + a + z, 21);
  uint64 c = a;
  a += x;
  a += y;
  b += Rotate(a, 44);
  uint128 result;
  result.first = (uint64)(a + z);
  result.second = (uint64)(b + c);
  return result;
}

// Return a 16-byte hash for s[0] ... s[31], a, and b. Quick and dirty.
// static pair WeakHashLen32WithSeeds( uint128 WeakHashLen32WithSeeds(const char* s, uint64 a, uint64 b) { return WeakHashLen32WithSeeds6(Fetch64(s), Fetch64(s + 8), Fetch64(s + 16), Fetch64(s + 24), a, b); } // Return an 8-byte hash for 33 to 64 bytes. static uint64 HashLen33to64(const char* s, size_t len) { uint64 z = Fetch64(s + 24); uint64 a = Fetch64(s) + (len + Fetch64(s + len - 16)) * k0; uint64 b = Rotate(a + z, 52); uint64 c = Rotate(a, 37); a += Fetch64(s + 8); c += Rotate(a, 7); a += Fetch64(s + 16); uint64 vf = a + z; uint64 vs = b + Rotate(a, 31) + c; a = Fetch64(s + 16) + Fetch64(s + len - 32); z = Fetch64(s + len - 8); b = Rotate(a + z, 52); c = Rotate(a, 37); a += Fetch64(s + len - 24); c += Rotate(a, 7); a += Fetch64(s + len - 16); uint64 wf = a + z; uint64 ws = b + Rotate(a, 31) + c; uint64 r = ShiftMix((vf + ws) * k2 + (wf + vs) * k0); return ShiftMix(r * k0 + vs) * k2; } uint64 CityHash64(const char* s, size_t len) { if (len <= 32) { if (len <= 16) { return HashLen0to16(s, len); } else { return HashLen17to32(s, len); } } else if (len <= 64) { return HashLen33to64(s, len); } // For strings over 64 bytes we hash the end first, and then as we // loop we keep 56 bytes of state: v, w, x, y, and z. uint64 x = Fetch64(s + len - 40); uint64 y = Fetch64(s + len - 16) + Fetch64(s + len - 56); uint64 z = HashLen16(Fetch64(s + len - 48) + len, Fetch64(s + len - 24)); uint64 temp; uint128 v = WeakHashLen32WithSeeds(s + len - 64, len, z); uint128 w = WeakHashLen32WithSeeds(s + len - 32, y + k1, x); x = x * k1 + Fetch64(s); // Decrease len to the nearest multiple of 64, and operate on 64-byte chunks. 
len = (len - 1) & ~(size_t)(63); do { x = Rotate(x + y + v.first + Fetch64(s + 8), 37) * k1; y = Rotate(y + v.second + Fetch64(s + 48), 42) * k1; x ^= w.second; y += v.first + Fetch64(s + 40); z = Rotate(z + w.first, 33) * k1; v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first); w = WeakHashLen32WithSeeds(s + 32, z + w.second, y + Fetch64(s + 16)); temp = z; z = x; x = temp; s += 64; len -= 64; } while (len != 0); return HashLen16(HashLen16(v.first, w.first) + ShiftMix(y) * k1 + z, HashLen16(v.second, w.second) + x); } uint64 CityHash64WithSeed(const char* s, size_t len, uint64 seed) { return CityHash64WithSeeds(s, len, k2, seed); } uint64 CityHash64WithSeeds(const char* s, size_t len, uint64 seed0, uint64 seed1) { return HashLen16(CityHash64(s, len) - seed0, seed1); } // A subroutine for CityHash128(). Returns a decent 128-bit hash for strings // of any length representable in signed long. Based on City and Murmur. static uint128 CityMurmur(const char* s, size_t len, uint128 seed) { uint64 a = Uint128Low64(seed); uint64 b = Uint128High64(seed); uint64 c = 0; uint64 d = 0; signed long l = (signed long)(len - 16); if (l <= 0) { // len <= 16 a = ShiftMix(a * k1) * k1; c = b * k1 + HashLen0to16(s, len); d = ShiftMix(a + (len >= 8 ? Fetch64(s) : c)); } else { // len > 16 c = HashLen16(Fetch64(s + len - 8) + k1, a); d = HashLen16(b + len, c + Fetch64(s + len - 16)); a += d; do { a ^= ShiftMix(Fetch64(s) * k1) * k1; a *= k1; b ^= a; c ^= ShiftMix(Fetch64(s + 8) * k1) * k1; c *= k1; d ^= c; s += 16; l -= 16; } while (l > 0); } a = HashLen16(a, c); b = HashLen16(d, b); uint128 result; result.first = (uint64)(a ^ b); result.second = (uint64)(HashLen16(b, a)); return result; } uint128 CityHash128WithSeed(const char* s, size_t len, uint128 seed) { if (len < 128) { return CityMurmur(s, len, seed); } // We expect len >= 128 to be the common case. Keep 56 bytes of state: // v, w, x, y, and z. 
uint128 v, w; uint64 x = Uint128Low64(seed); uint64 y = Uint128High64(seed); uint64 z = len * k1; uint64 temp; v.first = Rotate(y ^ k1, 49) * k1 + Fetch64(s); v.second = Rotate(v.first, 42) * k1 + Fetch64(s + 8); w.first = Rotate(y + z, 35) * k1 + x; w.second = Rotate(x + Fetch64(s + 88), 53) * k1; // This is the same inner loop as CityHash64(), manually unrolled. do { x = Rotate(x + y + v.first + Fetch64(s + 8), 37) * k1; y = Rotate(y + v.second + Fetch64(s + 48), 42) * k1; x ^= w.second; y += v.first + Fetch64(s + 40); z = Rotate(z + w.first, 33) * k1; v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first); w = WeakHashLen32WithSeeds(s + 32, z + w.second, y + Fetch64(s + 16)); temp = z; z = x; x = temp; s += 64; x = Rotate(x + y + v.first + Fetch64(s + 8), 37) * k1; y = Rotate(y + v.second + Fetch64(s + 48), 42) * k1; x ^= w.second; y += v.first + Fetch64(s + 40); z = Rotate(z + w.first, 33) * k1; v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first); w = WeakHashLen32WithSeeds(s + 32, z + w.second, y + Fetch64(s + 16)); temp = z; z = x; x = temp; s += 64; len -= 128; } while (LIKELY(len >= 128)); x += Rotate(v.first + z, 49) * k0; z += Rotate(w.first, 37) * k0; // If 0 < len < 128, hash up to 4 chunks of 32 bytes each from the end of s. size_t tail_done; for (tail_done = 0; tail_done < len;) { tail_done += 32; y = Rotate(x + y, 42) * k0 + v.second; w.first += Fetch64(s + len - tail_done + 16); x = x * k0 + w.first; z += w.second + Fetch64(s + len - tail_done); w.second += v.first; v = WeakHashLen32WithSeeds(s + len - tail_done, v.first + z, v.second); } // At this point our 56 bytes of state should contain more than // enough information for a strong 128-bit hash. We use two // different 56-byte-to-8-byte hashes to get a 16-byte final result. 
x = HashLen16(x, v.first); y = HashLen16(y + z, w.first); uint128 result; result.first = (uint64)(HashLen16(x + v.second, w.second) + y); result.second = (uint64)HashLen16(x + w.second, y + v.second); return result; } uint128 CityHash128(const char* s, size_t len) { uint128 r; if (len >= 16) { r.first = (uint64)(Fetch64(s) ^ k3); r.second = (uint64)(Fetch64(s + 8)); return CityHash128WithSeed(s + 16, len - 16, r); } else if (len >= 8) { r.first = (uint64)(Fetch64(s) ^ (len * k0)); r.second = (uint64)(Fetch64(s + len - 8) ^ k1); return CityHash128WithSeed(NULL, 0, r); } else { r.first = (uint64)k0; r.second = (uint64)k1; return CityHash128WithSeed(s, len, r); } } #ifdef __SSE4_2__ #include #include "citycrc.h" // Requires len >= 240. static void CityHashCrc256Long(const char* s, size_t len, uint32 seed, uint64* result) { uint64 a = Fetch64(s + 56) + k0; uint64 b = Fetch64(s + 96) + k0; uint64 c = result[0] = HashLen16(b, len); uint64 d = result[1] = Fetch64(s + 120) * k0 + len; uint64 e = Fetch64(s + 184) + seed; uint64 f = seed; uint64 g = 0; uint64 h = 0; uint64 i = 0; uint64 j = 0; uint64 t = c + d; // 240 bytes of input per iter. 
size_t iters = len / 240; len -= iters * 240; do { #define CHUNK(multiplier, z) \ { \ uint64 old_a = a; \ a = Rotate(b, 41 ^ z) * multiplier + Fetch64(s); \ b = Rotate(c, 27 ^ z) * multiplier + Fetch64(s + 8); \ c = Rotate(d, 41 ^ z) * multiplier + Fetch64(s + 16); \ d = Rotate(e, 33 ^ z) * multiplier + Fetch64(s + 24); \ e = Rotate(t, 25 ^ z) * multiplier + Fetch64(s + 32); \ t = old_a; \ } \ f = _mm_crc32_u64(f, a); \ g = _mm_crc32_u64(g, b); \ h = _mm_crc32_u64(h, c); \ i = _mm_crc32_u64(i, d); \ j = _mm_crc32_u64(j, e); \ s += 40 CHUNK(1, 1); CHUNK(k0, 0); CHUNK(1, 1); CHUNK(k0, 0); CHUNK(1, 1); CHUNK(k0, 0); } while (--iters > 0); while (len >= 40) { CHUNK(k0, 0); len -= 40; } if (len > 0) { s = s + len - 40; CHUNK(k0, 0); } j += i << 32; a = HashLen16(a, j); h += g << 32; b += h; c = HashLen16(c, f) + i; d = HashLen16(d, e + result[0]); j += e; i += HashLen16(h, t); e = HashLen16(a, d) + j; f = HashLen16(b, c) + a; g = HashLen16(j, i) + c; result[0] = e + f + g + h; a = ShiftMix((a + g) * k0) * k0 + b; result[1] += a + result[0]; a = ShiftMix(a * k0) * k0 + c; result[2] = a + result[1]; a = ShiftMix((a + e) * k0) * k0; result[3] = a + result[2]; } // Requires len < 240. 
static void CityHashCrc256Short(const char* s, size_t len, uint64* result) { char buf[240]; memcpy(buf, s, len); memset(buf + len, 0, 240 - len); CityHashCrc256Long(buf, 240, ~(uint32)(len), result); } void CityHashCrc256(const char* s, size_t len, uint64* result) { if (LIKELY(len >= 240)) { CityHashCrc256Long(s, len, 0, result); } else { CityHashCrc256Short(s, len, result); } } uint128 CityHashCrc128WithSeed(const char* s, size_t len, uint128 seed) { if (len <= 900) { return CityHash128WithSeed(s, len, seed); } else { uint64 result[4]; CityHashCrc256(s, len, result); uint64 u = Uint128High64(seed) + result[0]; uint64 v = Uint128Low64(seed) + result[1]; uint128 crc; crc.first = (uint64)(HashLen16(u, v + result[2])); crc.second = (uint64)(HashLen16(Rotate(v, 32), u * k0 + result[3])); return crc; } } uint128 CityHashCrc128(const char* s, size_t len) { if (len <= 900) { return CityHash128(s, len); } else { uint64 result[4]; CityHashCrc256(s, len, result); uint128 crc; crc.first = (uint64)result[2]; crc.second = (uint64)result[3]; return crc; } } #endif ================================================ FILE: src/mica-herd/herd.c ================================================ #include "hrd.h" /* Every thread creates a TCP connection to the registry only once. */ __thread memcached_st* memc = NULL; /* * Finds the port with rank `port_index` (0-based) in the list of ENABLED ports. * Fills its device id and device-local port id (1-based) into the supplied * control block. 
*/ char dev_name[50]; struct ibv_device* hrd_resolve_port_index(struct hrd_ud_ctrl_blk* cb, int port_index) { struct ibv_device** dev_list; int num_devices = 0; assert(port_index >= 0); cb->device_id = -1; cb->dev_port_id = -1; dev_list = ibv_get_device_list(&num_devices); CPE(!dev_list, "HRD: Failed to get IB devices list", 0); int ports_to_discover = port_index; int dev_i; for (dev_i = 0; dev_i < num_devices; dev_i++) { if (strcmp(dev_list[dev_i]->name, dev_name) != 0) continue; struct ibv_context* ctx = ibv_open_device(dev_list[dev_i]); CPE(!ctx, "HRD: Couldn't open device", 0); struct ibv_device_attr device_attr; memset(&device_attr, 0, sizeof(device_attr)); if (ibv_query_device(ctx, &device_attr)) { printf("HRD: Could not query device: %d\n", dev_i); exit(-1); } uint8_t port_i; for (port_i = 1; port_i <= device_attr.phys_port_cnt; port_i++) { /* Count this port only if it is enabled */ struct ibv_port_attr port_attr; if (ibv_query_port(ctx, port_i, &port_attr) != 0) { printf("HRD: Could not query port %d of device %d\n", port_i, dev_i); exit(-1); } if (port_attr.phys_state != IBV_PORT_ACTIVE && port_attr.phys_state != IBV_PORT_ACTIVE_DEFER) { #ifndef __cplusplus printf("HRD: Ignoring port %d of device %d. State is %s\n", port_i, dev_i, ibv_port_state_str(port_attr.phys_state)); #else printf("HRD: Ignoring port %d of device %d. 
State is %s\n", port_i, dev_i, ibv_port_state_str((ibv_port_state)port_attr.phys_state)); #endif continue; } if (ports_to_discover == 0) { // printf("HRD: port index %d resolved to device %d, port %d\n", // port_index, dev_i, port_i); /* Fill the device ID and device-local port ID */ cb->device_id = dev_i; cb->dev_port_id = port_i; if (ibv_close_device(ctx)) { fprintf(stderr, "HRD: Couldn't release context\n"); assert(false); } return dev_list[cb->device_id]; } ports_to_discover--; } if (ibv_close_device(ctx)) { fprintf(stderr, "HRD: Couldn't release context\n"); assert(false); } } /* If we come here, port resolution failed */ assert(cb->device_id == -1 && cb->dev_port_id == -1); fprintf(stderr, "HRD: Invalid port index %d. Exiting.\n", port_index); exit(-1); } /* Allocate SHM with @shm_key, and save the shmid into @shm_id_ret */ void* hrd_malloc_socket(int shm_key, uint64_t size, int socket_id) { int shmid; int shm_flag = IPC_CREAT | IPC_EXCL | 0666 | (USE_HUGE_PAGES == 1 ? SHM_HUGETLB : 0); shmid = shmget(shm_key, size, shm_flag); if (shmid == -1) { switch (errno) { case EACCES: colored_printf(RED, "HRD: SHM malloc error: Insufficient permissions." " (SHM key = %d)\n", shm_key); break; case EEXIST: colored_printf(RED, "HRD: SHM malloc error: Already exists." " (SHM key = %d)\n", shm_key); break; case EINVAL: colored_printf(RED, "HRD: SHM malloc error: SHMMAX/SHMIN mismatch." " (SHM key = %d, size = %lu)\n", shm_key, size); break; case ENOMEM: colored_printf(RED, "HRD: SHM malloc error: Insufficient memory." " (SHM key = %d, size = %lu)\n", shm_key, size); break; case ENOENT: colored_printf(RED, "HRD: SHM malloc error: No segment exists for the given " "key, and IPC_CREAT was not specified." " (SHM key = %d, size = %lu)\n", shm_key, size); break; case ENOSPC: colored_printf( RED, "HRD: SHM malloc error: All possible shared memory IDs have been " "taken or the limit of shared memory is exceeded." 
" (SHM key = %d, size = %lu)\n", shm_key, size); break; case EPERM: colored_printf(RED, "HRD: SHM malloc error: The SHM_HUGETLB flag was " "specified, but the caller was not privileged" " (SHM key = %d, size = %lu)\n", shm_key, size); break; case ENFILE: colored_printf(RED, "HRD: SHM malloc error: The system-wide limit on the " "total number of open files has been reached." " (SHM key = %d, size = %lu)\n", shm_key, size); break; default: colored_printf(RED, "HRD: SHM malloc error: A wild SHM error: %s.\n", strerror(errno)); break; } assert(false); } void* buf = shmat(shmid, NULL, 0); if (buf == NULL) { printf("HRD: SHM malloc error: shmat() failed for key %d\n", shm_key); exit(-1); } /* Bind the buffer to this socket */ const unsigned long nodemask = (1 << socket_id); int ret = mbind(buf, size, MPOL_BIND, &nodemask, 32, 0); if (ret != 0) { printf("HRD: SHM malloc error. mbind() failed for key %d\n", shm_key); exit(-1); } // vasilis- try to take advantage of TLB coalescing, if it is there if (LEVERAGE_TLB_COALESCING) { uint64_t page_no = CEILING(size, HUGE_PAGE_SIZE) - 1; for (uint64_t i = 0; i < page_no; i++) { uint8_t* buf_ptr = ((uint8_t*)buf) + (i * HUGE_PAGE_SIZE); memset(buf_ptr, 0, 1); } } return buf; } /* Free shm @shm_key and @shm_buf. Return 0 on success, else -1. */ int hrd_free(int shm_key, void* shm_buf) { int ret; int shmid = shmget(shm_key, 0, 0); if (shmid == -1) { switch (errno) { case EACCES: printf( "HRD: SHM free error: Insufficient permissions." " (SHM key = %d)\n", shm_key); break; case ENOENT: printf("HRD: SHM free error: No such SHM key. 
(SHM key = %d)\n", shm_key); break; default: printf("HRD: SHM free error: A wild SHM error: %s\n", strerror(errno)); break; } return -1; } ret = shmctl(shmid, IPC_RMID, NULL); if (ret != 0) { printf("HRD: SHM free error: shmctl() failed for key %d\n", shm_key); exit(-1); } ret = shmdt(shm_buf); if (ret != 0) { printf("HRD: SHM free error: shmdt() failed for key %d\n", shm_key); exit(-1); } return 0; } /* Get the LID of a port on the device specified by @ctx */ uint16_t hrd_get_local_lid(struct ibv_context* ctx, int dev_port_id) { assert(ctx != NULL && dev_port_id >= 1); struct ibv_port_attr attr; if (ibv_query_port(ctx, dev_port_id, &attr)) { printf("HRD: ibv_query_port on port %d of device %s failed! Exiting.\n", dev_port_id, ibv_get_device_name(ctx->device)); assert(false); } return attr.lid; } /* Return the environment variable @name if it is set. Exit if not. */ char* hrd_getenv(const char* name) { char* env = getenv(name); if (env == NULL) { fprintf(stderr, "Environment variable %s not set\n", name); assert(false); } return env; } memcached_st* hrd_create_memc() { memcached_server_st* servers = NULL; memcached_st* memc = memcached_create(NULL); memcached_return rc; memc = memcached_create(NULL); char* registry_ip = hrd_getenv("HRD_REGISTRY_IP"); // printf("Appending server with IP: %s \n", registry_ip); servers = memcached_server_list_append(servers, registry_ip, MEMCACHED_DEFAULT_PORT, &rc); // Pushes an array of memcached_server_st into the memcached_st structure. // These servers will be placed at the end. rc = memcached_server_push(memc, servers); CPE(rc != MEMCACHED_SUCCESS, "Couldn't add memcached server.\n", -1); return memc; } /* * Insert key -> value mapping into memcached running at HRD_REGISTRY_IP. 
*/ void hrd_publish(const char* key, void* value, int len) { assert(key != NULL && value != NULL && len > 0); memcached_return rc; if (memc == NULL) { memc = hrd_create_memc(); } rc = memcached_set(memc, key, strlen(key), (const char*)value, len, (time_t)0, (uint32_t)0); if (rc != MEMCACHED_SUCCESS) { char* registry_ip = hrd_getenv("HRD_REGISTRY_IP"); fprintf(stderr, "\tHRD: Failed to publish key %s to memcached. Error %s. " "Reg IP = %s\n", key, memcached_strerror(memc, rc), registry_ip); exit(-1); } } /* * Get the value associated with "key" into "value", and return the length * of the value. If the key is not found, return NULL and len -1. For all * other errors, terminate. * * This function sometimes gets called in a polling loop - ensure that there * are no memory leaks or unterminated memcached connections! We don't need * to free() the resul of getenv() since it points to a string in the process * environment. */ int hrd_get_published(const char* key, void** value) { assert(key != NULL); if (memc == NULL) { memc = hrd_create_memc(); } memcached_return rc; size_t value_length; uint32_t flags; *value = memcached_get(memc, key, strlen(key), &value_length, &flags, &rc); if (rc == MEMCACHED_SUCCESS) { return (int)value_length; } else if (rc == MEMCACHED_NOTFOUND) { assert(*value == NULL); return -1; } else { char* registry_ip = hrd_getenv("HRD_REGISTRY_IP"); // char *registry_ip = is_client == 1 ? remote_IP : local_IP; fprintf(stderr, "HRD: Error finding value for key \"%s\": %s. " "Reg IP = %s\n", key, memcached_strerror(memc, rc), registry_ip); exit(-1); } /* Never reached */ assert(false); } /* * If @prealloc_conn_buf != NULL, @conn_buf_size is the size of the preallocated * buffer. If @prealloc_conn_buf == NULL, @conn_buf_size is the size of the * new buffer to create. 
*/ struct hrd_ud_ctrl_blk* hrd_ud_ctrl_blk_init(int local_hid, int port_index, int numa_node_id, /* -1 means don't use hugepages */ int num_dgram_qps, int dgram_buf_size, int dgram_buf_shm_key, int* recv_q_depth, int* send_q_depth) { // colored_printf(RED,"HRD: creating control block %d: port %d, socket %d, " // "conn qps %d, UC %d, conn buf %d bytes (key %d), " // "dgram qps %d, dgram buf %d bytes (key %d)\n", // local_hid, port_index, numa_node_id, // num_conn_qps, use_uc, conn_buf_size, conn_buf_shm_key, // num_dgram_qps, dgram_buf_size, dgram_buf_shm_key); /* * Check arguments for sanity. * @local_hid can be anything: it's just control block identifier that is * useful in printing debug info. */ assert(port_index >= 0 && port_index <= 16); assert(numa_node_id >= -1 && numa_node_id <= 8); assert(num_dgram_qps >= 0 && num_dgram_qps <= M_2); assert(dgram_buf_size >= 0 && dgram_buf_size <= M_1024); if (num_dgram_qps == 0) { colored_printf(RED, "HRD: Control block initialization without QPs. Are you" " sure you want to do this?\n"); assert(false); } struct hrd_ud_ctrl_blk* cb = (struct hrd_ud_ctrl_blk*)malloc(sizeof(struct hrd_ud_ctrl_blk)); memset(cb, 0, sizeof(struct hrd_ud_ctrl_blk)); /* Fill in the control block */ cb->local_hid = local_hid; cb->numa_node_id = numa_node_id; cb->num_dgram_qps = num_dgram_qps; cb->dgram_buf_shm_key = dgram_buf_shm_key; cb->recv_q_depth = recv_q_depth; cb->send_q_depth = send_q_depth; /* Get the device to use. 
This fills in cb->device_id and cb->dev_port_id */ struct ibv_device* ib_dev = hrd_resolve_port_index(cb, port_index); CPE(!ib_dev, "HRD: IB device not found", 0); /* Use a single device context and PD for all QPs */ cb->ctx = ibv_open_device(ib_dev); CPE(!cb->ctx, "HRD: Couldn't get context", 0); cb->pd = ibv_alloc_pd(cb->ctx); CPE(!cb->pd, "HRD: Couldn't allocate PD", 0); int ib_flags = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_ATOMIC; /* * Create datagram QPs and transition them RTS. * Create and register datagram RDMA buffer. */ if (num_dgram_qps >= 1) { cb->dgram_qp = (struct ibv_qp**)malloc(num_dgram_qps * sizeof(struct ibv_qp*)); cb->dgram_send_cq = (struct ibv_cq**)malloc(num_dgram_qps * sizeof(struct ibv_cq*)); cb->dgram_recv_cq = (struct ibv_cq**)malloc(num_dgram_qps * sizeof(struct ibv_cq*)); assert(cb->dgram_qp != NULL && cb->dgram_send_cq != NULL && cb->dgram_recv_cq != NULL); hrd_create_dgram_qps(cb); /* Create and register dgram_buf */ int reg_size = 0; if (numa_node_id >= 0) { /* Hugepages */ while (reg_size < dgram_buf_size) { reg_size += M_2; } /* SHM key 0 is hard to free later */ assert(dgram_buf_shm_key >= 1 && dgram_buf_shm_key <= 128); cb->dgram_buf = (volatile uint8_t*)hrd_malloc_socket( dgram_buf_shm_key, reg_size, numa_node_id); } else { reg_size = dgram_buf_size; cb->dgram_buf = (volatile uint8_t*)memalign(4096, reg_size); } assert(cb->dgram_buf != NULL); memset((char*)cb->dgram_buf, 0, reg_size); cb->dgram_buf_mr = ibv_reg_mr(cb->pd, (char*)cb->dgram_buf, reg_size, ib_flags); assert(cb->dgram_buf_mr != NULL); } return cb; } /* Free up the resources taken by @cb. Return -1 if something fails, else 0. */ int hrd_ud_ctrl_blk_destroy(struct hrd_ud_ctrl_blk* cb) { int i; colored_printf(RED, "HRD: Destroying control block %d\n", cb->local_hid); /* Destroy QPs and CQs. QPs must be destroyed before CQs. 
*/ for (i = 0; i < cb->num_dgram_qps; i++) { assert(cb->dgram_send_cq[i] != NULL && cb->dgram_recv_cq[i] != NULL); assert(cb->dgram_qp[i] != NULL); if (ibv_destroy_qp(cb->dgram_qp[i])) { fprintf(stderr, "HRD: Couldn't destroy dgram QP %d\n", i); return -1; } else assert(0); } /* Destroy memory regions */ if (cb->num_dgram_qps > 0) { assert(cb->dgram_buf_mr != NULL && cb->dgram_buf != NULL); if (ibv_dereg_mr(cb->dgram_buf_mr)) { fprintf(stderr, "HRD: Couldn't deregister dgram MR for cb %d\n", cb->local_hid); return -1; } if (cb->numa_node_id >= 0) { if (hrd_free(cb->dgram_buf_shm_key, (void*)cb->dgram_buf)) { fprintf(stderr, "HRD: Error freeing dgram hugepages for cb %d\n", cb->local_hid); } } else { free((void*)cb->dgram_buf); } } /* Destroy protection domain */ if (ibv_dealloc_pd(cb->pd)) { fprintf(stderr, "HRD: Couldn't dealloc PD for cb %d\n", cb->local_hid); return -1; } /* Destroy device context */ if (ibv_close_device(cb->ctx)) { fprintf(stderr, "Couldn't release context for cb %d\n", cb->local_hid); return -1; } colored_printf(RED, "HRD: Control block %d destroyed.\n", cb->local_hid); return 0; } /* Create datagram QPs and transition them to RTS */ void hrd_create_dgram_qps(struct hrd_ud_ctrl_blk* cb) { int i; assert(cb->dgram_qp != NULL && cb->dgram_send_cq != NULL && cb->dgram_recv_cq != NULL && cb->pd != NULL && cb->ctx != NULL); assert(cb->num_dgram_qps >= 1 && cb->dev_port_id >= 1); for (i = 0; i < cb->num_dgram_qps; i++) { cb->dgram_send_cq[i] = ibv_create_cq(cb->ctx, cb->send_q_depth[i], NULL, NULL, 0); assert(cb->dgram_send_cq[i] != NULL); // I am replacing the recv_cq Depth // cb->dgram_recv_cq[i] = ibv_create_cq(cb->ctx, // HRD_Q_DEPTH, NULL, NULL, 0); cb->dgram_recv_cq[i] = ibv_create_cq(cb->ctx, cb->recv_q_depth[i], NULL, NULL, 0); assert(cb->dgram_recv_cq[i] != NULL); // /* Initialize creation attributes */ struct ibv_qp_init_attr create_attr; memset((void*)&create_attr, 0, sizeof(struct ibv_qp_init_attr)); // if (i > 0) printf("The recv queue 
%d has size %d, the send queue has size // %d\n", i, cb->recv_q_depth[i], cb->send_q_depth[i] ); create_attr.send_cq = cb->dgram_send_cq[i]; create_attr.recv_cq = cb->dgram_recv_cq[i]; create_attr.qp_type = IBV_QPT_UD; create_attr.cap.max_send_wr = cb->send_q_depth[i]; // // printf("Receive q depth %d\n", cb->recv_q_depth); create_attr.cap.max_recv_wr = cb->recv_q_depth[i]; // create_attr.cap.max_send_sge = 1; create_attr.cap.max_recv_sge = 1; create_attr.cap.max_inline_data = HRD_MAX_INLINE; cb->dgram_qp[i] = ibv_create_qp(cb->pd, &create_attr); assert(cb->dgram_qp[i] != NULL); /* INIT state */ struct ibv_qp_attr init_attr; memset((void*)&init_attr, 0, sizeof(struct ibv_qp_attr)); init_attr.qp_state = IBV_QPS_INIT; init_attr.pkey_index = 0; init_attr.port_num = cb->dev_port_id; init_attr.qkey = HRD_DEFAULT_QKEY; if (ibv_modify_qp( cb->dgram_qp[i], &init_attr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_QKEY)) { fprintf(stderr, "Failed to modify dgram QP to INIT\n"); return; } /* RTR state */ struct ibv_qp_attr rtr_attr; memset((void*)&rtr_attr, 0, sizeof(struct ibv_qp_attr)); rtr_attr.qp_state = IBV_QPS_RTR; if (ibv_modify_qp(cb->dgram_qp[i], &rtr_attr, IBV_QP_STATE)) { fprintf(stderr, "Failed to modify dgram QP to RTR\n"); exit(-1); } /* Reuse rtr_attr for RTS */ rtr_attr.qp_state = IBV_QPS_RTS; rtr_attr.sq_psn = HRD_DEFAULT_PSN; if (ibv_modify_qp(cb->dgram_qp[i], &rtr_attr, IBV_QP_STATE | IBV_QP_SQ_PSN)) { fprintf(stderr, "Failed to modify dgram QP to RTS\n"); exit(-1); } } } void hrd_publish_dgram_qp(struct hrd_ud_ctrl_blk* cb, int n, const char* qp_name, uint8_t sl) { assert(cb != NULL); assert(n >= 0 && n < cb->num_dgram_qps); assert(qp_name != NULL && strlen(qp_name) < HRD_QP_NAME_SIZE - 1); assert(strstr(qp_name, HRD_RESERVED_NAME_PREFIX) == NULL); int len = strlen(qp_name); int i; for (i = 0; i < len; i++) { if (qp_name[i] == ' ') { fprintf(stderr, "HRD: Space not allowed in QP name\n"); exit(-1); } } struct hrd_qp_attr qp_attr; 
memcpy(qp_attr.name, qp_name, len); qp_attr.name[len] = 0; /* Add the null terminator */ qp_attr.lid = hrd_get_local_lid(cb->dgram_qp[n]->context, cb->dev_port_id); qp_attr.qpn = cb->dgram_qp[n]->qp_num; qp_attr.sl = sl; // ---ROCE---------- if (is_roce == 1) { union ibv_gid ret_gid; ibv_query_gid(cb->ctx, IB_PHYS_PORT, 0, &ret_gid); qp_attr.gid_global_interface_id = ret_gid.global.interface_id; qp_attr.gid_global_subnet_prefix = ret_gid.global.subnet_prefix; } // printf("Publishing datagram qp with name %s \n", qp_attr.name); // hrd_publish(qp_attr.name, &qp_attr, sizeof(struct hrd_qp_attr)); } struct hrd_qp_attr* hrd_get_published_qp(const char* qp_name) { struct hrd_qp_attr* ret; assert(qp_name != NULL && strlen(qp_name) < HRD_QP_NAME_SIZE - 1); assert(strstr(qp_name, HRD_RESERVED_NAME_PREFIX) == NULL); int len = strlen(qp_name); int i; for (i = 0; i < len; i++) { if (qp_name[i] == ' ') { fprintf(stderr, "HRD: Space not allowed in QP name\n"); exit(-1); } } int ret_len = hrd_get_published(qp_name, (void**)&ret); /* * The registry lookup returns only if we get a unique QP for @qp_name, or * if the memcached lookup succeeds but we don't have an entry for @qp_name. */ assert(ret_len == sizeof(struct hrd_qp_attr) || ret_len == -1); return ret; } ////////////////////////// /// Fun-c-print ////////////////////////// /* Like printf, but colorfur. Limited to 1000 characters. */ void colored_printf(color_print_t color, const char* format, ...) 
{ #define RED_LIM 1000 va_list args; int i; char buf1[RED_LIM], buf2[RED_LIM]; memset(buf1, 0, RED_LIM); memset(buf2, 0, RED_LIM); va_start(args, format); /* Marshal the stuff to print in a buffer */ vsnprintf(buf1, RED_LIM, format, args); /* Probably a bad check for buffer overflow */ for (i = RED_LIM - 1; i >= RED_LIM - 50; i--) { assert(buf1[i] == 0); } /* Add markers for red color and reset color */ // snprintf(buf2, 1000, "\033[31m%s\033[0m", buf1); snprintf(buf2, 1000, "\033[31m%s\033[0m", buf1); switch (color) { case YELLOW: snprintf(buf2, 1000, "\033[33m%s\033[0m", buf1); break; case RED: snprintf(buf2, 1000, "\033[31m%s\033[0m", buf1); break; case GREEN: snprintf(buf2, 1000, "\033[32m%s\033[0m", buf1); break; case CYAN: snprintf(buf2, 1000, "\033[36m%s\033[0m", buf1); break; default: printf("Wrong printf color /%d \n", color); assert(false); } /* Probably another bad check for buffer overflow */ for (i = RED_LIM - 1; i >= RED_LIM - 50; i--) { assert(buf2[i] == 0); } printf("%s", buf2); va_end(args); } ================================================ FILE: src/mica-herd/mica.c ================================================ #include "mica.h" #include "hrd.h" int is_power_of_2(int x) { return (x == 1 || x == 2 || x == 4 || x == 8 || x == 16 || x == 32 || x == 64 || x == 128 || x == 256 || x == 512 || x == 1024 || x == 2048 || x == 4096 || x == 8192 || x == 16384 || x == 32768 || x == 65536 || x == 131072 || x == 262144 || x == 524288 || x == 1048576 || x == 2097152 || x == 4194304 || x == 8388608 || x == 16777216 || x == 33554432 || x == 67108864 || x == 134217728 || x == 268435456 || x == 536870912 || x == 1073741824); } void mica_init(struct mica_kv* kv, int instance_id, int node_id, int num_bkts, uint64_t log_cap) { int i, j; /* Verify struct sizes */ assert(sizeof(struct mica_slot) == 8); assert(sizeof(struct mica_key) == 16); assert(sizeof(struct mica_op) % 64 == 0); assert(kv != NULL); assert(node_id == 0 || node_id == 1); /* 16 million buckets = a 1 
GB index */ assert(is_power_of_2(num_bkts) == 1 && num_bkts <= M_128); // assert(log_cap > 0 && log_cap <= M_1024 && // log_cap % M_2 == 0 && is_power_of_2(log_cap)); assert(MICA_LOG_BITS >= 24); /* Minimum log size = 16 MB */ // red_printf("mica-herd-herd: Initializing MICA instance %d.\n" // "NUMA node = %d, buckets = %d (size = %u B), log capacity = %d B.\n", // instance_id, // node_id, num_bkts, num_bkts * sizeof(struct mica_bkt), log_cap); if (MICA_DEBUG != 0) { printf( "mica-herd-herd: Debug mode is ON! This might reduce performance.\n"); sleep(2); } /* Initialize metadata and stats */ kv->instance_id = instance_id; kv->num_bkts = num_bkts; kv->bkt_mask = num_bkts - 1; /* num_bkts is power of 2 */ kv->log_cap = log_cap; kv->log_mask = log_cap - 1; /* log_cap is a power of 2 */ kv->log_head = 0; kv->num_insert_op = 0; kv->num_index_evictions = 0; /* Alloc index and initialize all entries to invalid */ // printf("mica-herd-herd: Allocting hash table index for instance %d\n", // instance_id); int ht_index_key = MICA_INDEX_SHM_KEY + instance_id; kv->ht_index = (struct mica_bkt*)hrd_malloc_socket( ht_index_key, num_bkts * sizeof(struct mica_bkt), node_id); for (i = 0; i < num_bkts; i++) { for (j = 0; j < 8; j++) { kv->ht_index[i].slots[j].in_use = 0; } } /* Alloc log */ // printf("mica-herd-herd: Allocting hash table log for instance %d\n", // instance_id); int ht_log_key = MICA_LOG_SHM_KEY + instance_id; kv->ht_log = (uint8_t*)hrd_malloc_socket(ht_log_key, log_cap, node_id); } void mica_insert_one(struct mica_kv* kv, struct mica_op* op, struct mica_resp* resp) { #if MICA_DEBUG == 1 assert(kv != NULL); assert(op != NULL); assert(op->opcode == MICA_OP_PUT); assert(op->val_len > 0 && op->val_len <= MICA_MAX_VALUE); assert(resp != NULL); #endif int i; unsigned int bkt = op->key.bkt & kv->bkt_mask; struct mica_bkt* bkt_ptr = &kv->ht_index[bkt]; unsigned int tag = op->key.tag; #if MICA_DEBUG == 2 mica_print_op(op); #endif kv->num_insert_op++; /* Find a slot to use for 
this key. If there is a slot with the same * tag as ours, we are sure to find it because the used slots are at * the beginning of the 8-slot array. */ int slot_to_use = -1; for (i = 0; i < 8; i++) { if (bkt_ptr->slots[i].tag == tag || bkt_ptr->slots[i].in_use == 0) { slot_to_use = i; } } /* If no slot found, choose one to evict */ if (slot_to_use == -1) { slot_to_use = tag & 7; /* tag is ~ randomly distributed */ kv->num_index_evictions++; } /* Encode the empty slot */ bkt_ptr->slots[slot_to_use].in_use = 1; bkt_ptr->slots[slot_to_use].offset = kv->log_head; /* Virtual head */ bkt_ptr->slots[slot_to_use].tag = tag; /* Paste the key-value into the log */ uint8_t* log_ptr = &kv->ht_log[kv->log_head & kv->log_mask]; /* Data copied: key, opcode, val_len, value */ int len_to_copy = sizeof(struct mica_key) + sizeof(uint8_t) + sizeof(uint8_t) + KVS_VALUE_SIZE; /// op->val_len; /* Ensure that we don't wrap around in the *virtual* log space even * after 8-byte alignment below.*/ assert((1ULL << MICA_LOG_BITS) - kv->log_head > len_to_copy + 8); memcpy(log_ptr, op, len_to_copy); kv->log_head += len_to_copy; /* Ensure that the key field of each log entry is 8-byte aligned. This * makes subsequent comparisons during GETs faster. */ kv->log_head = (kv->log_head + 7) & ~7; /* If we're close to overflowing in the physical log, wrap around to * the beginning, but go forward in the virtual log. */ if (unlikely(kv->log_cap - kv->log_head <= MICA_MAX_VALUE + 32)) { kv->log_head = (kv->log_head + kv->log_cap) & ~kv->log_mask; colored_printf( RED, "mica-herd-herd: Instance %d wrapping around. 
Wraps = %llu\n", kv->instance_id, kv->log_head / kv->log_cap); } } /* A fast deterministic way to generate @n ~randomly distributed 16-byte keys */ uint128* mica_gen_keys(int n) { int i; assert(n > 0 && n <= M_1024 / sizeof(uint128)); assert(sizeof(uint128) == 16); // printf("mica-herd-herd: Generating %d keys\n", n); uint128* key_arr = malloc(n * sizeof(uint128)); assert(key_arr != NULL); for (i = 0; i < n; i++) { key_arr[i] = CityHash128((char*)&i, 4); } return key_arr; } ================================================ FILE: src/wings/wings.c ================================================ // // Created by akatsarakis on 22/01/19. // #include "../../include/wings/wings.h" #include #include #include #include #include // implement a Multicast / Unicast channel // Support for: // mulitcast / unicast channel // Coalescing // Variable size msgs? // Selective Signaling // Batching to the NIC // Inlining or not // Batch post receives to the NIC // Mode 1: poll reqs, copy incoming msgs to local buffers and // (p)re-post recvs Mode 2: poll reqs, do not copy msgs and post rcvs // when said // Enable implicit (request - response mode) and explicit (batched) credits // flow control void _wings_setup_send_wr_and_sgl(ud_channel_t* ud_c); void _wings_setup_recv_wr_and_sgl(ud_channel_t* ud_c, struct hrd_ud_ctrl_blk* cb); void _wings_setup_crd_wr_and_sgl(ud_channel_t* ud_c, struct hrd_ud_ctrl_blk* cb); void _wings_setup_incoming_buff_and_post_initial_recvs(ud_channel_t* ud_c); void _wings_ud_channel_init_recv(ud_channel_t* ud_c, struct hrd_ud_ctrl_blk* cb, uint8_t qp_id, volatile uint8_t* incoming_reqs_ptr); void _wings_ud_channel_crd_init(ud_channel_t* ud_c, char* qp_name, ud_channel_t* linked_channel, uint16_t crds_per_channel, uint16_t num_channels, uint8_t channel_id, uint8_t enable_stats, uint8_t enable_prints); void _wings_print_on_off_toggle(uint16_t bin_flag, char* str); void _wings_share_qp_info_via_memcached(ud_channel_t** ud_c_array, uint16_t ud_c_num, dbit_vector_t* 
shared_rdy_var, int worker_lid, struct hrd_ud_ctrl_blk* cb); void wings_ud_channel_destroy(ud_channel_t* ud_c) { free(ud_c->qp_name); free(ud_c->recv_wc); free(ud_c->remote_qps); free(ud_c->credits_per_channels); if (ud_c->send_pkt_buff != NULL) free(ud_c->send_pkt_buff); if (ud_c->type != CRD && ud_c->max_coalescing > 1) free(ud_c->overflow_msg_buff); if (ud_c->type == CRD) free(ud_c->no_crds_to_send_per_endpoint); } void wings_ud_channel_init(ud_channel_t* ud_c, char* qp_name, enum channel_type type, uint8_t max_coalescing, uint16_t max_req_size, uint16_t small_req_size, uint8_t enable_inlining, uint8_t is_header_only, // Broadcast uint8_t is_bcast, // Credits uint8_t disable_crd_ctrl, uint8_t expl_crd_ctrl, ud_channel_t* linked_channel, uint16_t crds_per_channel, uint16_t num_channels, uint8_t channel_id, // Toggles uint8_t stats_on, uint8_t prints_on) { assert(type != CRD); // if CRD type then used the *_crd_init instead assert(max_coalescing > 0); // To disable coalescing use max_coalescing == 1 assert(channel_id < num_channels); assert(!(disable_crd_ctrl == 1 && expl_crd_ctrl == 1)); // cannot disable crd_ctrl and then set an // explicit credit control assert( disable_crd_ctrl == 1 || linked_channel != NULL); // cannot disable crd_ctrl and then set an crd control channel assert(is_bcast == 0 || is_header_only == 0); assert(small_req_size <= max_req_size); _wings_assert_binary(stats_on); _wings_assert_binary(is_bcast); _wings_assert_binary(prints_on); _wings_assert_binary(expl_crd_ctrl); _wings_assert_binary(is_header_only); _wings_assert_binary(enable_inlining); ud_c->is_header_only = is_header_only; if (ud_c->is_header_only) /// WARNING: hdr_only msgs have an additional 1st B indicating sender_id /// (which must not be taken into account on max_req_size) assert(max_req_size <= 3 * sizeof(uint8_t) && max_coalescing == 1); ud_c->type = type; ud_c->channel_id = channel_id; ud_c->num_channels = num_channels; // num_channels include our own channel 
ud_c->expl_crd_ctrl = expl_crd_ctrl; ud_c->disable_crd_ctrl = disable_crd_ctrl; ud_c->is_bcast_channel = is_bcast; ud_c->num_crds_per_channel = crds_per_channel; ud_c->channel_providing_crds = linked_channel; ud_c->qp_name = malloc(sizeof(char) * (strlen(qp_name) + 1)); // TODO make sure to destroy this when destroing a ud_c strcpy(ud_c->qp_name, qp_name); ud_c->enable_stats = stats_on; ud_c->enable_prints = prints_on; ud_c->max_coalescing = max_coalescing; ud_c->max_msg_size = (uint16_t)(max_req_size + (ud_c->is_header_only == 1 ? 1 : 0)); // hdr_only msgs have an additional // 1st B indicating sender_id ud_c->small_msg_size = small_req_size == 0 ? ud_c->max_msg_size : small_req_size; ud_c->no_crds_to_send_per_endpoint = NULL; // unused for type != CRD uint16_t remote_channels = (uint16_t)(num_channels - 1); ud_c->is_inlining_enabled = (uint8_t)(ud_c->is_header_only == 1 ? 1 : enable_inlining); if (ud_c->is_header_only == 0 && _wings_ud_send_max_pkt_size(ud_c) > WINGS_MAX_SUPPORTED_INLINING) { if (ud_c->is_inlining_enabled) printf( "Unfortunately, inlining for msgs sizes up to (%d) " "is higher than the supported (%d)\n", _wings_ud_send_max_pkt_size(ud_c), WINGS_MAX_SUPPORTED_INLINING); ud_c->is_inlining_enabled = 0; } ud_c->credits_per_channels = malloc(sizeof(uint16_t) * (num_channels)); for (int i = 0; i < num_channels; ++i) ud_c->credits_per_channels[i] = (uint16_t)(type == REQ && !disable_crd_ctrl ? crds_per_channel : 0); ud_c->max_pcie_bcast_batch = (uint16_t)WINGS_MIN(WINGS_MIN_PCIE_BCAST_BATCH + 1, crds_per_channel); // Warning! use min to avoid resetting the first req prior batching to the NIC // WARNING: todo check why we need to have MIN_PCIE_BCAST_BATCH + 1 instead of // just MIN_PCIE_BCAST_BATCH uint16_t max_msgs_in_pcie_bcast = (uint16_t)(ud_c->max_pcie_bcast_batch * remote_channels); // must be smaller than the q_depth ud_c->max_recv_wrs = (uint16_t)(crds_per_channel * remote_channels); ud_c->max_send_wrs = (uint16_t)(ud_c->is_bcast_channel ? 
max_msgs_in_pcie_bcast : crds_per_channel * remote_channels); ud_c->ss_granularity = ud_c->is_bcast_channel ? ud_c->max_pcie_bcast_batch : ud_c->max_send_wrs; ud_c->recv_q_depth = ud_c->max_recv_wrs; ud_c->send_q_depth = (uint16_t)(2 * ud_c->ss_granularity * (ud_c->is_bcast_channel ? remote_channels : 1)); ud_c->recv_wc = malloc(sizeof(struct ibv_wc) * ud_c->max_recv_wrs); ud_c->recv_pkt_buff_len = ud_c->max_recv_wrs; ud_c->send_pkt_buff_len = (uint16_t)(ud_c->max_send_wrs * (ud_c->is_inlining_enabled ? 1 : 2)); ud_c->send_pkt_buff = ud_c->is_header_only == 1 ? NULL : malloc(_wings_ud_send_max_pkt_size(ud_c) * ud_c->send_pkt_buff_len); ud_c->overflow_msg_buff = NULL; // Overflow on polling if (ud_c->max_coalescing > 1) { ud_c->num_overflow_msgs = 0; ud_c->enable_overflow_msgs = 1; ud_c->overflow_msg_buff = malloc((size_t)(ud_c->max_msg_size * (ud_c->max_coalescing - 1))); } else { ud_c->num_overflow_msgs = 0; ud_c->enable_overflow_msgs = 0; ud_c->overflow_msg_buff = NULL; } ud_c->send_push_ptr = 0; ud_c->recv_push_ptr = 0; ud_c->recv_pull_ptr = -1; ud_c->total_pkts_send = 0; ud_c->stats.ss_completions = 0; ud_c->stats.recv_total_pkts = 0; ud_c->stats.recv_total_msgs = 0; ud_c->stats.send_total_msgs = 0; ud_c->stats.send_total_pkts = 0; ud_c->stats.send_total_pcie_batches = 0; ud_c->stats.no_stalls_due_to_credits = 0; // Initialize the crd channel as well if (ud_c->expl_crd_ctrl) { char crd_qp_name[1000]; sprintf(crd_qp_name, "\033[1m\033[36mCRD\033[0m-%s", qp_name); _wings_ud_channel_crd_init(linked_channel, crd_qp_name, ud_c, crds_per_channel, num_channels, channel_id, stats_on, prints_on); } ud_c->remote_qps = malloc(sizeof(qp_info_t) * ud_c->num_channels); // The following are set by the *_init_recv function after the creation of // control block and QPs ud_c->qp = NULL; ud_c->pd = NULL; ud_c->qp_id = 0; ud_c->send_cq = NULL; // set by init_recv ud_c->recv_cq = NULL; // set by init_recv ud_c->recv_pkt_buff = NULL; ud_c->send_mem_region = NULL; // set by 
init_recv // _wings_setup_send_wr_and_sgl(ud_c); // _wings_setup_recv_wr_and_sgl(ud_c, cb); _wings_assert_binary(ud_c->is_header_only); assert(ud_c->max_pcie_bcast_batch <= crds_per_channel); assert(ud_c->is_header_only == 0 || ud_c->is_header_only); } void wings_setup_channel_qps_and_recvs_w_shm_key(ud_channel_t** ud_c_array, uint16_t ud_c_num, dbit_vector_t* shared_rdy_var, uint16_t worker_lid, uint16_t base_shm_key) { uint32_t dgram_buff_size = 0; int* send_q_depths = malloc(ud_c_num * sizeof(int)); int* recv_q_depths = malloc(ud_c_num * sizeof(int)); // Setup Q depths and buff size for incoming pkts for (int i = 0; i < ud_c_num; ++i) { send_q_depths[i] = ud_c_array[i]->send_q_depth; recv_q_depths[i] = ud_c_array[i]->recv_q_depth; dgram_buff_size += ud_c_array[i]->type == CRD || ud_c_array[i]->is_header_only == 1 ? 64 : _wings_ud_recv_max_pkt_size(ud_c_array[i]) * ud_c_array[i]->recv_q_depth; } struct hrd_ud_ctrl_blk* cb = hrd_ud_ctrl_blk_init(worker_lid, 0, -1, // local_hid, port_index, numa_node_id, ud_c_num, dgram_buff_size, // num_dgram_qps, dgram_buf_size base_shm_key + worker_lid, // key recv_q_depths, send_q_depths); // Depth of the dgram RECV, SEND Q for (uint8_t i = 0; i < ud_c_num; ++i) ud_c_array[i]->pd = cb->pd; _wings_share_qp_info_via_memcached(ud_c_array, ud_c_num, shared_rdy_var, worker_lid, cb); volatile uint8_t* incoming_reqs_ptr = cb->dgram_buf; for (uint8_t i = 0; i < ud_c_num; ++i) { // Init recv and setup wrs and sgls of ud_channel _wings_ud_channel_init_recv(ud_c_array[i], cb, (uint8_t)i, incoming_reqs_ptr); incoming_reqs_ptr += ud_c_array[i]->type == CRD || ud_c_array[i]->is_header_only == 1 ? 64 : _wings_ud_recv_max_pkt_size(ud_c_array[i]) * ud_c_array[i]->recv_q_depth; } free(send_q_depths); free(recv_q_depths); for (int i = 0; i < ud_c_num; ++i) if (ud_c_array[i]->type != CRD) _wings_assertions(ud_c_array[i]); sleep(1); /// Give some leeway to post receives, before start bcasting! 
} void wings_setup_channel_qps_and_recvs(ud_channel_t** ud_c_array, uint16_t ud_c_num, dbit_vector_t* shared_rdy_var, uint16_t worker_lid) { wings_setup_channel_qps_and_recvs_w_shm_key( ud_c_array, ud_c_num, shared_rdy_var, worker_lid, BASE_SHM_KEY); } void wings_print_ud_c_overview(ud_channel_t* ud_c) { printf("%s Channel[%d] %s(%d) --> %s\n", ud_c->is_bcast_channel ? "Bcast" : "Unicast", ud_c->channel_id, ud_c->qp_name, ud_c->qp_id, ud_c->type == REQ ? "REQ" : "RESP"); _wings_print_on_off_toggle(ud_c->is_inlining_enabled, "Inlining"); _wings_print_on_off_toggle(ud_c->max_coalescing, "Coalescing"); _wings_print_on_off_toggle(ud_c->max_pcie_bcast_batch, "Max PCIe batch"); printf("\t\tMax msg size: %dB\n", ud_c->max_msg_size); if (ud_c->type != CRD && !ud_c->is_header_only) printf("\t\tMax pkt size: send = %dB, recv = %dB\n", _wings_ud_send_max_pkt_size(ud_c), _wings_ud_recv_max_pkt_size(ud_c)); else printf( "\t\tMax pkt size: send = 4B (inlined_payload), recv = " "4B(inlined_payload)\n"); printf("\t\tSS granularity: %d\n", ud_c->ss_granularity); printf("\t\tNum remotes: %d\n", ud_c->num_channels - 1); if (ud_c->disable_crd_ctrl) printf("\t\tCredits: OFF \n"); else printf("\t\tCredits: %d (%s) --> %s (%d)\n", ud_c->num_crds_per_channel, ud_c->expl_crd_ctrl ? 
"Explicit" : "Implicit", ud_c->channel_providing_crds->qp_name, ud_c->channel_providing_crds->qp_id); printf("\t\tSend Q len: %d\n", ud_c->send_q_depth); printf("\t\tRecv Q len: %d\n", ud_c->recv_q_depth); printf("\t\tSend wr len: %d\n", ud_c->max_send_wrs); printf("\t\tRecv wr len: %d\n", ud_c->max_recv_wrs); printf("\t\tSend pkt len: %d\n", ud_c->send_pkt_buff_len); printf("\t\tRecv pkt len: %d\n", ud_c->recv_pkt_buff_len); _wings_print_on_off_toggle(ud_c->enable_stats, "Stats"); _wings_print_on_off_toggle(ud_c->enable_prints, "Prints"); } /* --------------------------------------------------------------------------- ----------------------------------- SETUPs ------------------------------------ ---------------------------------------------------------------------------*/ void _wings_print_on_off_toggle(uint16_t bin_flag, char* str) { if (bin_flag > 1) printf("\t\t%s : %s (%d)\n", str, "\033[1m\033[32mOn\033[0m", bin_flag); else printf("\t\t%s : %s\n", str, bin_flag ? "\033[1m\033[32mOn\033[0m" : "\033[31mOff\033[0m"); } void _wings_ud_channel_crd_init(ud_channel_t* ud_c, char* qp_name, ud_channel_t* linked_channel, uint16_t crds_per_channel, uint16_t num_channels, uint8_t channel_id, uint8_t enable_stats, uint8_t enable_prints) { assert(channel_id < num_channels); _wings_assert_binary(enable_stats); _wings_assert_binary(enable_prints); ud_c->type = CRD; ud_c->qp_name = malloc(sizeof(char) * (strlen(qp_name) + 1)); // TODO make sure to destroy this when destroing a crd_ud_c strcpy(ud_c->qp_name, qp_name); ud_c->channel_id = channel_id; ud_c->num_channels = num_channels; // num_channels include our own channel ud_c->expl_crd_ctrl = 1; ud_c->disable_crd_ctrl = 0; ud_c->is_bcast_channel = 0; ud_c->max_pcie_bcast_batch = 0; ud_c->num_crds_per_channel = crds_per_channel; ud_c->channel_providing_crds = linked_channel; ud_c->enable_stats = enable_stats; ud_c->enable_prints = enable_prints; static_assert(sizeof(wings_crd_t) <= 4, ""); // Credits are always send as 
inlined_payload <=4B ud_c->max_msg_size = 0; // non inlined_payload size ud_c->small_msg_size = 0; // non inlined_payload size ud_c->max_coalescing = 1; ud_c->no_crds_to_send_per_endpoint = malloc(sizeof(uint16_t) * num_channels); uint16_t remote_channels = (uint16_t)(num_channels - 1); ud_c->is_inlining_enabled = 1; ud_c->credits_per_channels = malloc(sizeof(uint16_t) * (num_channels)); for (int i = 0; i < num_channels; ++i) ud_c->credits_per_channels[i] = 0; ud_c->max_recv_wrs = crds_per_channel * remote_channels; ud_c->max_send_wrs = crds_per_channel * remote_channels; // TODO correct this ud_c->ss_granularity = ud_c->max_send_wrs; ud_c->recv_q_depth = ud_c->max_recv_wrs; ud_c->send_q_depth = (uint16_t)(2 * ud_c->ss_granularity); ud_c->recv_wc = malloc(sizeof(struct ibv_wc) * ud_c->max_recv_wrs); ud_c->recv_pkt_buff_len = ud_c->max_recv_wrs * ud_c->max_coalescing; // TODO: is this correct? ud_c->send_pkt_buff_len = ud_c->max_send_wrs; ud_c->send_pkt_buff = NULL; // malloc(_wings_ud_send_max_pkt_size(ud_c) * // ud_c->send_pkt_buff_len); ud_c->send_mem_region = NULL; ud_c->send_push_ptr = 0; ud_c->recv_push_ptr = 0; ud_c->recv_pull_ptr = -1; ud_c->total_pkts_send = 0; ud_c->stats.ss_completions = 0; ud_c->stats.recv_total_pkts = 0; ud_c->stats.recv_total_msgs = 0; ud_c->stats.send_total_msgs = 0; ud_c->stats.send_total_pkts = 0; ud_c->stats.send_total_pcie_batches = 0; ud_c->stats.no_stalls_due_to_credits = 0; ud_c->remote_qps = malloc(sizeof(qp_info_t) * ud_c->num_channels); // The following are set by the *_init_recv function after the creation of // control block and QPs ud_c->qp = NULL; ud_c->pd = NULL; ud_c->qp_id = 0; ud_c->send_cq = NULL; ud_c->recv_cq = NULL; ud_c->recv_pkt_buff = NULL; // _wings_setup_crd_wr_and_sgl(ud_c, cb); } void _wings_ud_channel_init_recv(ud_channel_t* ud_c, struct hrd_ud_ctrl_blk* cb, uint8_t qp_id, volatile uint8_t* incoming_reqs_ptr) { ud_c->qp_id = qp_id; ud_c->qp = cb->dgram_qp[qp_id]; ud_c->recv_pkt_buff = incoming_reqs_ptr; 
ud_c->send_cq = cb->dgram_send_cq[ud_c->qp_id]; ud_c->recv_cq = cb->dgram_recv_cq[ud_c->qp_id]; if (ud_c->type != CRD) { ud_c->send_mem_region = ud_c->is_inlining_enabled ? NULL : register_buffer( cb->pd, ud_c->send_pkt_buff, _wings_ud_send_max_pkt_size(ud_c) * ud_c->send_pkt_buff_len); _wings_setup_send_wr_and_sgl(ud_c); _wings_setup_recv_wr_and_sgl(ud_c, cb); } else _wings_setup_crd_wr_and_sgl(ud_c, cb); // post initial receivs /// WARNING try to avoid races of posting initial receives and sending msgs _wings_setup_incoming_buff_and_post_initial_recvs(ud_c); } void _wings_setup_crd_wr_and_sgl(ud_channel_t* ud_c, struct hrd_ud_ctrl_blk* cb) { assert(ud_c->type == CRD); // Credit Send WRs / sgl wings_crd_t crd_tmp; crd_tmp.crd_num = 0; crd_tmp.sender_id = (uint8_t)ud_c->channel_id; ud_c->send_sgl = malloc(sizeof(struct ibv_sge)); ud_c->send_sgl->length = 0; ud_c->send_wr = malloc(sizeof(struct ibv_send_wr) * ud_c->max_send_wrs); for (int i = 0; i < ud_c->max_send_wrs; ++i) { ud_c->send_wr[i].opcode = IBV_WR_SEND_WITH_IMM; ud_c->send_wr[i].num_sge = 0; ud_c->send_wr[i].sg_list = ud_c->send_sgl; ud_c->send_wr[i].wr.ud.remote_qkey = HRD_DEFAULT_QKEY; ud_c->send_wr[i].next = NULL; ud_c->send_wr[i].send_flags = IBV_SEND_INLINE; ud_c->send_wr[i].imm_data = 0; memcpy(&ud_c->send_wr[i].imm_data, &crd_tmp, sizeof(wings_crd_t)); } // Credit Recv WRs / sgl ud_c->recv_sgl = malloc(sizeof(struct ibv_sge)); ud_c->recv_sgl->length = 64; // TODO can we make this zero? 
ud_c->recv_sgl->lkey = cb->dgram_buf_mr->lkey; ud_c->recv_sgl->addr = (uint64_t)ud_c->recv_pkt_buff; ud_c->recv_wr = malloc(sizeof(struct ibv_recv_wr) * ud_c->max_recv_wrs); for (int i = 0; i < ud_c->max_recv_wrs; ++i) { ud_c->recv_wr[i].num_sge = 1; ud_c->recv_wr[i].sg_list = ud_c->recv_sgl; } } void _wings_setup_send_wr_and_sgl(ud_channel_t* ud_c) { assert(ud_c->type != CRD); wings_hdr_only_t hdr_only_tmp; hdr_only_tmp.sender_id = (uint8_t)ud_c->channel_id; memset(hdr_only_tmp.inlined_payload, 0, 3 * sizeof(uint8_t)); if (ud_c->is_bcast_channel) { // Send bcast WRs uint16_t remote_channels = (uint16_t)(ud_c->num_channels - 1); uint16_t max_msgs_in_pcie_batch = (uint16_t)(ud_c->max_pcie_bcast_batch * remote_channels); ud_c->send_wr = malloc(sizeof(struct ibv_send_wr) * max_msgs_in_pcie_batch); ud_c->send_sgl = malloc(sizeof(struct ibv_sge) * (ud_c->is_header_only == 1 ? 1 : ud_c->max_pcie_bcast_batch)); if (ud_c->is_header_only) ud_c->send_sgl->length = 0; else for (int i = 0; i < ud_c->max_pcie_bcast_batch; ++i) ud_c->send_sgl[i].length = _wings_ud_send_max_pkt_size(ud_c); for (int i = 0; i < max_msgs_in_pcie_batch; ++i) { int sgl_index = i / remote_channels; int i_mod_bcast = i % remote_channels; uint16_t rm_qp_id; if (i_mod_bcast < ud_c->channel_id) rm_qp_id = (uint16_t)i_mod_bcast; else rm_qp_id = (uint16_t)((i_mod_bcast + 1) % ud_c->num_channels); ud_c->send_wr[i].wr.ud.remote_qkey = HRD_DEFAULT_QKEY; ud_c->send_wr[i].wr.ud.ah = ud_c->remote_qps[rm_qp_id].ah; ud_c->send_wr[i].wr.ud.remote_qpn = ud_c->remote_qps[rm_qp_id].qpn; if (!ud_c->is_header_only) { ud_c->send_wr[i].num_sge = 1; ud_c->send_wr[i].opcode = IBV_WR_SEND; /// Attention!! 
there is no immediate here ud_c->send_wr[i].sg_list = &ud_c->send_sgl[sgl_index]; } else { ud_c->send_wr[i].next = NULL; ud_c->send_wr[i].imm_data = 0; ud_c->send_wr[i].num_sge = 0; ud_c->send_wr[i].sg_list = ud_c->send_sgl; ud_c->send_wr[i].opcode = IBV_WR_SEND_WITH_IMM; memcpy(&ud_c->send_wr[i].imm_data, &hdr_only_tmp, sizeof(wings_hdr_only_t)); } if (!ud_c->is_inlining_enabled) { ud_c->send_wr[i].send_flags = 0; ud_c->send_sgl[sgl_index].lkey = ud_c->send_mem_region->lkey; } else ud_c->send_wr[i].send_flags = IBV_SEND_INLINE; ud_c->send_wr[i].next = (i_mod_bcast == remote_channels - 1) ? NULL : &ud_c->send_wr[i + 1]; } } else { // Send unicast WRs ud_c->send_wr = malloc(sizeof(struct ibv_send_wr) * ud_c->max_send_wrs); ud_c->send_sgl = malloc(sizeof(struct ibv_sge) * (ud_c->is_header_only ? 1 : ud_c->max_send_wrs)); for (int i = 0; i < ud_c->max_send_wrs; ++i) { ud_c->send_wr[i].wr.ud.remote_qkey = HRD_DEFAULT_QKEY; if (!ud_c->is_header_only) { // ud_c->send_sgl[i].length = // sizeof(wings_pkt_t) + _wings_ud_recv_max_pkt_size(ud_c); ud_c->send_sgl[i].length = _wings_ud_send_max_pkt_size(ud_c); ud_c->send_wr[i].num_sge = 1; ud_c->send_wr[i].opcode = IBV_WR_SEND; /// Attention!! there is no immediate here ud_c->send_wr[i].sg_list = &ud_c->send_sgl[i]; } else { ud_c->send_sgl->length = 0; ud_c->send_wr[i].next = NULL; ud_c->send_wr[i].imm_data = 0; ud_c->send_wr[i].num_sge = 0; ud_c->send_wr[i].sg_list = ud_c->send_sgl; ud_c->send_wr[i].opcode = IBV_WR_SEND_WITH_IMM; memcpy(&ud_c->send_wr[i].imm_data, &hdr_only_tmp, sizeof(wings_hdr_only_t)); } if (!ud_c->is_inlining_enabled) { ud_c->send_wr[i].send_flags = 0; ud_c->send_sgl[i].lkey = ud_c->send_mem_region->lkey; } else ud_c->send_wr[i].send_flags = IBV_SEND_INLINE; } } } void _wings_setup_recv_wr_and_sgl(ud_channel_t* ud_c, struct hrd_ud_ctrl_blk* cb) { assert(ud_c->type != CRD); ud_c->recv_sgl = malloc(sizeof(struct ibv_sge) * (ud_c->is_header_only == 1 ? 
1 : ud_c->max_recv_wrs)); if (ud_c->is_header_only) { ud_c->recv_sgl->length = 64; // TODO can we make this zero? ud_c->recv_sgl->lkey = cb->dgram_buf_mr->lkey; ud_c->recv_sgl->addr = (uint64_t)ud_c->recv_pkt_buff; } ud_c->recv_wr = malloc(sizeof(struct ibv_recv_wr) * ud_c->max_recv_wrs); for (int i = 0; i < ud_c->max_recv_wrs; i++) { if (!ud_c->is_header_only) { ud_c->recv_sgl[i].lkey = cb->dgram_buf_mr->lkey; ud_c->recv_sgl[i].length = _wings_ud_recv_max_pkt_size(ud_c); } ud_c->recv_wr[i].num_sge = 1; ud_c->recv_wr[i].next = (i == ud_c->max_recv_wrs - 1) ? NULL : &ud_c->recv_wr[i + 1]; ud_c->recv_wr[i].sg_list = ud_c->is_header_only == 1 ? ud_c->recv_sgl : &ud_c->recv_sgl[i]; } } void _wings_setup_incoming_buff_and_post_initial_recvs(ud_channel_t* ud_c) { if (ud_c->is_header_only == 0 && ud_c->type != CRD) { // init recv buffs as empty (not need for CRD since CRD msgs are // --inlined_payload-- header-only) for (uint16_t i = 0; i < ud_c->send_pkt_buff_len; ++i) _wings_get_nth_pkt_ptr_from_send_buff(ud_c, i)->req_num = 0; for (uint16_t i = 0; i < ud_c->recv_pkt_buff_len; ++i) _wings_get_nth_pkt_ptr_from_recv_buff(ud_c, i)->pkt.req_num = 0; } if (WINGS_ENABLE_POST_RECV_PRINTS && ud_c->enable_prints) colored_printf(YELLOW, "vvv Post Initial Receives: %s %d\n", ud_c->qp_name, ud_c->max_recv_wrs); if (ud_c->is_header_only == 0 && ud_c->type != CRD) _wings_post_recvs(ud_c, ud_c->max_recv_wrs); else _wings_post_hdr_only_recvs(ud_c, ud_c->max_recv_wrs); } /* --------------------------------------------------------------------------- -------------------------------- QP Sharing ------------------------------- --------------------------------------------------------------------------- */ unsigned long _wings_simple_hash(unsigned char* str) { int c; unsigned long hash = 5381; while (c = *str++) hash = ((hash << 5) + hash) + c; // hash * 33 + c return hash; } void _wings_get_remote_qp(ud_channel_t* ud_c, uint8_t endpoint_id) { int ib_port_index = 0; int local_port_i = 
ib_port_index; char qp_global_name[HRD_QP_NAME_SIZE]; struct hrd_qp_attr* qp; //= malloc(sizeof(struct hrd_qp_attr*) * max_remote_channels); sprintf(qp_global_name, "%lu-%d", _wings_simple_hash((unsigned char*)ud_c->qp_name), endpoint_id); // Get the UD queue pair for the ith machine qp = NULL; // yellow_printf("Looking for %s\n", qp_global_name); while (qp == NULL) { qp = hrd_get_published_qp(qp_global_name); if (qp == NULL) usleep(200000); } // green_printf("Found %s\n", qp_global_name); struct ibv_ah_attr ah_attr = { //-----INFINIBAND---------- .is_global = 0, .dlid = (uint16_t)qp->lid, .sl = (uint8_t)qp->sl, .src_path_bits = 0, /* port_num (> 1): device-local port for responses to this worker */ .port_num = (uint8_t)(local_port_i + 1), }; if (is_roce == 1) { //-----RoCE---------- ah_attr.is_global = 1; ah_attr.dlid = 0; ah_attr.grh.dgid.global.interface_id = qp->gid_global_interface_id; ah_attr.grh.dgid.global.subnet_prefix = qp->gid_global_subnet_prefix; ah_attr.grh.sgid_index = 0; ah_attr.grh.hop_limit = 1; } ud_c->remote_qps[endpoint_id].qpn = (uint32_t)qp->qpn; ud_c->remote_qps[endpoint_id].ah = ibv_create_ah(ud_c->pd, &ah_attr); assert(ud_c->remote_qps[endpoint_id].ah != NULL); } void _wings_get_remote_qps(ud_channel_t** ud_c_array, uint16_t ud_c_num) { uint16_t max_remote_channels = 0; for (int i = 0; i < ud_c_num; ++i) if (ud_c_array[i]->num_channels > max_remote_channels) max_remote_channels = ud_c_array[i]->num_channels; for (int i = 0; i < ud_c_num; ++i) for (int j = 0; j < ud_c_array[i]->num_channels; ++j) { if (j == ud_c_array[i]->channel_id) continue; // skip the local channel id _wings_get_remote_qp(ud_c_array[i], (uint8_t)j); } } void _wings_share_qp_info_via_memcached(ud_channel_t** ud_c_array, uint16_t ud_c_num, dbit_vector_t* shared_rdy_var, int worker_lid, struct hrd_ud_ctrl_blk* cb) { for (int i = 0; i < ud_c_num; i++) { char qp_global_name[HRD_QP_NAME_SIZE]; sprintf(qp_global_name, "%lu-%d", _wings_simple_hash((unsigned 
char*)ud_c_array[i]->qp_name), ud_c_array[i]->channel_id); hrd_publish_dgram_qp(cb, i, qp_global_name, WORKER_SL); // yellow_printf("Publishing: %s (qpname: %s)\n", qp_global_name, // ud_c_array[i]->qp_name); } _wings_get_remote_qps(ud_c_array, ud_c_num); if (shared_rdy_var == NULL) { assert(worker_lid == 0); return; } assert(dbv_bit_get(*shared_rdy_var, worker_lid) == 0); dbv_bit_set(shared_rdy_var, (uint8_t)worker_lid); // WARNING (global) shared_rdy_var which is used as a g_share_qs_barrier must // be len of num_workers + 1 while (!dbv_is_all_set(*shared_rdy_var)) usleep(20000); assert(dbv_is_all_set(*shared_rdy_var)); } void wings_reconfigure_wrs_ah(ud_channel_t* ud_c, uint8_t endpoint_id) { _wings_get_remote_qp(ud_c, endpoint_id); if (!ud_c->disable_crd_ctrl) _wings_get_remote_qp(ud_c->channel_providing_crds, endpoint_id); /// TODO WARNING: this is untested and assumes that we always send to everyone if (ud_c->is_bcast_channel) { uint16_t remote_channels = (uint16_t)(ud_c->num_channels - 1); uint16_t max_msgs_in_pcie_batch = (uint16_t)(ud_c->max_pcie_bcast_batch * remote_channels); for (int i = 0; i < max_msgs_in_pcie_batch; ++i) { int i_mod_bcast = i % remote_channels; uint16_t rm_qp_id; if (i_mod_bcast < ud_c->channel_id) rm_qp_id = (uint16_t)i_mod_bcast; else rm_qp_id = (uint16_t)((i_mod_bcast + 1) % ud_c->num_channels); ud_c->send_wr[i].wr.ud.ah = ud_c->remote_qps[rm_qp_id].ah; ud_c->send_wr[i].wr.ud.remote_qpn = ud_c->remote_qps[rm_qp_id].qpn; } } } ================================================ FILE: tla/Hermes.tla ================================================ ------------------------------- MODULE Hermes ------------------------------- EXTENDS Integers, FiniteSets CONSTANTS H_NODES, H_MAX_VERSION VARIABLES msgs, nodeTS, nodeState, nodeRcvedAcks, nodeLastWriter, nodeLastWriteTS, nodeWriteEpochID, aliveNodes, epochID \* all Hermes (+ environment) variables hvars == << msgs, nodeTS, nodeState, nodeRcvedAcks, nodeLastWriter, nodeLastWriteTS, 
nodeWriteEpochID, aliveNodes, epochID >> ------------------------------------------------------------------------------------- HMessage == \* Messages exchanged by the Protocol [type: {"INV", "ACK"}, sender : H_NODES, epochID : 0..(Cardinality(H_NODES) - 1), version : 0..H_MAX_VERSION, tieBreaker: H_NODES] \* Note that we need not send Value w/ INVs, timestamp suffice to check consistency \union [type: {"VAL"}, \* optimization: epochID is not required for VALs \* epochID : 0..(Cardinality(H_NODES) - 1), version : 0..H_MAX_VERSION, tieBreaker: H_NODES] HTypeOK == \* The type correctness invariant /\ msgs \subseteq HMessage /\ \A n \in H_NODES: nodeRcvedAcks[n] \subseteq (H_NODES \ {n}) /\ nodeLastWriter \in [H_NODES -> H_NODES] /\ nodeLastWriteTS \in [H_NODES -> [version : 0..H_MAX_VERSION, tieBreaker: H_NODES ]] /\ nodeTS \in [H_NODES -> [version : 0..H_MAX_VERSION, tieBreaker: H_NODES ]] /\ nodeState \in [H_NODES -> {"valid", "invalid", "invalid_write", "write", "replay"}] \* membership and epoch id related /\ aliveNodes \subseteq H_NODES /\ epochID \in 0..(Cardinality(H_NODES) - 1) /\ nodeWriteEpochID \in [H_NODES -> 0..(Cardinality(H_NODES) - 1)] \* The consistent invariant: all alive nodes in valid state should have the same value / TS HConsistent == \A k,s \in aliveNodes: \/ nodeState[k] /= "valid" \/ nodeState[s] /= "valid" \/ nodeTS[k] = nodeTS[s] HInit == \* The initial predicate /\ msgs = {} \* membership and epoch id related /\ epochID = 0 /\ aliveNodes = H_NODES /\ nodeWriteEpochID = [n \in H_NODES |-> 0] \* Init rest per node replica metadata /\ nodeRcvedAcks = [n \in H_NODES |-> {}] /\ nodeState = [n \in H_NODES |-> "valid"] /\ nodeLastWriter = [n \in H_NODES |-> CHOOSE k \in H_NODES: \A m \in H_NODES: k <= m] /\ nodeTS = [n \in H_NODES |-> [version |-> 0, tieBreaker |-> CHOOSE k \in H_NODES: \A m \in H_NODES: k <= m]] /\ nodeLastWriteTS = [n \in H_NODES |-> [version |-> 0, tieBreaker |-> CHOOSE k \in H_NODES: \A m \in H_NODES: k <= m]] 
-------------------------------------------------------------------------------------

\* NOTE(review): every `UNCHANGED <>` below carries an empty tuple in this copy of the
\* spec. Presumably each one originally listed the variables its action leaves untouched
\* and the tuple contents were lost in extraction -- confirm against the upstream file.

\* A buffer maintaining all network messages. Messages are only appended to this variable
\* (not removed once delivered) intentionally, to check the protocol's tolerance to
\* duplicates and reorderings
send(m) == msgs' = msgs \union {m}

\* Check if all acknowledgments for a write have been received
\* (i.e. every alive node other than n is in n's set of received acks)
receivedAllAcks(n) == (aliveNodes \ {n}) \subseteq nodeRcvedAcks[n]

equalTS(v1,tb1,v2,tb2) == \* Timestamp equality
    /\ v1 = v2
    /\ tb1 = tb2

greaterTS(v1,tb1,v2,tb2) == \* Timestamp comparison
                            \* (version first; tie-breaker decides on equal versions)
    \/ v1 > v2
    \/ /\ v1 = v2
       /\ tb1 > tb2

isAlive(n) == n \in aliveNodes

nodeFailure(n) == \* Emulate a node failure
    \* Make sure that there are at least 3 alive nodes before killing a node
    /\ Cardinality(aliveNodes) > 2
    \* All gathered acks are discarded on every node and the epoch advances
    /\ nodeRcvedAcks' = [k \in H_NODES |-> {}]
    /\ aliveNodes' = aliveNodes \ {n}
    /\ epochID' = epochID + 1
    /\ UNCHANGED <>

h_upd_not_aliveNodes ==
    /\ UNCHANGED <>

h_upd_aliveNodes ==
    /\ UNCHANGED <>

\* Conjunction of the two helpers above; used by actions that modify no variable
h_upd_nothing ==
    /\ h_upd_not_aliveNodes
    /\ h_upd_aliveNodes

-------------------------------------------------------------------------------------

\* Update node n's local metadata for a write / replay: n becomes its own last writer,
\* its ack set and state are replaced, and both nodeTS and nodeLastWriteTS are set to
\* the timestamp (newVersion, newTieBreaker)
h_upd_state(n, newVersion, newTieBreaker, newState, newAcks) ==
    /\ nodeLastWriter' = [nodeLastWriter EXCEPT ![n] = n]
    /\ nodeRcvedAcks' = [nodeRcvedAcks EXCEPT ![n] = newAcks]
    /\ nodeState' = [nodeState EXCEPT ![n] = newState]
    /\ nodeWriteEpochID' = [nodeWriteEpochID EXCEPT ![n] = epochID] \* we always use the latest epochID
    /\ nodeTS' = [nodeTS EXCEPT ![n].version = newVersion,
                                ![n].tieBreaker = newTieBreaker]
    /\ nodeLastWriteTS' = [nodeLastWriteTS EXCEPT ![n].version = newVersion,
                                                  ![n].tieBreaker = newTieBreaker]

\* Add a message of type msgType from node n, carrying the given timestamp, to msgs
h_send_inv_or_ack(n, newVersion, newTieBreaker, msgType) ==
    /\ send([type |-> msgType,
             epochID |-> epochID, \* we always use the latest epochID
             sender |-> n,
             version |-> newVersion,
             tieBreaker |-> newTieBreaker])

h_actions_for_upd(n, newVersion, newTieBreaker, newState, newAcks) == \* Execute a write
    /\ h_upd_state(n, newVersion, newTieBreaker, newState, newAcks)
    /\ h_send_inv_or_ack(n, newVersion, newTieBreaker, "INV")
    /\ UNCHANGED <>

h_actions_for_upd_replay(n, acks) == \* Apply a write-replay using same TS (version, tie-breaker)
                                     \* and either reset acks or keep already gathered acks
    /\ h_actions_for_upd(n, nodeTS[n].version, nodeTS[n].tieBreaker, "replay", acks)

-------------------------------------------------------------------------------------

HRead(n) == \* Execute a read (enabled only in valid state; changes nothing)
    /\ nodeState[n] = "valid"
    /\ h_upd_nothing

HWrite(n) == \* Execute a write
\*    /\ nodeState[n] \in {"valid", "invalid"}
\*    writes in invalid state are also supported as an optimization
    /\ nodeState[n] \in {"valid"}
    /\ nodeTS[n].version < H_MAX_VERSION \* Only to configurably terminate the model checking
    \* New timestamp: version incremented by one, with n itself as the tie-breaker
    /\ h_actions_for_upd(n, nodeTS[n].version + 1, n, "write", {})

HCoordWriteReplay(n) == \* Execute a write-replay after a membership re-config
    /\ nodeState[n] \in {"write", "replay"}
    /\ nodeWriteEpochID[n] < epochID
    /\ ~receivedAllAcks(n) \* optimization to not replay when we have gathered acks from all alive
    /\ h_actions_for_upd_replay(n, nodeRcvedAcks[n])

HRcvAck(n) == \* Process a received acknowledgment
    \E m \in msgs:
        /\ m.type = "ACK"
        /\ m.epochID = epochID
        /\ m.sender /= n
        /\ m.sender \notin nodeRcvedAcks[n]
        \* Only acks matching the timestamp of n's last write are counted
        /\ equalTS(m.version, m.tieBreaker,
                   nodeLastWriteTS[n].version,
                   nodeLastWriteTS[n].tieBreaker)
        /\ nodeState[n] \in {"write", "invalid_write", "replay"}
        /\ nodeRcvedAcks' = [nodeRcvedAcks EXCEPT ![n] =
                                nodeRcvedAcks[n] \union {m.sender}]
        /\ UNCHANGED <>

HSendVals(n) == \* Send validations once acknowledgments are received from all alive nodes
    /\ nodeState[n] \in {"write", "replay"}
    /\ receivedAllAcks(n)
    /\ nodeState' = [nodeState EXCEPT![n] = "valid"]
    /\ send([type |-> "VAL",
             version |-> nodeTS[n].version,
             tieBreaker |-> nodeTS[n].tieBreaker])
    /\ UNCHANGED <>

HCoordinatorActions(n) == \* Actions of a read/write coordinator
    \/ HRead(n)
    \/ HCoordWriteReplay(n) \* After failures
    \/ HWrite(n)
    \/ HRcvAck(n)
    \/ HSendVals(n)
-------------------------------------------------------------------------------------
HRcvInv(n) ==  \* Process a received invalidation
    \E m \in msgs:
        /\ m.type     = "INV"
        /\ m.epochID  = epochID
        /\ m.sender  /= n
        \* always acknowledge a received invalidation (irrelevant to the timestamp)
        /\ send([type       |-> "ACK",
                 sender     |-> n,
                 epochID    |-> epochID,
                 version    |-> m.version,
                 tieBreaker |-> m.tieBreaker])
        \* only an invalidation with a greater timestamp changes the local replica
        /\ IF greaterTS(m.version, m.tieBreaker,
                        nodeTS[n].version, nodeTS[n].tieBreaker)
           THEN /\ nodeLastWriter' = [nodeLastWriter EXCEPT ![n] = m.sender]
                /\ nodeTS' = [nodeTS EXCEPT ![n].version    = m.version,
                                            ![n].tieBreaker = m.tieBreaker]
                /\ IF nodeState[n] \in {"valid", "invalid", "replay"}
                   THEN nodeState' = [nodeState EXCEPT ![n] = "invalid"]
                   ELSE nodeState' = [nodeState EXCEPT ![n] = "invalid_write"]
           ELSE UNCHANGED <<nodeState, nodeTS, nodeLastWriter>>
        /\ UNCHANGED <<nodeLastWriteTS, aliveNodes, nodeRcvedAcks,
                       epochID, nodeWriteEpochID>>

HRcvVal(n) ==   \* Process a received validation
    \E m \in msgs:
        /\ nodeState[n] /= "valid"
        /\ m.type = "VAL"
        \* the validation must match the timestamp currently held by n
        /\ equalTS(m.version, m.tieBreaker,
                   nodeTS[n].version, nodeTS[n].tieBreaker)
        /\ nodeState' = [nodeState EXCEPT ![n] = "valid"]
        /\ UNCHANGED <<msgs, nodeTS, nodeLastWriter, nodeLastWriteTS, aliveNodes,
                       nodeRcvedAcks, epochID, nodeWriteEpochID>>

HFollowerWriteReplay(n) == \* Execute a write-replay when coordinator failed
    /\ nodeState[n] \in {"invalid", "invalid_write"}
    /\ ~isAlive(nodeLastWriter[n])
    /\ h_actions_for_upd_replay(n, {})

HFollowerActions(n) ==  \* Actions of a write follower
    \/ HRcvInv(n)
    \/ HFollowerWriteReplay(n)
    \/ HRcvVal(n)

-------------------------------------------------------------------------------------
HNext == \* Hermes (read/write) protocol (Coordinator and Follower actions) + failures
    \E n \in aliveNodes:
        \/ HFollowerActions(n)
        \/ HCoordinatorActions(n)
        \/ nodeFailure(n)

H_Spec == HInit /\ [][HNext]_hvars

THEOREM H_Spec =>([]HTypeOK) /\ ([]HConsistent)
=============================================================================

================================================
FILE: tla/HermesRMWs.tla
================================================
------------------------------- MODULE HermesRMWs -------------------------------
EXTENDS Hermes

VARIABLES Rmsgs,
          nodeFlagRMW,
          committedRMWs,
          committedWrites

\* all Hermes (+ environment, + RMW) variables
hrvars == << msgs, nodeTS, nodeState, nodeRcvedAcks, nodeLastWriter, nodeLastWriteTS,
             nodeWriteEpochID, aliveNodes, epochID, Rmsgs, nodeFlagRMW,
             committedRMWs, committedWrites >>
-------------------------------------------------------------------------------------
HRMessage ==  \* Invalidation msgs exchanged by the Hermes Protocol w/ RMWs
    [type       : {"RINV"},
     flagRMW    : {0,1}, \* RMW change
     epochID    : 0..(Cardinality(H_NODES) - 1),
     sender     : H_NODES,
     version    : 0..H_MAX_VERSION,
     tieBreaker : H_NODES]

\* Set of all timestamps (version, tieBreaker)
HRts == [version: 0..H_MAX_VERSION, tieBreaker: H_NODES]

HRTypeOK ==  \* The type correctness invariant
    /\ HTypeOK
    /\ Rmsgs           \subseteq HRMessage
    /\ nodeFlagRMW     \in [H_NODES -> {0,1}]
    /\ committedRMWs   \subseteq HRts
    /\ committedWrites \subseteq HRts

HRSemanticsRMW == \* The invariant that we cannot have two operations committed
                  \* with same versions (i.e., that read the same value unless they are both writes)
    \* a committed RMW never shares its version with a committed write,
    \* nor with the version immediately preceding a committed write
    /\ \A x \in committedRMWs:
         \A y \in committedWrites: /\ x.version /= y.version
                                   /\ x.version /= y.version - 1
    \* two committed RMWs with the same version have the same tieBreaker
    /\ \A x,y \in committedRMWs: \/ x.version /= y.version
                                 \/ x.tieBreaker = y.tieBreaker

HRInit == \* The initial predicate
    /\ HInit
    /\ Rmsgs           = {}
    /\ committedRMWs   = {}
    /\ committedWrites = {}
    /\ nodeFlagRMW     = [n \in H_NODES |-> 0] \* RMW change

-------------------------------------------------------------------------------------
\* A buffer maintaining all Invalidation messages.
\* Messages are only appended to this variable (not removed once delivered)
\* intentionally, to check the protocol's tolerance to duplicates and reorderings.
HRsend(m) == Rmsgs' = Rmsgs \union {m}

\* Frame condition leaving every RMW-specific variable unchanged; it is
\* conjoined with plain Hermes actions, which assign the Hermes variables.
hr_upd_nothing ==
    /\ UNCHANGED <<Rmsgs, nodeFlagRMW, committedRMWs, committedWrites>>

\* Record timestamp (ver, tieB) as a committed write
hr_completeWrite(ver, tieB) ==
    /\ committedWrites' = committedWrites \union {[version |-> ver, tieBreaker |-> tieB]}
    /\ UNCHANGED <<Rmsgs, nodeFlagRMW, committedRMWs>>

\* Record timestamp (ver, tieB) as a committed RMW
hr_completeRMW(ver, tieB) ==
    /\ committedRMWs' = committedRMWs \union {[version |-> ver, tieBreaker |-> tieB]}
    /\ UNCHANGED <<Rmsgs, nodeFlagRMW, committedWrites>>

-------------------------------------------------------------------------------------
\* Helper functions
hr_upd_state(n, newVersion, newTieBreaker, newState, newAcks, flagRMW) ==
    /\ nodeFlagRMW' = [nodeFlagRMW EXCEPT ![n] = flagRMW] \* RMW change
    /\ h_upd_state(n, newVersion, newTieBreaker, newState, newAcks)

hr_send_inv(n, newVersion, newTieBreaker, flagRMW) ==
    /\ HRsend([type       |-> "RINV",
               epochID    |-> epochID, \* we always use the latest epochID
               flagRMW    |-> flagRMW, \* RMW change
               sender     |-> n,
               version    |-> newVersion,
               tieBreaker |-> newTieBreaker])

hr_actions_for_upd(n, newVersion, newTieBreaker, newState, newAcks, flagRMW) == \* Execute a write
    /\ hr_upd_state(n, newVersion, newTieBreaker, newState, newAcks, flagRMW)
    /\ hr_send_inv(n, newVersion, newTieBreaker, flagRMW)
    \* everything else is assigned by the two conjuncts above
    /\ UNCHANGED <<msgs, aliveNodes, epochID, committedRMWs, committedWrites>>

hr_actions_for_upd_replay(n, acks) == \* Apply a write-replay using same TS (version, Tie Breaker)
                                      \* and either reset acks or keep already gathered acks
    /\ hr_actions_for_upd(n, nodeTS[n].version, nodeTS[n].tieBreaker,
                          "replay", acks, nodeFlagRMW[n])

-------------------------------------------------------------------------------------
\* Coordinator functions
HRWrite(n) == \* Execute a write (advances the version by 2)
\*    /\ nodeState[n] \in {"valid", "invalid"} \* writes in invalid state are also supported as an optimization
    /\ nodeState[n] = "valid"
    /\ nodeTS[n].version + 2 <= H_MAX_VERSION \* Only to configurably terminate the model checking
    /\ hr_actions_for_upd(n, nodeTS[n].version + 2, n, "write", {}, 0)

HRRMW(n) == \* Execute an RMW (advances the version by 1)
    /\ nodeState[n] = "valid"
    /\ nodeTS[n].version + 1 <= H_MAX_VERSION \* Only to configurably terminate the model checking
    /\ hr_actions_for_upd(n, nodeTS[n].version + 1, n, "write", {}, 1)

HRWriteReplay(n) == \* Execute a write-replay (keeps the acks gathered so far)
    /\ nodeState[n] \in {"write", "replay"}
    /\ nodeWriteEpochID[n] < epochID
    /\ ~receivedAllAcks(n) \* optimization to not replay when we have gathered acks from all alive
    /\ nodeFlagRMW[n] = 0
    /\ hr_actions_for_upd_replay(n, nodeRcvedAcks[n])

HRRMWReplay(n) == \* Execute an RMW-replay (resets the gathered acks)
    /\ nodeState[n] \in {"write", "replay"}
    /\ nodeWriteEpochID[n] < epochID
    /\ ~receivedAllAcks(n) \* optimization to not replay when we have gathered acks from all alive
    /\ nodeFlagRMW[n] = 1
    /\ hr_actions_for_upd_replay(n, {})

\* Keep the HRead, HRcvAck and HSendVals the same as Hermes w/o RMWs
HRRead(n) ==
    /\ HRead(n)
    /\ hr_upd_nothing

HRRcvAck(n) ==
    /\ HRcvAck(n)
    /\ hr_upd_nothing

\* Validate and commit when the pending update of n is an RMW
HRSendValsRMW(n) ==
    /\ nodeFlagRMW[n] = 1
    /\ HSendVals(n)
    /\ hr_completeRMW(nodeTS[n].version, nodeTS[n].tieBreaker)

\* Validate and commit when the pending update of n is a write
HRSendValsWrite(n) ==
    /\ nodeFlagRMW[n] = 0
    /\ HSendVals(n)
    /\ hr_completeWrite(nodeTS[n].version, nodeTS[n].tieBreaker)

HRCoordinatorActions(n) ==   \* Actions of a read/write/RMW coordinator
    \/ HRRead(n)
    \/ HRRMWReplay(n)
    \/ HRWriteReplay(n)
    \/ HRWrite(n)
    \/ HRRMW(n)
    \/ HRRcvAck(n)
    \/ HRSendValsRMW(n)
    \/ HRSendValsWrite(n)

-------------------------------------------------------------------------------------
\* Follower functions
\* State transition of node n upon an invalidation with a greater timestamp
hr_upd_state_greater_inv(n) ==
    IF nodeState[n] \in {"valid", "invalid", "replay"}
    THEN nodeState' = [nodeState EXCEPT ![n] = "invalid"]
    ELSE IF nodeState[n] \in {"write", "invalid_write"} /\ nodeFlagRMW[n] = 0
         THEN nodeState' = [nodeState EXCEPT ![n] = "invalid_write"]
         ELSE \* nodeState[n] \in {"write"} /\ nodeFlagRMW[n] = 1
              nodeState' = [nodeState EXCEPT ![n] = "invalid"]

HRRcvWriteInv(n) ==  \* Process a received invalidation for a write
    \E m \in Rmsgs:
        /\ m.type     = "RINV"
        /\ m.epochID  = epochID
        /\ m.sender  /= n
        /\ m.flagRMW  = 0 \* RMW change
        \* always acknowledge a received invalidation (irrelevant to the timestamp)
        /\ h_send_inv_or_ack(n, m.version, m.tieBreaker, "ACK")
        /\ IF greaterTS(m.version, m.tieBreaker,
                        nodeTS[n].version, nodeTS[n].tieBreaker)
           THEN /\ nodeLastWriter' = [nodeLastWriter EXCEPT ![n] = m.sender]
                /\ nodeFlagRMW'    = [nodeFlagRMW    EXCEPT ![n] = m.flagRMW] \* RMW change
                /\ nodeTS' = [nodeTS EXCEPT ![n].version    = m.version,
                                            ![n].tieBreaker = m.tieBreaker]
                /\ hr_upd_state_greater_inv(n)
           ELSE /\ UNCHANGED <<nodeState, nodeTS, nodeLastWriter, nodeFlagRMW>>
        /\ UNCHANGED <<nodeLastWriteTS, aliveNodes, nodeRcvedAcks, epochID,
                       nodeWriteEpochID, committedRMWs, committedWrites, Rmsgs>>

HRRcvRMWInv(n) ==  \* Process a received invalidation for an RMW
    \E m \in Rmsgs:
        /\ m.type     = "RINV"
        /\ m.epochID  = epochID
        /\ m.sender  /= n
        /\ m.flagRMW  = 1
        /\ IF greaterTS(m.version, m.tieBreaker,
                        nodeTS[n].version, nodeTS[n].tieBreaker)
           THEN /\ nodeLastWriter' = [nodeLastWriter EXCEPT ![n] = m.sender]
                /\ nodeFlagRMW'    = [nodeFlagRMW    EXCEPT ![n] = m.flagRMW] \* RMW change
                /\ nodeTS' = [nodeTS EXCEPT ![n].version    = m.version,
                                            ![n].tieBreaker = m.tieBreaker]
                \* acknowledge a received invalidation (w/ greater timestamp)
                /\ h_send_inv_or_ack(n, m.version, m.tieBreaker, "ACK")
                /\ hr_upd_state_greater_inv(n)
                /\ UNCHANGED <<Rmsgs>>
           ELSE IF equalTS(m.version, m.tieBreaker,
                           nodeTS[n].version, nodeTS[n].tieBreaker)
                THEN \* acknowledge a received invalidation (w/ equal timestamp)
                     /\ h_send_inv_or_ack(n, m.version, m.tieBreaker, "ACK")
                     /\ UNCHANGED <<nodeState, nodeTS, nodeLastWriter,
                                    nodeFlagRMW, Rmsgs>>
                ELSE \* smaller TS: no ack; re-send an invalidation with the local TS
                     /\ hr_send_inv(n, nodeTS[n].version, nodeTS[n].tieBreaker,
                                    nodeFlagRMW[n])
                     /\ UNCHANGED <<nodeState, nodeTS, nodeLastWriter,
                                    nodeFlagRMW, msgs>>
        /\ UNCHANGED <<nodeLastWriteTS, aliveNodes, nodeRcvedAcks, epochID,
                       nodeWriteEpochID, committedRMWs, committedWrites>>

\* Keep the HRcvVals the same as Hermes w/o RMWs
HRRcvVal(n) ==
    /\ HRcvVal(n)
    /\ hr_upd_nothing

HRFollowerWriteReplay(n) == \* Execute a write-replay when coordinator failed
    /\ nodeState[n] \in {"invalid", "invalid_write"}
    /\ ~isAlive(nodeLastWriter[n])
    /\ hr_actions_for_upd_replay(n, {})

HRFollowerActions(n) ==  \* Actions of a write follower
    \/ HRFollowerWriteReplay(n)
    \/ HRRcvWriteInv(n)
    \/ HRRcvRMWInv(n)
    \/ HRRcvVal(n)
-------------------------------------------------------------------------------------
\* A node failure as in plain Hermes; the RMW-specific variables stay untouched.
HRNodeFailure(n) ==
    /\ nodeFailure(n)
    /\ hr_upd_nothing

\* Next-state relation of Hermes with reads, writes and RMWs: some alive node
\* takes a coordinator step, takes a follower step, or fails.
HRNext ==
    \E n \in aliveNodes:
        \/ HRCoordinatorActions(n)
        \/ HRFollowerActions(n)
        \/ HRNodeFailure(n)

\* Hermes w/ RMW Spec
HRSpec == HRInit /\ [][HRNext]_hrvars

THEOREM HRSpec =>([]HRTypeOK) /\ ([]HConsistent) /\ ([]HRSemanticsRMW)

\* Running plain Hermes (w/o RMWs) from this same model: every step is an
\* HNext step that leaves the RMW-specific variables unchanged.
HSpec == HRInit /\ [][HNext /\ hr_upd_nothing]_hrvars

THEOREM HSpec =>([]HRTypeOK) /\ ([]HConsistent)
=============================================================================

================================================
FILE: tla/README.md
================================================
# Hermes-Protocol TLA spec

- Hermes: fault-tolerant replication protocol with strong consistency and high performance

---

Warning protocol-actions png contains some optimizations over the Hermes protocol presented in the paper such as issuing writes while being in Invalid state.