Repository: vnmakarov/mum-hash
Branch: master
Commit: 31b2cc31970c
Files: 70
Total size: 760.6 KB

Directory structure:
gitextract_28zcd061/

├── .clang-format
├── ChangeLog
├── README.md
├── benchmarks/
│   ├── City.cpp
│   ├── City.h
│   ├── SpookyV2.cpp
│   ├── SpookyV2.h
│   ├── bbs-prng.h
│   ├── bench-crypto.c
│   ├── bench-crypto.sh
│   ├── bench-prng.c
│   ├── bench-prng.sh
│   ├── bench.c
│   ├── bench.sh
│   ├── blake2-config.h
│   ├── blake2-impl.h
│   ├── blake2.h
│   ├── blake2b-load-sse2.h
│   ├── blake2b-load-sse41.h
│   ├── blake2b-round.h
│   ├── blake2b.c
│   ├── byte_order.c
│   ├── byte_order.h
│   ├── chacha-prng.h
│   ├── gen-table.rb
│   ├── meow_hash.h
│   ├── meow_intrinsics.h
│   ├── metrohash64.cpp
│   ├── metrohash64.h
│   ├── mum512-prng.h
│   ├── platform.h
│   ├── rapidhash.h
│   ├── sha3.c
│   ├── sha3.h
│   ├── sha512.c
│   ├── sha512.h
│   ├── sip24-prng.h
│   ├── siphash24.c
│   ├── splitmix64.c
│   ├── t1ha/
│   │   ├── src/
│   │   │   ├── t1ha0.c
│   │   │   ├── t1ha0_ia32aes_a.h
│   │   │   ├── t1ha0_ia32aes_avx.c
│   │   │   ├── t1ha0_ia32aes_avx2.c
│   │   │   ├── t1ha0_ia32aes_b.h
│   │   │   ├── t1ha0_ia32aes_noavx.c
│   │   │   ├── t1ha0_selfcheck.c
│   │   │   ├── t1ha1.c
│   │   │   ├── t1ha1_selfcheck.c
│   │   │   ├── t1ha2.c
│   │   │   ├── t1ha2_selfcheck.c
│   │   │   ├── t1ha_bits.h
│   │   │   ├── t1ha_selfcheck.c
│   │   │   ├── t1ha_selfcheck.h
│   │   │   └── t1ha_selfcheck_all.c
│   │   └── t1ha.h
│   ├── ustd.h
│   ├── xoroshiro128plus.c
│   ├── xoroshiro128starstar.c
│   ├── xoseed.c
│   ├── xoshiro256plus.c
│   ├── xoshiro256starstar.c
│   ├── xoshiro512plus.c
│   ├── xoshiro512starstar.c
│   ├── xxh3.h
│   ├── xxhash.c
│   └── xxhash.h
├── mum-prng.h
├── mum.h
├── mum512.h
└── vmum.h

================================================
FILE CONTENTS
================================================

================================================
FILE: .clang-format
================================================
BasedOnStyle: google
SpaceBeforeParens: Always
IndentCaseLabels: false
AllowShortIfStatementsOnASingleLine: true
AllowShortLoopsOnASingleLine: true
SpaceAfterCStyleCast: true
PointerAlignment: Right
BreakBeforeBinaryOperators: All
ConstructorInitializerIndentWidth: 2
ContinuationIndentWidth: 2
PenaltyBreakBeforeFirstCallParameter: 10000
SortIncludes: false
BreakStringLiterals: true
BreakBeforeTernaryOperators: true
AllowShortCaseLabelsOnASingleLine: true
#AllowShortEnumsOnASingleLine: true
ColumnLimit: 100
MaxEmptyLinesToKeep: 1
#StatementMacros: [ 'REP2', 'REP3', 'REP4', 'REP5', 'REP6', 'REP7', 'REP8' ]
#TypenameMacros: [ 'VARR', 'DLIST', 'HTAB' ]


================================================
FILE: ChangeLog
================================================
2018-11-02  Vladimir Makarov  <vmakarov@gcc.gnu.org>

	* README.md: Add update about mum-prng.  Correct typo for
	xoshiro512** result.
	* mum-prng.h (_mum_prng_state): Change avx2_support onto
	update_func.
	(_mum_prng_setup_avx2): Setup update_func.  Move below.
	(_start_mum_prng): Setup update_func for non x86-64.  Move below.
	(init_mum_prng, set_mum_prng_seed): Move below.
	(_mum_prng_update_avx2, _mum_prng_update): Update a state word
	from the next word.
	(get_mum_prn): Simplify.
	* src/bench-prng: Use env. variable MUM_ONLY.

2018-10-31  Vladimir Makarov  <vmakarov@gcc.gnu.org>

	* README.md: Minor editions.  Add RAND failure on practrand.

2018-10-31  Vladimir Makarov  <vmakarov@gcc.gnu.org>

	* README.md: Update for PRNGs and MUM-512.
	* mum-512.h (_MC_FRESH_GCC, _mc_hash_avx2): Removed.
	(_mc_hash_default): Don't check _MC_UNALIGNED_ACCESS.
	(mum512_keyed_hash): Remove avx2 probe.  Use _mc_hash_aligned.
	* mum-prng.h (_mum_avx2): New.
	(_mum_prng_update_avx2): Use it.
	* src/bench-prng: Add new xo[ro]shiro tests.
	* src/bench-prng.c: Ditto.  Add code for PRNs output.
	* src/xoroshiro128plus.c, src/xoroshiro128starstar.c:  New files.
	* src/xoshiro256plus.c, src/xoshiro256starstar.c: New files.
	* src/xoshiro512plus.c, src/xoshiro512starstar.c: New files.
	* src/splitmix64.c, src/xoseed.c: New files.

2018-10-30  Vladimir Makarov  <vmakarov@gcc.gnu.org>

	* README.md: Descrition of new version of mum-hash.  New results
	for modern CPUs.
	* mum-hash (_MUM_FRESH_GCC): Remove.
	(_mum_rotl): New.
	(_mum_hash_aligned, _mum_final): Add new version.  Use MUM_V1 for
	the old code.
	(_mum_hash_avx2): Remove.
	(_mum_hash_default, mum_hash): Modify and simplify.
	* src/bench: Use environment variable MUM_ONLY for runing mum-hash
	only.  Add Meow Hash runs.  Use 16MB keys instead of 1KB ones.
	* src/bench.c (meowhash_test): New.
	(main): Use 16MB keys instead of 1KB ones.
	* meow_hash.h: New.

2016-08-10  Aras Pranckevičius <nearaz@gmail.com>
	    Vladimir Makarov  <vmakarov@gcc.gnu.org>

	* mum.h: Permit unaligned access for _M_AMD64 and _M_IX86
	(Windows).
	* mum512.h: Ditto.
	* bench: Use CC instead of CXX for siphash.

2016-07-13  Vladimir Makarov  <vmakarov@gcc.gnu.org>

	* mum.h (_mum_hash_aligned, mum_hash_randomize): Make i type of
	size_t.
	* mum512.h (_mc_hash_aligned): Ditto.
	(_mc_init_state, _mc_hash_avx2, _mc_hash_default): Use cont for
	seed.

2016-06-14  Vladimir Makarov  <vmakarov@gcc.gnu.org>

	* README.md: Add results for Blake2.
	* src/{blake2b.c, blake2b-load-sse2.h, blake2b-load-sse41.h}: New.
	* src/{blake2b-round.h, blake2-config.h, blake2.h, blake2-impl.h}:
	New.
	* src/bench-crypto.c: Add code for testing Blake2.
	* src/bench-crypto: Ditto.

2016-06-07  Vladimir Makarov  <vmakarov@gcc.gnu.org>

	* README.md: Update speed numbers for all functions for aarch64.

y2016-06-06  Vladimir Makarov  <vmakarov@gcc.gnu.org>

	* src/bench.c: Use faster interface for xxHash.
	* README.md: Update xxHash speed numbers for x86-64 and ppc64.

2016-05-18  Vladimir Makarov  <vmakarov@gcc.gnu.org>

	* README.md: Update speed data for MUM and MUM512 and add info
	about testing MUM PRNG on NIST bigger data.

2016-05-18  Vladimir Makarov  <vmakarov@gcc.gnu.org>
	    Vsevolod Stakhov  <vsevolod@highsecure.ru>

	* mum512.h (_MC_FRESH_GCC): New. Use it as a guard for avx2 version.
	* mum-prng.h (_MUM_PRNG_FRESH_GCC): New. Use it as a guard for
	avx2 version.
	* mum.h (_MUM_FRESH_GCC): New. Use it as a guard for avx2 version.
	Remove clang guard for _MUM_OPTIMIZE etc.

2016-05-18  Vladimir Makarov  <vmakarov@gcc.gnu.org>

	* src/mum-prng.h: Move it to parent directory

2016-05-13  Vladimir Makarov  <vmakarov@gcc.gnu.org>

	* README.md: Some minor changes.

2016-05-13  Vladimir Makarov  <vmakarov@gcc.gnu.org>
	    Vsevolod Stakhov  <vsevolod@highsecure.ru>

	* src/bench.c: Include test for metro hash.
	* src/bench (COPTFLAGS, CC, CXX, LTO): New.  Use them.  Add runs for
	metro hash.
	* README.md: Update benchmark results, add results for MetroHash.
	* metrohash64.cpp: New.
	* metrohash64.h: New.
	* platform.h: New.

2016-05-13  Vsevolod Stakhov  <vsevolod@highsecure.ru>

	* mum.h: Add support for LLVM.

2016-05-13  Vladimir Makarov  <vmakarov@gcc.gnu.org>

	* mum.h (_mum_le32): New.
	(_mum_hash_aligned): Change the code to deal with uint64_t shifts
	and endianess.

2016-05-12  Vladimir Makarov  <vmakarov@gcc.gnu.org>

	* README.md: Add results for 5-byte string tests.
	* mum.h (uint16_t): New.
	(_mum_hash_aligned): Modify code to process a tail < 8 bytes.
	(_mum_hash_default): Use memmove instead of memcpy.
	* src/bench.c: Add test for 5-byte strings.
	* src/bench: Ditto.

2016-05-10  Vladimir Makarov  <vmakarov@gcc.gnu.org>

	* README.md: Highlight the new MUM PRNG speed.

2016-05-10  Vladimir Makarov  <vmakarov@gcc.gnu.org>

	* README.md: Describe a new version of MUM PRNG and update its
	speed.
	* src/mum-prng.h (MUM_PRNG_UNROLL): New.
	(EXPECT): New.
	(mum_prng_state): Rename to _mum_prng_state.  Add fields count and
	avx2_support.  Make state an array.
	(_mum_prng_setup_avx2, _mum_prn_update, _mum_prn_avx2_update):
	New.
	(_start_mum_prng): New.
	(init_mum_prng): Use _start_mum_prng.
	(set_mum_seed): Rename to set_mum_prng_seed.  Use _start_mum_prng.
	(get_mum_prn): Rewrite.
	* src/bench-prng.c (init_prng): Randomize multiplication
	constants.

2016-05-09  Vladimir Makarov  <vmakarov@gcc.gnu.org>

	* src/mum-prng.h (init_mum_prng): Use seed == 1.
	(get_mum_prn): Fix prns generation.
	* README.md: Update result for MUM PRNG.

2016-05-09  Vladimir Makarov  <vmakarov@gcc.gnu.org>

	* README.md: Add results for xxHash64 on AARCH64 and PPC64 and for
	xoroshiro128+.
	* src/xoroshiro128plus.c: New.
	* src/bench-prng.c: Add a code to test xoroshiro128+.
	* src/bench-prng: Add a run to test speed of xoroshiro128+.

2016-05-09  Vladimir Makarov  <vmakarov@gcc.gnu.org>

	* README.md: Add results for xxHash64.
	* src/xxhash.[ch]: New.
	* src/bench.c: Add code for xxHash64.
	(state, xxHash64_test): New.
	* src/bench: Add runs for xxHash64.

2016-05-08  Vladimir Makarov  <vmakarov@gcc.gnu.org>

	* README.md: Some editing.

2016-05-08  Vladimir Makarov  <vmakarov@gcc.gnu.org>

	* mum512.h (_mc_rotr): Decrease sh for sh >= 64.

2016-05-08  Vladimir Makarov  <vmakarov@gcc.gnu.org>

	* mum512.h (_mc_ti): Define depending on endianess.

2016-05-08  Vladimir Makarov  <vmakarov@gcc.gnu.org>

	* mum512.h (_mc_mul64): Change multiplication result names.
	(_mc_permute): Use _mc_xor.

2016-05-08  Vladimir Makarov  <vmakarov@gcc.gnu.org>

	src/bench.c (main): Change input every iteration.

2016-05-08  Vladimir Makarov  <vmakarov@gcc.gnu.org>

	* mum.h: New file.
	* mum512.h: New file.
	* README.md: New file.
	* src/bbs-prng.h: Ditto.
	* src/bench: Ditto.
	* src/bench.c: Ditto.
	* src/bench-crypto: Ditto.
	* src/bench-crypto.c: Ditto.
	* src/bench-prng: Ditto.
	* src/bench-prng.c: Ditto.
	* src/byte_order.[ch]: Ditto.
	* src/chacha-prng.h: Ditto.
	* src/City.cpp: Ditto.
	* src/City.h: Ditto.
	* src/mum512-prng.h: Ditto.
	* src/mum-prng.h: Ditto.
	* src/sha3.[ch]: Ditto.
	* src/sha512.[ch]: Ditto.
	* src/sip24-prng.h: Ditto.
	* src/siphash24.c: Ditto.
	* src/Spooky.cpp: Ditto.
	* src/Spooky.h: Ditto.
	* src/ustd.h: Ditto.


================================================
FILE: README.md
================================================
# **Update (Nov. 28, 2025): Implemented collision attack prevention in VMUM and MUM-V3**
* The attack is described in Issue#18
* The code in question looks like
```
    state ^= _vmum (data[i] ^ _vmum_factors[i], data[i + 1] ^ _vmum_factors[i + 1]));
```
  It is easy to generate data which makes the 1st operand of `_vmum`
  to be zero.  In this case whatever the second operand is, the hash
  will be generated the same.  So an adversary can generate a lot of
  data with the same hash
* This is pretty common code mistake for a few fast hash-functions. At
  least I found the same vulnerability in [wyhash](https://github.com/wangyi-fudan/wyhash/blob/46cebe9dc4e51f94d0dca287733bc5a94f76a10d/wyhash.h#L130) and [rapidhash](https://github.com/Nicoshev/rapidhash/blob/d60698faa10916879f85b2799bfdc6996b94c2b7/rapidhash.h#L383)
* After the code change, the safe variants of VMUM and MUM hashes are
  switched on by default.  If you want previous variants, please use
  macros VMUM_V1 and MUM_V3 correspondingly.  I believe there are still
  cases when they can be used, e.g. for hash tables in compilers.
* The fix consist of checking _vmum operands on zero and use nonzero value instead
  * all checks are implemented to avoid branch instruction generations to keep hash calculation pipeline going
  * still the checks increase length of critical paths of calculation
  * in most cases, new versions of VMUM and MUM generates the same hashes as the previous versions
* The fix results in slowing down hash speeds by about **10%** according to my benchmarks
  * I updated all benchmark data related to the new versions of VMUM and MUM below
	
# MUM Hash
* MUM hash is a **fast non-cryptographic hash function**
  suitable for different hash table implementations
* MUM means **MU**ltiply and **M**ix
  * It is a name of the base transformation on which hashing is implemented
  * Modern processors have a fast logic to do long number multiplications
  * It is very attractive to use it for fast hashing
    * For example, 64x64-bit multiplication can do the same work as 32
      shifts and additions
  * I'd like to call it Multiply and Reduce.  Unfortunately, MUR
    (MUltiply and Rotate) is already taken for famous hashing
    technique designed by Austin Appleby
  * I've chosen the name also as the first release happened on Mother's day
* To use mum you just need one header file (mum.h)
* MUM hash passes **all** [SMHasher](https://github.com/aappleby/smhasher) tests
  * For comparison, only 4 out of 15 non-cryptographic hash functions
    in SMHasher passes the tests, e.g. well known FNV, Murmur2,
    Lookup, and Superfast hashes fail the tests
* MUM V3 hash does not pass the following tests of a more rigourous
  version of [SMHasher](https://github.com/rurban/smhasher):
  * It fails on Perlin noise and bad seeds tests.  It means it still
    qualitative enough for the most applications
  * To make MUM V3 to pass the Rurban SMHasher, macro `MUM_QUALITY` has been
    added.  Compilation with this defined macro makes MUM V3 to pass
    all tests of Rurban SMHasher.  The slowdown is about 5% in average
    or 10% at most on keys of length 8.  It also results in generating
    a target independent hash
* For historic reasons mum.h contains code for older version V1 and
  V2.  You can switch them on by defining macros **MUM_V1** and **MUM_V2**
* MUM algorithm is **simpler** than the VMUM one
* MUM is specifically **designed to be fast on 64-bit CPUs**
  * Still MUM will work for 32-bit CPUs and it will be sometimes
    faster than Spooky and City
* MUM has a **fast startup**.  It is particular good to hash small keys
  which are prevalent in hash table applications

# MUM implementation details

* Input 64-bit data is randomized by 64x64->128 bit multiplication and mixing
  high- and low-parts of the multiplication result by using addition.
  The result is mixed with the current internal state by using XOR
  * Instead of addition for mixing high- and low- parts, XOR could be
    used
    * Using addition instead of XOR improves performance by about
      10% on Haswell and Power7
* Factor numbers, randomly generated with an equal probability of their
  bit values, are used for the multiplication
* When all factors are used once, the internal state is randomized, and the same
  factors are used again for subsequent data randomization
* The main loop is formed to be **unrolled** by the compiler to benefit from the
  the compiler instruction scheduling optimization and OOO
  (out-of-order) instruction execution in modern CPUs
* MUM code does not contain assembly (asm) code anymore. This makes MUM less
  machine-dependent.  To have efficient mum implementation, the
  compiler should support 128-bit integer
  extension (true for GCC and Clang on many targets)

# VMUM Hash
* VMUM is a vector variant of mum hashing (see below)
  * It uses target SIMD instructions (insns)
  * In comparison with mum v3, vmum considerably (up to 3 times) improves the speed
    of hashing mid-range (32 to 256 bytes) to long-range (more 256 bytes) length keys
  * As with previous mum hashing, to use vmum you just need one header
    file (vmum.h)
  * vmum source code is considerably smaller than that of extremely
    fast xxHash3 and th1ha2 and competes with them on hashing speed
  * vmum passes a more rigorous version of
    [SMHasher](https://github.com/rurban/smhasher)
   
# VMUM implementation details
* For long keys vmum uses vector insns:
  * AVX2 256-bit vector insns on x86-64
  * Neon 128-bit vector insns on aarch64
  * Altivec 128-bit vector insns on ppc64
  * There is a scalar emulation of the vector insns, too, for other targets
	* This could be useful for understanding used the vector
      operations used
* You can add usage of vector insns for other targets.  For this you
    just need to add small functions `_vmum_update_block`,
    `_vmum_zero_block`, and `_vmum_fold_block`
  * For the beneficial usage of vector insns the target should have unsigned `32 x 32-bit ->
    64-bit` vector multiplication
* To run vector insns in parallel on OOO CPUs, two vmum code loops are formed
  to be **unrolled** by the compiler into one basic block
* I experimented a lot with other vector insns and found that the usage of
  carry-less (sometimes called polynomial) vector multiplication insns does not work
  well enough for hashing

# VMUM and MUM benchmarking vs other famous hash functions

* Here are the results of benchmarking VMUM and MUM with the fastest
  non-cryptographic hash functions I know:
  * Google City64 (sources are taken from SMHasher)
  * Bob Jenkins Spooky (sources are taken from SMHasher)
  * Yann Collet's xxHash3 (sources are taken from the
    [original repository](https://github.com/Cyan4973/xxHash))
* I also added J. Aumasson and D. Bernstein's
  [SipHash24](https://github.com/veorq/SipHash) for the comparison as it
  is a popular choice for hash table implementation these days
* A [metro hash](https://github.com/jandrewrogers/MetroHash)
  was added as people asked and as metro hash is
  claimed to be the fastest hash function
    * metro hash is not portable as others functions as it does not deal
      with the unaligned accesses problem on some targets
    * metro hash will produce different hash for LE/BE targets
* Measurements were done on 4 different architecture machines:
  * AMD Ryzen 9900X
  * Intel i5-1300K
  * IBM Power10
  * Apple M4 10 cores (mac mini)
* Hashing 10,000 of 16MB keys (bulk)
* Hashing 1,280M keys for all other length keys
* Each test was run 3 times and the minimal time was taken
  * GCC-14.2.1 was used on AMD and M4 machine, GCC-12.3.1 on Intel
    machine, GCC-11.5.0 was used on Power10
  * `-O3` was used for all compilations
  * The keys were generated by `rand` calls
  * The keys were aligned to see a hashing speed better and to permit runs for Metro
  * Some people complaint that my comparison is unfair as most hash functions are not inlined
    * I believe that the interface is the part of the implementation.  So when
      the interface does not provide an easy way for inlining, it is an
      implementation pitfall
    * Still to address the complaints I added `-flto` for benchmarking all hash
      functions excluding MUM and VMUM.  This option makes cross-file inlining
* Here are graphs summarizing the measurements:

![AMD](./benchmarks/amd.png)

![INTEL](./benchmarks/intel.png)

![M4](./benchmarks/m4.png)

![Power10](./benchmarks/power10.png)

* Exact numbers are given in the last section

# SMhasher Speed Measurements

* SMhasher also measures hash speeds.  It uses the CPU cycle counter (__rtdc)
  * __rtdc-based measurements might be inaccurate for a small number of
    executed insns as the process can migrate, not all insns can
    retire, and CPU freq can be different.  That is why I prefer long
    running benchmarks
* Here are the results on AMD Ryzen 9900X for the fastest quality hashes
  (chosen according to SMhasher bulk speed results from https://github.com/rurban/smhasher)
* More GB/sec is better.  Less cycles/hash is better
* Some hashes are based on the use of x86\_64 AES insns and are less portable.
  They are marked by "Yes" in the AES column 
* The SLOC column gives the source code lines to implement the hash
  
| Hash            | AES  | Bulk Speed (256KB): GB/s |Av. Speed on keys (1-32 bytes): cycles/hash| SLOC|
|:----------------|:----:|-------------------------:|------------------------------------------:|----:|
|VMUM-V2          |  -   |  103.7                   | 16.4                                      |459  |
|VMUM-V1          |  -   |  143.5                   | 16.8                                      |459  |
|MUM-V4           |  -   |   28.6                   | 15.8                                      |291  |
|MUM-V3           |  -   |   40.4                   | 16.3                                      |291  |
|xxh3             |  -   |   66.6                   | 17.6                                      |965  |
|umash64          |  -   |   63.1                   | 25.4                                      |1097 |
|FarmHash32       |  -   |   39.8                   | 32.6                                      |1423 |
|wyhash           |  -   |   39.3                   | 18.3                                      | 194 |
|clhash           |  -   |   38.4                   | 51.7                                      | 366 |
|t1ha2\_atonce    |  -   |   34.7                   | 25.5                                      |2262 |
|t1ha0\_aes\_avx2 | Yes  |  128.9                   | 25.0                                      |2262 |
|gxhash64         | Yes  |  197.1                   | 27.9                                      | 274 |
|aesni            | Yes  |   38.7                   | 28.5                                      | 132 |


# Using cryptographic vs. non-cryptographic hash function
  * People worrying about denial attacks based on generating hash
    collisions started to use cryptographic hash functions in hash tables
  * Cryptographic functions are very slow
    * *sha1* is about 20-30 times slower than MUM and City on the bulk speed tests
    * The new fastest cryptographic hash function *SipHash* is up to 10
      times slower
  * MUM and VMUM are also *resistant* to preimage attack (finding a
    key with a given hash) 
    * To make hard moving to previous state values we use mostly 1-to-1 one way
      function `lo(x*C) + hi(x*C)` where C is a constant.  Brute force
      solution of equation `f(x) = a` probably requires `2^63` tries.
      Another used function equation `x ^ y = a` has a `2^64`
      solutions.  It complicates finding the overal solution further
  * If somebody is not convinced, you can use **randomly chosen
    multiplication constants** (see functions `mum_hash_randomize` and
    `vmum_hash_randomize`).
    Finding a key with a given hash even if you know a key with such
    a hash probably will be close to finding two or more solutions of
    *Diophantine* equations
  * If somebody is still not convinced, you can implement hash tables
    to **recognize the attack and rebuild** the table using the MUM function
    with the new multiplication constants
  * Analogous approach can be used if you use weak hash function as
    MurMur or City.  Instead of using cryptographic hash functions
    **all the time**, hash tables can be implemented to recognize the
    attack and rebuild the table and start using a cryptographic hash
    function
  * This approach solves the speed problem and permits us to switch easily to a new
    cryptographic hash function if a flaw is found in the old one, e.g., switching from
    SipHash to SHA2
  
# How to use [V]MUM
* Please just include file `[v]mum.h` into your C/C++ program and use the following functions:
  * optional `[v]mum_hash_randomize` for choosing multiplication constants randomly
  * `[v]mum_hash_init`, `[v]mum_hash_step`, and `[v]mum_hash_finish` for hashing complex data structures
  * `[v]mum_hash64` for hashing a 64-bit data
  * `[v]mum_hash` for hashing any continuous block of data
  * Compile `vmum.h` with other code using options switching on vector
    insns if necessary (e.g. -mavx2 for x86\_64)
* To compare MUM and VMUM speed with other hash functions on your machine go to
  the directory `benchmarks` and run a script `./bench.sh`
* The script will compile source files and run the tests printing the
  results as a markdown table

# Crypto-hash function MUM512
  * [V]MUM is not designed to be a crypto-hash
    * The key (seed) and state are only 64-bit which are not crypto-level ones
    * The result can be different for different targets (BE/LE
      machines, 32- and 64-bit machines) as for other hash functions, e.g. City (hash can be
      different on SSE4.2 nad non SSE4.2 targets) or Spooky (BE/LE machines)
      * If you need the same MUM hash independent on the target, please
        define macro `[V]MUM_TARGET_INDEPENDENT_HASH`.  Defining the
        macro affects the performace only on big-endian targets or
        targets without int128 support
  * There is a variant of MUM called MUM512 which can be a **candidate**
    for a crypto-hash function and keyed crypto-hash function and
    might be interesting for researchers
    * The **key** is **256**-bit
    * The **state** and the **output** are **512**-bit
    * The **block** size is **512**-bit
    * It uses 128x128->256-bit multiplication which is analogous to about
      64 shifts and additions for 128-bit block word instead of 80
      rounds of shifts, additions, logical operations for 512-bit block
      in sha2-512.
  * It is **only a candidate** for a crypto hash function
    * I did not make any differential crypto-analysis or investigated
      probabilities of different attacks on the hash function (sorry, it
      is too big job)
      * I might be do this in the future as I am interested in
        differential characteristics of the MUM512 base transformation
        step (128x128-bit multiplications with addition of high and
        low 128-bit parts)
      * I am also interested in the right choice of the multiplication constants
      * May be somebody will do the analysis.  I will be glad to hear anything.
        Who knows, may be it can be easily broken as Nimbus cipher.
    * The current code might be also vulnerable to timing attack on
      systems with varying multiplication instruction latency time.
      There is no code for now to prevent it
  * To compare the MUM512 speed with the speed of SHA-2 (SHA512) and
    SHA-3 (SHA3-512) go to the directory `benchmarks` and run a script `./bench-crypto.sh`
    * SHA-2 and SHA-3 code is taken from [RHash](https://github.com/rhash/RHash.git)
  * Blake2 crypto-hash from [github.com/BLAKE2/BLAKE2](https://github.com/BLAKE2/BLAKE2)
    was added for comparison.  I use sse version of 64-bit Blake2 (blake2b).
  * Here is the speed of the crypto hash functions on AMD 9900X:

|                        | MUM512 | SHA2  |  SHA3  | Blake2B|
:------------------------|-------:|------:|-------:|-------:|
10 bytes (20 M texts)    | 0.27s  | 0.27s |  0.44s |  0.81s |
100 bytes (20 M texts)   | 0.36s  | 0.25s |  0.84s |  0.84s |
1000 bytes (20 M texts)  | 1.21s  | 2.08s | 5.63s  |  3.70s |
10000 bytes (5 M texts)  | 5.60s  | 5.05s | 14.07s |  7.99s |

# Pseudo-random generators
  * Files `mum-prng.h` and `mum512-prng.h` provide pseudo-random
    functions based on MUM and MUM512 hash functions
  * All PRNGs passed *NIST Statistical Test Suite for Random and
    Pseudorandom Number Generators for Cryptographic Applications*
    (version 2.2.1) with 1000 bitstreams each containing 1M bits
    * Although MUM PRNG passed the test, it is not a cryptographically
      secure PRNG as is the hash function used for it
  * To compare the PRNG speeds go to
    the directory `benchmarks` and run a script `./bench-prng.sh`
  * For the comparison I wrote crypto-secured Blum Blum Shub PRNG
    (file `bbs-prng.h`) and PRNGs based on fast cryto-level hash
    functions in ChaCha stream cipher (file `chacha-prng.h`) and
    SipHash24 (file `sip24-prng.h`).
    * The additional PRNGs also pass the Statistical Test Suite
  * For the comparison I also added the fastest PRNGs
    * [xoroshiro128+](http://xoroshiro.di.unimi.it/xoroshiro128plus.c)
    * [xoroshiro128**](http://xoroshiro.di.unimi.it/xoroshiro128starstar.c)
    * [xoshiro256+](http://xoroshiro.di.unimi.it/xoshiro256plus.c)
    * [xoshiro256**](http://xoroshiro.di.unimi.it/xoshiro256starstar.c)
    * [xoshiro512**](http://xoroshiro.di.unimi.it/xoshiro512starstar.c)
    * As recommended the first numbers generated by splitmix64 were used as a seed
  * I had no intention to tune MUM based PRNG first but
    after adding xoroshiro128+ and finding how fast it is, I've decided
    to speedup MUM PRNG
    * I added code to calculate a few PRNs at once to calculate them in parallel
    * I added AVX2 version functions to use the faster `MULX` instruction
    * The new version also passed NIST Statistical Test Suite.  It was
      tested even on bigger data (10K bitstreams each containing 10M
      bits).  The test took several days on i7-4790K
    * The new version is **almost 2 times** faster than the old one and MUM PRN
      speed became almost the same as xoroshiro/xoshiro ones
      * All xoroshiro/xoshiro and MUM PRNG functions are inlined in the benchmark program
      * Both code without inlining will be visibly slower and the speed
        difference will be negligible as one PRN calculation takes
        only about **3-4 machine cycle** for xoroshiro/xoshiro and MUM PRN.
  * **Update Nov.2 2019**: I found that MUM PRNG fails practrand on 512GB.  So I modified it.
    Instead of basically 16 independent PRNGs with 64-bit state, I made it one PRNG with 1024-bit state.
    I also managed to speed up MUM PRNG by 15%.
  * All PRNG were tested by [practrand](http://pracrand.sourceforge.net/) with
    4TB PRNG generated stream (it took a few days)
      * **GLIBC RAND, xoroshiro128+, xoshiro256+, and xoshiro512+ failed** on the first stages of practrand
      * The rest of the PRNGs passed
      * BBS PRNG was tested by only 64GB stream because it is too slow
  * Here is the speed of the PRNGs in millions generated PRNs
    per second:

|  M prns/sec  | AMD 9900X   |Intel i5-1360K| Apple M4    | Power10  |
:--------------|------------:|-------------:|------------:|---------:|
BBS            | 0.0886      | 0.0827       | 0.122       | 0.021    |
ChaCha         | 357.68      | 184.80       | 262.81      |  83.20   |
SipHash24      | 702.10      | 567.43       | 760.13      | 231.48   |
MUM512         |  91.54      | 179.62       | 268.04      |  44.28   |
MUM            |1947.27      |1620.65       |2263.68      | 694.42   |
XOSHIRO128**   |1797.02      |1386.87       |1095.37      | 477.67   |
XOSHIRO256**   |1866.35      |1364.85       |1466.15      | 607.65   |
XOSHIRO512**   |1663.86      |1235.15       |1423.90      | 631.90   |
GLIBC RAND     | 115.57      | 101.48       | 228.99      |  33.66   |
XOROSHIRO128+  |1786.62      |1299.59       |1296.48      | 549.85   |
XOSHIRO256+    |2321.99      |1720.67       |1690.96      | 711.41   |
XOSHIRO512+    |1808.81      |1525.18       |1659.76      | 717.12   |

# Table results for hash speed measurements
* Here are table variants of my measurements for people wanting the
  exact numbers.  The tables also contain time spent for hashing.
  
* AMD Ryzen 9900X:

| Length    |  VMUM-V2  |  VMUM-V1  |  MUM-V4   |  MUM-V3   |  Spooky   |   City    |  xxHash3  |   t1ha2   | SipHash24 |   Metro   |
|:----------|:---------:|:---------:|:---------:|:---------:|:---------:|:---------:|:---------:|:---------:|:---------:|:---------:|
|   3 bytes |1.00  4.57s|0.98  4.67s|0.98  4.64s|0.97  4.73s|0.76  6.01s|0.60  7.61s|0.94  4.84s|0.61  7.47s|0.61  7.51s|0.69  6.67s|
|   4 bytes |1.00  2.77s|1.00  2.78s|1.09  2.55s|1.09  2.55s|0.55  5.08s|0.39  7.15s|0.71  3.92s|0.69  4.03s|0.44  6.24s|0.75  3.69s|
|   5 bytes |1.00  4.63s|1.00  4.64s|1.00  4.62s|1.00  4.63s|0.80  5.78s|0.65  7.07s|0.88  5.28s|0.62  7.41s|0.61  7.59s|0.88  5.24s|
|   6 bytes |1.00  4.57s|1.00  4.56s|1.00  4.57s|1.00  4.56s|0.77  5.93s|0.65  7.06s|0.87  5.24s|0.62  7.38s|0.58  7.87s|0.87  5.24s|
|   7 bytes |1.00  4.84s|1.01  4.80s|1.01  4.80s|1.01  4.79s|0.79  6.15s|0.69  7.06s|0.92  5.24s|0.66  7.38s|0.60  8.10s|0.76  6.38s|
|   8 bytes |1.00  2.74s|1.00  2.74s|1.09  2.51s|1.09  2.51s|0.54  5.03s|0.39  7.06s|0.52  5.24s|0.69  3.97s|0.33  8.29s|0.75  3.67s|
|   9 bytes |1.00  3.01s|1.08  2.78s|1.07  2.82s|1.06  2.83s|0.59  5.06s|0.27 11.03s|0.45  6.66s|0.41  7.37s|0.36  8.29s|0.60  5.04s|
|  10 bytes |1.00  3.01s|1.08  2.78s|1.08  2.79s|1.04  2.89s|0.59  5.08s|0.27 11.02s|0.45  6.66s|0.41  7.36s|0.36  8.34s|0.60  5.05s|
|  11 bytes |1.00  3.01s|1.08  2.79s|1.08  2.79s|1.08  2.78s|0.59  5.08s|0.27 11.04s|0.45  6.67s|0.41  7.37s|0.36  8.32s|0.49  6.20s|
|  12 bytes |1.00  3.01s|1.09  2.77s|1.07  2.81s|1.07  2.82s|0.59  5.06s|0.27 11.03s|0.45  6.66s|0.41  7.35s|0.36  8.30s|0.60  5.02s|
|  13 bytes |1.00  2.98s|1.08  2.77s|1.05  2.83s|1.08  2.77s|0.59  5.02s|0.27 10.94s|0.45  6.61s|0.41  7.28s|0.36  8.21s|0.48  6.15s|
|  14 bytes |1.00  2.96s|1.08  2.74s|1.08  2.75s|1.08  2.74s|0.59  5.01s|0.27 10.95s|0.45  6.60s|0.41  7.29s|0.36  8.21s|0.48  6.16s|
|  15 bytes |1.00  2.98s|1.09  2.74s|1.08  2.77s|1.06  2.80s|0.59  5.01s|0.27 10.93s|0.45  6.61s|0.41  7.29s|0.36  8.21s|0.41  7.28s|
|  16 bytes |1.00  2.98s|1.09  2.74s|1.09  2.73s|1.09  2.73s|0.27 10.94s|0.41  7.28s|0.93  3.19s|0.59  5.08s|0.29 10.31s|0.62  4.78s|
|  32 bytes |1.00  3.28s|1.08  3.05s|1.00  3.27s|0.98  3.34s|0.30 10.95s|0.40  8.19s|1.03  3.19s|0.44  7.50s|0.23 14.39s|0.33  9.82s|
|  64 bytes |1.00  3.89s|1.09  3.58s|0.87  4.47s|0.85  4.58s|0.23 16.63s|0.47  8.36s|1.22  3.19s|0.68  5.69s|0.17 22.59s|0.37 10.49s|
|  96 bytes |1.00  4.55s|1.08  4.20s|0.79  5.79s|0.78  5.83s|0.20 22.31s|0.36 12.54s|1.43  3.19s|0.66  6.88s|0.15 31.11s|0.40 11.27s|
| 128 bytes |1.00  6.32s|1.40  4.52s|0.91  6.92s|0.90  7.06s|0.23 27.99s|0.50 12.54s|1.98  3.19s|0.83  7.57s|0.16 39.35s|0.53 11.85s|
| 192 bytes |1.00  8.55s|1.29  6.63s|0.90  9.46s|0.99  8.65s|0.36 23.81s|0.59 14.48s|1.25  6.83s|0.88  9.74s|0.15 55.96s|0.65 13.21s|
| 256 bytes |1.00 10.98s|1.38  7.95s|0.92 11.98s|1.06 10.32s|0.45 24.22s|0.66 16.71s|0.79 13.86s|0.92 11.89s|0.15 74.12s|0.75 14.57s|
| 512 bytes |1.00 14.42s|1.12 12.91s|0.65 22.33s|0.79 18.16s|0.41 35.30s|0.53 26.98s|0.91 15.92s|0.69 21.04s|0.10 140.39s|0.63 22.76s|
|1024 bytes |1.00 17.13s|1.09 15.75s|0.37 46.54s|0.50 34.26s|0.32 53.76s|0.38 45.34s|0.88 19.50s|0.44 38.78s|0.06 272.94s|0.44 39.07s|
| Bulk      |1.00  1.70s|1.36  1.25s|0.31  5.57s|0.44  3.85s|0.33  5.13s|0.34  4.94s|1.18  1.44s|0.37  4.62s|0.05 33.88s|0.40  4.26s|
| Average   |1.00       |1.11       |0.93       |0.96       |0.50       |0.43       |0.85       |0.58       |0.31       |0.59       |
| Geomean   |1.00       |1.10       |0.90       |0.93       |0.46       |0.41       |0.77       |0.55       |0.26       |0.56       |

* Intel i5-13600K:

| Length    |  VMUM-V2  |  VMUM-V1  |  MUM-V4   |  MUM-V3   |  Spooky   |   City    |  xxHash3  |   t1ha2   | SipHash24 |   Metro   |
|:----------|:---------:|:---------:|:---------:|:---------:|:---------:|:---------:|:---------:|:---------:|:---------:|:---------:|
|   3 bytes |1.00  4.67s|1.02  4.59s|1.03  4.53s|1.03  4.53s|0.67  6.94s|0.58  8.05s|0.93  5.03s|0.50  9.31s|0.47  9.93s|0.69  6.79s|
|   4 bytes |1.00  4.27s|1.00  4.27s|1.06  4.02s|1.06  4.02s|0.73  5.86s|0.51  8.37s|0.75  5.70s|0.77  5.58s|0.52  8.17s|0.81  5.28s|
|   5 bytes |1.00  4.66s|1.02  4.59s|1.03  4.53s|1.03  4.53s|0.73  6.37s|0.57  8.17s|0.82  5.70s|0.50  9.31s|0.44 10.49s|0.69  6.79s|
|   6 bytes |1.00  4.67s|1.02  4.56s|1.03  4.53s|1.03  4.53s|0.69  6.76s|0.57  8.17s|0.82  5.69s|0.50  9.31s|0.43 10.95s|0.69  6.79s|
|   7 bytes |1.00  4.87s|1.00  4.89s|1.01  4.83s|1.01  4.83s|0.69  7.02s|0.60  8.17s|0.85  5.70s|0.52  9.31s|0.45 10.89s|0.69  7.04s|
|   8 bytes |1.00  4.27s|1.00  4.28s|1.55  2.76s|1.55  2.76s|0.76  5.60s|0.53  8.08s|0.75  5.69s|0.99  4.30s|0.39 10.88s|1.06  4.02s|
|   9 bytes |1.00  4.53s|1.06  4.29s|1.50  3.02s|1.50  3.01s|0.81  5.60s|0.33 13.59s|0.53  8.58s|0.48  9.51s|0.42 10.88s|0.82  5.53s|
|  10 bytes |1.00  4.54s|1.06  4.29s|1.50  3.02s|1.51  3.01s|0.81  5.60s|0.33 13.59s|0.53  8.58s|0.48  9.51s|0.42 10.88s|0.82  5.53s|
|  11 bytes |1.00  4.52s|1.06  4.27s|1.50  3.02s|1.50  3.02s|0.81  5.60s|0.33 13.59s|0.53  8.58s|0.48  9.51s|0.42 10.88s|0.67  6.79s|
|  12 bytes |1.00  4.54s|1.06  4.29s|1.50  3.02s|1.51  3.01s|0.81  5.60s|0.33 13.59s|0.53  8.58s|0.48  9.51s|0.42 10.88s|0.82  5.53s|
|  13 bytes |1.00  4.52s|1.06  4.28s|1.49  3.03s|1.50  3.02s|0.81  5.60s|0.33 13.59s|0.53  8.59s|0.48  9.51s|0.42 10.88s|0.67  6.79s|
|  14 bytes |1.00  4.52s|1.06  4.27s|1.49  3.03s|1.50  3.02s|0.81  5.60s|0.33 13.59s|0.53  8.59s|0.48  9.51s|0.42 10.88s|0.67  6.79s|
|  15 bytes |1.00  4.53s|1.06  4.29s|1.50  3.02s|1.50  3.02s|0.81  5.60s|0.33 13.59s|0.53  8.58s|0.48  9.51s|0.42 10.88s|0.56  8.05s|
|  16 bytes |1.00  4.52s|1.06  4.28s|1.50  3.02s|1.50  3.01s|0.37 12.13s|0.56  8.05s|0.89  5.07s|0.85  5.29s|0.34 13.43s|0.83  5.46s|
|  32 bytes |1.00  4.79s|1.05  4.58s|1.30  3.69s|1.33  3.59s|0.39 12.38s|0.51  9.39s|0.94  5.10s|0.67  7.15s|0.25 18.92s|0.43 11.07s|
|  64 bytes |1.00  5.46s|1.06  5.15s|1.14  4.78s|1.11  4.91s|0.29 18.66s|0.58  9.36s|1.06  5.13s|0.88  6.22s|0.17 31.57s|0.46 11.83s|
|  96 bytes |1.00  6.43s|1.10  5.83s|0.84  7.68s|0.84  7.67s|0.26 25.17s|0.46 13.88s|1.23  5.23s|0.85  7.60s|0.15 42.71s|0.51 12.70s|
| 128 bytes |1.00  7.92s|1.25  6.36s|0.84  9.42s|0.86  9.19s|0.25 31.62s|0.57 13.87s|1.51  5.24s|0.91  8.67s|0.15 53.88s|0.59 13.51s|
| 192 bytes |1.00 11.52s|1.43  8.06s|1.05 11.02s|1.08 10.68s|0.39 29.49s|0.71 16.25s|1.23  9.34s|1.03 11.18s|0.15 76.23s|0.76 15.07s|
| 256 bytes |1.00 14.26s|1.60  8.89s|1.04 13.65s|1.18 12.11s|0.48 29.86s|0.75 19.06s|0.91 15.64s|1.03 13.82s|0.15 97.67s|0.85 16.68s|
| 512 bytes |1.00 15.93s|1.04 15.31s|0.62 25.67s|0.81 19.65s|0.35 45.39s|0.46 34.70s|0.96 16.68s|0.58 27.38s|0.09 186.04s|0.53 29.86s|
|1024 bytes |1.00 20.96s|1.08 19.32s|0.42 49.61s|0.56 37.74s|0.29 71.66s|0.34 61.75s|0.84 24.92s|0.42 49.86s|0.06 362.59s|0.36 58.67s|
| Bulk      |1.00  3.13s|1.09  2.86s|0.40  7.77s|0.61  5.17s|0.40  7.78s|0.42  7.37s|0.87  3.60s|0.49  6.38s|0.07 45.99s|0.45  6.88s|
| Average   |1.00       |1.10       |1.15       |1.18       |0.58       |0.48       |0.83       |0.65       |0.31       |0.67       |
| Geomean   |1.00       |1.09       |1.08       |1.13       |0.54       |0.46       |0.79       |0.62       |0.26       |0.65       |

* Apple M4:

| Length    |  VMUM-V2  |  VMUM-V1  |  MUM-V4   |  MUM-V3   |  Spooky   |   City    |  xxHash3  |   t1ha2   | SipHash24 |   Metro   |
|:----------|:---------:|:---------:|:---------:|:---------:|:---------:|:---------:|:---------:|:---------:|:---------:|:---------:|
|   3 bytes |1.00  5.15s|1.02  5.03s|1.03  5.02s|1.02  5.03s|0.69  7.50s|0.52  9.91s|0.87  5.92s|1.08  4.77s|0.51 10.01s|0.63  8.17s|
|   4 bytes |1.00  4.70s|1.01  4.66s|1.08  4.36s|1.08  4.36s|0.69  6.78s|0.49  9.64s|0.70  6.71s|0.99  4.77s|0.52  9.03s|0.73  6.41s|
|   5 bytes |1.00  5.05s|1.00  5.03s|1.01  5.01s|1.01  5.01s|0.71  7.08s|0.52  9.64s|0.74  6.78s|1.06  4.77s|0.49 10.38s|0.62  8.17s|
|   6 bytes |1.00  5.15s|1.02  5.03s|1.04  4.95s|1.04  4.95s|0.69  7.49s|0.53  9.64s|0.76  6.78s|1.08  4.76s|0.49 10.51s|0.63  8.17s|
|   7 bytes |1.00  5.52s|1.03  5.34s|1.04  5.32s|1.04  5.32s|0.71  7.82s|0.57  9.64s|0.81  6.79s|1.16  4.76s|0.51 10.77s|0.65  8.46s|
|   8 bytes |1.00  3.07s|1.02  3.02s|1.16  2.65s|1.16  2.65s|0.47  6.49s|0.32  9.64s|0.46  6.71s|0.68  4.53s|0.26 11.69s|0.66  4.66s|
|   9 bytes |1.00  3.26s|1.07  3.04s|1.03  3.17s|1.03  3.18s|0.50  6.48s|0.27 11.96s|0.56  5.85s|0.55  5.98s|0.28 11.69s|0.51  6.41s|
|  10 bytes |1.00  3.27s|1.08  3.03s|1.08  3.02s|1.08  3.02s|0.50  6.48s|0.27 11.96s|0.56  5.85s|0.55  5.98s|0.28 11.69s|0.51  6.41s|
|  11 bytes |1.00  3.38s|1.10  3.07s|1.11  3.05s|1.07  3.15s|0.52  6.48s|0.28 11.96s|0.58  5.85s|0.57  5.98s|0.29 11.69s|0.43  7.87s|
|  12 bytes |1.00  3.39s|1.08  3.13s|1.13  3.00s|1.13  3.00s|0.52  6.48s|0.28 11.96s|0.58  5.85s|0.57  5.98s|0.29 11.69s|0.53  6.41s|
|  13 bytes |1.00  3.33s|1.08  3.08s|1.10  3.04s|1.10  3.04s|0.51  6.48s|0.28 11.95s|0.57  5.85s|0.56  5.98s|0.28 11.69s|0.42  7.87s|
|  14 bytes |1.00  3.31s|1.09  3.05s|1.10  3.02s|1.09  3.03s|0.51  6.48s|0.28 11.96s|0.56  5.86s|0.55  5.98s|0.28 11.69s|0.42  7.87s|
|  15 bytes |1.00  3.32s|1.06  3.12s|1.08  3.07s|1.08  3.07s|0.51  6.48s|0.28 11.96s|0.57  5.85s|0.56  5.98s|0.28 11.69s|0.36  9.33s|
|  16 bytes |1.00  3.30s|1.10  3.00s|1.07  3.07s|1.07  3.08s|0.23 14.08s|0.35  9.33s|0.85  3.87s|0.55  5.98s|0.23 14.58s|0.54  6.12s|
|  32 bytes |1.00  3.66s|1.07  3.42s|1.01  3.64s|1.00  3.65s|0.26 14.07s|0.35 10.51s|0.96  3.80s|0.41  9.03s|0.18 20.66s|0.29 12.57s|
|  64 bytes |1.00  4.42s|1.09  4.07s|0.89  4.99s|0.89  4.99s|0.21 21.37s|0.41 10.84s|1.17  3.79s|0.62  7.11s|0.13 33.90s|0.33 13.44s|
|  96 bytes |1.00  5.16s|1.07  4.82s|0.82  6.27s|0.84  6.17s|0.18 28.70s|0.32 16.19s|1.36  3.80s|0.60  8.57s|0.11 45.54s|0.36 14.34s|
| 128 bytes |1.00  6.87s|1.13  6.08s|0.91  7.55s|0.93  7.40s|0.19 35.99s|0.42 16.19s|1.81  3.80s|0.72  9.53s|0.12 59.49s|0.45 15.22s|
| 192 bytes |1.00  8.65s|1.12  7.69s|0.82 10.60s|0.88  9.86s|0.27 31.55s|0.46 18.64s|0.88  9.86s|0.71 12.16s|0.10 84.13s|0.51 16.99s|
| 256 bytes |1.00  9.39s|1.30  7.20s|0.71 13.24s|0.76 12.29s|0.29 32.11s|0.44 21.28s|0.71 13.19s|0.63 14.87s|0.09 106.82s|0.50 18.77s|
| 512 bytes |1.00 14.79s|1.07 13.76s|0.69 21.33s|0.92 15.99s|0.29 50.41s|0.40 37.15s|0.91 16.28s|0.49 30.25s|0.07 204.80s|0.46 31.88s|
|1024 bytes |1.00 27.83s|1.56 17.79s|0.65 43.07s|1.04 26.70s|0.35 78.46s|0.44 62.77s|1.14 24.39s|0.51 54.28s|0.07 399.61s|0.55 50.90s|
| Bulk      |1.00  3.45s|1.39  2.49s|0.58  6.00s|1.13  3.06s|0.44  7.83s|0.51  6.70s|1.19  2.89s|0.54  6.36s|0.07 50.68s|0.67  5.13s|
| Average   |1.00       |1.11       |0.96       |1.02       |0.45       |0.39       |0.84       |0.68       |0.26       |0.51       |
| Geomean   |1.00       |1.10       |0.95       |1.01       |0.41       |0.38       |0.79       |0.66       |0.21       |0.50       |

* IBM Power10:

| Length    |  VMUM-V2  |  VMUM-V1  |  MUM-V4   |  MUM-V3   |  Spooky   |   City    |  xxHash3  |   t1ha2   | SipHash24 |   Metro   |
|:----------|:---------:|:---------:|:---------:|:---------:|:---------:|:---------:|:---------:|:---------:|:---------:|:---------:|
|   3 bytes |1.00 11.52s|1.00 11.53s|1.03 11.18s|1.03 11.21s|0.68 16.87s|0.61 18.95s|0.95 12.16s|1.09 10.57s|0.52 22.13s|0.66 17.35s|
|   4 bytes |1.00 10.86s|1.00 10.87s|1.07 10.19s|1.07 10.19s|0.72 15.18s|0.54 20.22s|0.89 12.27s|1.03 10.58s|0.51 21.13s|0.88 12.40s|
|   5 bytes |1.00 11.53s|0.98 11.73s|1.03 11.17s|1.03 11.17s|0.71 16.24s|0.55 20.91s|0.90 12.76s|1.09 10.58s|0.49 23.74s|0.66 17.35s|
|   6 bytes |1.00 11.52s|0.98 11.73s|1.03 11.17s|1.03 11.18s|0.68 16.87s|0.55 20.92s|0.90 12.76s|1.09 10.58s|0.48 23.91s|0.66 17.36s|
|   7 bytes |1.00 12.23s|1.02 11.96s|0.98 12.51s|1.03 11.84s|0.69 17.69s|0.58 20.92s|0.96 12.76s|1.16 10.56s|0.50 24.38s|0.56 22.01s|
|   8 bytes |1.00 10.85s|1.00 10.86s|1.06 10.19s|1.07 10.18s|0.76 14.27s|0.52 20.92s|0.85 12.75s|1.05 10.32s|0.37 29.14s|1.10  9.86s|
|   9 bytes |1.00 11.54s|1.06 10.87s|1.06 10.85s|1.06 10.85s|0.79 14.55s|0.40 28.92s|0.72 16.11s|0.84 13.73s|0.40 29.08s|0.84 13.80s|
|  10 bytes |1.00 11.53s|1.06 10.87s|1.06 10.85s|1.06 10.85s|0.79 14.54s|0.40 28.92s|0.72 16.10s|0.84 13.72s|0.40 29.17s|0.84 13.79s|
|  11 bytes |1.00 11.22s|1.03 10.87s|1.03 10.85s|1.03 10.85s|0.77 14.55s|0.39 28.90s|0.70 16.11s|0.82 13.72s|0.38 29.21s|0.66 17.08s|
|  12 bytes |1.00 11.53s|1.06 10.86s|1.06 10.86s|1.06 10.85s|0.79 14.54s|0.40 28.92s|0.72 16.11s|0.84 13.72s|0.40 29.13s|0.84 13.80s|
|  13 bytes |1.00 11.23s|1.03 10.87s|1.04 10.85s|1.04 10.85s|0.77 14.56s|0.39 28.91s|0.70 16.11s|0.82 13.72s|0.39 29.14s|0.66 17.09s|
|  14 bytes |1.00 11.23s|1.03 10.88s|1.03 10.87s|1.04 10.84s|0.77 14.55s|0.39 28.92s|0.70 16.11s|0.82 13.71s|0.38 29.20s|0.66 17.09s|
|  15 bytes |1.00 11.53s|1.06 10.88s|1.06 10.89s|1.06 10.89s|0.79 14.56s|0.40 28.91s|0.72 16.11s|0.84 13.72s|0.40 29.17s|0.57 20.38s|
|  16 bytes |1.00 12.20s|1.12 10.91s|1.12 10.85s|1.12 10.89s|0.44 27.70s|0.61 20.05s|1.36  8.96s|0.89 13.72s|0.31 39.95s|1.00 12.16s|
|  32 bytes |1.00 12.92s|1.11 11.59s|1.06 12.22s|1.06 12.22s|0.45 28.63s|0.58 22.34s|1.57  8.23s|0.64 20.32s|0.24 54.90s|0.52 24.97s|
|  64 bytes |1.00 14.54s|1.11 13.09s|0.97 15.02s|0.96 15.07s|0.35 41.29s|0.62 23.31s|1.70  8.54s|0.90 16.20s|0.16 88.17s|0.54 26.78s|
|  96 bytes |1.00 15.93s|1.13 14.07s|0.82 19.44s|0.96 16.64s|0.29 54.32s|0.45 35.33s|1.93  8.24s|0.81 19.57s|0.13 119.66s|0.55 28.81s|
| 128 bytes |1.00 16.71s|1.08 15.47s|0.75 22.21s|0.86 19.52s|0.24 68.72s|0.47 35.31s|2.03  8.22s|0.77 21.78s|0.11 156.55s|0.54 30.78s|
| 192 bytes |1.00 21.98s|1.16 18.99s|0.83 26.51s|0.83 26.60s|0.36 60.97s|0.53 41.32s|0.76 28.78s|0.76 29.01s|0.11 195.56s|0.62 35.17s|
| 256 bytes |1.00 22.31s|1.25 17.89s|0.73 30.47s|0.76 29.34s|0.36 62.61s|0.47 47.41s|0.67 33.50s|0.64 35.02s|0.09 252.22s|0.57 39.10s|
| 512 bytes |1.00 33.65s|1.19 28.16s|0.71 47.60s|0.74 45.60s|0.33 100.76s|0.46 73.34s|0.73 46.38s|0.56 60.20s|0.07 483.42s|0.60 55.93s|
|1024 bytes |1.00 56.70s|1.41 40.15s|0.69 81.61s|0.73 78.10s|0.34 167.11s|0.45 126.45s|0.80 70.61s|0.49 116.16s|0.06 944.76s|0.45 127.01s|
| Bulk      |1.00  6.75s|1.42  4.75s|0.70  9.62s|0.67 10.01s|0.38 17.80s|0.49 13.74s|0.93  7.27s|0.49 13.91s|0.06 118.69s|0.46 14.52s|
| Average   |1.00       |1.10       |0.95       |0.97       |0.58       |0.49       |1.00       |0.84       |0.30       |0.67       |
| Geomean   |1.00       |1.09       |0.94       |0.96       |0.54       |0.48       |0.93       |0.82       |0.24       |0.65       |


================================================
FILE: benchmarks/City.cpp
================================================
// Copyright (c) 2011 Google, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
//
// CityHash, by Geoff Pike and Jyrki Alakuijala
//
// This file provides CityHash64() and related functions.
//
// It's probably possible to create even faster hash functions by
// writing a program that systematically explores some of the space of
// possible hash functions, by using SIMD instructions, or by
// compromising on hash quality.

#include "City.h"

#include <algorithm>
#include <string.h>  // for memcpy and memset

using namespace std;

static uint64 UNALIGNED_LOAD64(const char *p) {
  uint64 result;
  memcpy(&result, p, sizeof(result));
  return result;
}

static uint32 UNALIGNED_LOAD32(const char *p) {
  uint32 result;
  memcpy(&result, p, sizeof(result));
  return result;
}

#ifndef __BIG_ENDIAN__

#define uint32_in_expected_order(x) (x)
#define uint64_in_expected_order(x) (x)

#else

#ifdef _MSC_VER
#include <stdlib.h>
#define bswap_32(x) _byteswap_ulong(x)
#define bswap_64(x) _byteswap_uint64(x)

#elif defined(__APPLE__)
// Mac OS X / Darwin features
#include <libkern/OSByteOrder.h>
#define bswap_32(x) OSSwapInt32(x)
#define bswap_64(x) OSSwapInt64(x)

#else
#include <byteswap.h>
#endif

#define uint32_in_expected_order(x) (bswap_32(x))
#define uint64_in_expected_order(x) (bswap_64(x))

#endif  // __BIG_ENDIAN__

#if !defined(LIKELY)
#if defined(__GNUC__) || defined(__INTEL_COMPILER)
#define LIKELY(x) (__builtin_expect(!!(x), 1))
#else
#define LIKELY(x) (x)
#endif
#endif

static uint64 Fetch64(const char *p) {
  return uint64_in_expected_order(UNALIGNED_LOAD64(p));
}

static uint32 Fetch32(const char *p) {
  return uint32_in_expected_order(UNALIGNED_LOAD32(p));
}

// Some primes between 2^63 and 2^64 for various uses.
static const uint64 k0 = 0xc3a5c85c97cb3127ULL;
static const uint64 k1 = 0xb492b66fbe98f273ULL;
static const uint64 k2 = 0x9ae16a3b2f90404fULL;
static const uint64 k3 = 0xc949d7c7509e6557ULL;

// Bitwise right rotate.  Normally this will compile to a single
// instruction, especially if the shift is a manifest constant.
static uint64 Rotate(uint64 val, int shift) {
  // Avoid shifting by 64: doing so yields an undefined result.
  return shift == 0 ? val : ((val >> shift) | (val << (64 - shift)));
}

// Equivalent to Rotate(), but requires the second arg to be non-zero.
// On x86-64, and probably others, it's possible for this to compile
// to a single instruction if both args are already in registers.
static uint64 RotateByAtLeast1(uint64 val, int shift) {
  return (val >> shift) | (val << (64 - shift));
}

static uint64 ShiftMix(uint64 val) {
  return val ^ (val >> 47);
}

static uint64 HashLen16(uint64 u, uint64 v) {
  return Hash128to64(uint128(u, v));
}

static uint64 HashLen0to16(const char *s, size_t len) {
  if (len > 8) {
    uint64 a = Fetch64(s);
    uint64 b = Fetch64(s + len - 8);
    return HashLen16(a, RotateByAtLeast1(b + len, len)) ^ b;
  }
  if (len >= 4) {
    uint64 a = Fetch32(s);
    return HashLen16(len + (a << 3), Fetch32(s + len - 4));
  }
  if (len > 0) {
    uint8 a = s[0];
    uint8 b = s[len >> 1];
    uint8 c = s[len - 1];
    uint32 y = static_cast<uint32>(a) + (static_cast<uint32>(b) << 8);
    uint32 z = len + (static_cast<uint32>(c) << 2);
    return ShiftMix(y * k2 ^ z * k3) * k2;
  }
  return k2;
}

// This probably works well for 16-byte strings as well, but it may be overkill
// in that case.
static uint64 HashLen17to32(const char *s, size_t len) {
  uint64 a = Fetch64(s) * k1;
  uint64 b = Fetch64(s + 8);
  uint64 c = Fetch64(s + len - 8) * k2;
  uint64 d = Fetch64(s + len - 16) * k0;
  return HashLen16(Rotate(a - b, 43) + Rotate(c, 30) + d,
                   a + Rotate(b ^ k3, 20) - c + len);
}

// Return a 16-byte hash for 48 bytes.  Quick and dirty.
// Callers do best to use "random-looking" values for a and b.
static pair<uint64, uint64> WeakHashLen32WithSeeds(
    uint64 w, uint64 x, uint64 y, uint64 z, uint64 a, uint64 b) {
  a += w;
  b = Rotate(b + a + z, 21);
  uint64 c = a;
  a += x;
  a += y;
  b += Rotate(a, 44);
  return make_pair(a + z, b + c);
}

// Return a 16-byte hash for s[0] ... s[31], a, and b.  Quick and dirty.
static pair<uint64, uint64> WeakHashLen32WithSeeds(
    const char* s, uint64 a, uint64 b) {
  return WeakHashLen32WithSeeds(Fetch64(s),
                                Fetch64(s + 8),
                                Fetch64(s + 16),
                                Fetch64(s + 24),
                                a,
                                b);
}

// Return an 8-byte hash for 33 to 64 bytes.
static uint64 HashLen33to64(const char *s, size_t len) {
  uint64 z = Fetch64(s + 24);
  uint64 a = Fetch64(s) + (len + Fetch64(s + len - 16)) * k0;
  uint64 b = Rotate(a + z, 52);
  uint64 c = Rotate(a, 37);
  a += Fetch64(s + 8);
  c += Rotate(a, 7);
  a += Fetch64(s + 16);
  uint64 vf = a + z;
  uint64 vs = b + Rotate(a, 31) + c;
  a = Fetch64(s + 16) + Fetch64(s + len - 32);
  z = Fetch64(s + len - 8);
  b = Rotate(a + z, 52);
  c = Rotate(a, 37);
  a += Fetch64(s + len - 24);
  c += Rotate(a, 7);
  a += Fetch64(s + len - 16);
  uint64 wf = a + z;
  uint64 ws = b + Rotate(a, 31) + c;
  uint64 r = ShiftMix((vf + ws) * k2 + (wf + vs) * k0);
  return ShiftMix(r * k0 + vs) * k2;
}

uint64 CityHash64(const char *s, size_t len) {
  if (len <= 32) {
    if (len <= 16) {
      return HashLen0to16(s, len);
    } else {
      return HashLen17to32(s, len);
    }
  } else if (len <= 64) {
    return HashLen33to64(s, len);
  }

  // For strings over 64 bytes we hash the end first, and then as we
  // loop we keep 56 bytes of state: v, w, x, y, and z.
  uint64 x = Fetch64(s + len - 40);
  uint64 y = Fetch64(s + len - 16) + Fetch64(s + len - 56);
  uint64 z = HashLen16(Fetch64(s + len - 48) + len, Fetch64(s + len - 24));
  pair<uint64, uint64> v = WeakHashLen32WithSeeds(s + len - 64, len, z);
  pair<uint64, uint64> w = WeakHashLen32WithSeeds(s + len - 32, y + k1, x);
  x = x * k1 + Fetch64(s);

  // Decrease len to the nearest multiple of 64, and operate on 64-byte chunks.
  len = (len - 1) & ~static_cast<size_t>(63);
  do {
    x = Rotate(x + y + v.first + Fetch64(s + 8), 37) * k1;
    y = Rotate(y + v.second + Fetch64(s + 48), 42) * k1;
    x ^= w.second;
    y += v.first + Fetch64(s + 40);
    z = Rotate(z + w.first, 33) * k1;
    v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first);
    w = WeakHashLen32WithSeeds(s + 32, z + w.second, y + Fetch64(s + 16));
    std::swap(z, x);
    s += 64;
    len -= 64;
  } while (len != 0);
  return HashLen16(HashLen16(v.first, w.first) + ShiftMix(y) * k1 + z,
                   HashLen16(v.second, w.second) + x);
}

uint64 CityHash64WithSeed(const char *s, size_t len, uint64 seed) {
  return CityHash64WithSeeds(s, len, k2, seed);
}

uint64 CityHash64WithSeeds(const char *s, size_t len,
                           uint64 seed0, uint64 seed1) {
  return HashLen16(CityHash64(s, len) - seed0, seed1);
}

// A subroutine for CityHash128().  Returns a decent 128-bit hash for strings
// of any length representable in signed long.  Based on City and Murmur.
static uint128 CityMurmur(const char *s, size_t len, uint128 seed) {
  uint64 a = Uint128Low64(seed);
  uint64 b = Uint128High64(seed);
  uint64 c = 0;
  uint64 d = 0;
  signed long l = len - 16;
  if (l <= 0) {  // len <= 16
    a = ShiftMix(a * k1) * k1;
    c = b * k1 + HashLen0to16(s, len);
    d = ShiftMix(a + (len >= 8 ? Fetch64(s) : c));
  } else {  // len > 16
    c = HashLen16(Fetch64(s + len - 8) + k1, a);
    d = HashLen16(b + len, c + Fetch64(s + len - 16));
    a += d;
    do {
      a ^= ShiftMix(Fetch64(s) * k1) * k1;
      a *= k1;
      b ^= a;
      c ^= ShiftMix(Fetch64(s + 8) * k1) * k1;
      c *= k1;
      d ^= c;
      s += 16;
      l -= 16;
    } while (l > 0);
  }
  a = HashLen16(a, c);
  b = HashLen16(d, b);
  return uint128(a ^ b, HashLen16(b, a));
}

uint128 CityHash128WithSeed(const char *s, size_t len, uint128 seed) {
  if (len < 128) {
    return CityMurmur(s, len, seed);
  }

  // We expect len >= 128 to be the common case.  Keep 56 bytes of state:
  // v, w, x, y, and z.
  pair<uint64, uint64> v, w;
  uint64 x = Uint128Low64(seed);
  uint64 y = Uint128High64(seed);
  uint64 z = len * k1;
  v.first = Rotate(y ^ k1, 49) * k1 + Fetch64(s);
  v.second = Rotate(v.first, 42) * k1 + Fetch64(s + 8);
  w.first = Rotate(y + z, 35) * k1 + x;
  w.second = Rotate(x + Fetch64(s + 88), 53) * k1;

  // This is the same inner loop as CityHash64(), manually unrolled.
  do {
    x = Rotate(x + y + v.first + Fetch64(s + 8), 37) * k1;
    y = Rotate(y + v.second + Fetch64(s + 48), 42) * k1;
    x ^= w.second;
    y += v.first + Fetch64(s + 40);
    z = Rotate(z + w.first, 33) * k1;
    v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first);
    w = WeakHashLen32WithSeeds(s + 32, z + w.second, y + Fetch64(s + 16));
    std::swap(z, x);
    s += 64;
    x = Rotate(x + y + v.first + Fetch64(s + 8), 37) * k1;
    y = Rotate(y + v.second + Fetch64(s + 48), 42) * k1;
    x ^= w.second;
    y += v.first + Fetch64(s + 40);
    z = Rotate(z + w.first, 33) * k1;
    v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first);
    w = WeakHashLen32WithSeeds(s + 32, z + w.second, y + Fetch64(s + 16));
    std::swap(z, x);
    s += 64;
    len -= 128;
  } while (LIKELY(len >= 128));
  x += Rotate(v.first + z, 49) * k0;
  z += Rotate(w.first, 37) * k0;
  // If 0 < len < 128, hash up to 4 chunks of 32 bytes each from the end of s.
  for (size_t tail_done = 0; tail_done < len; ) {
    tail_done += 32;
    y = Rotate(x + y, 42) * k0 + v.second;
    w.first += Fetch64(s + len - tail_done + 16);
    x = x * k0 + w.first;
    z += w.second + Fetch64(s + len - tail_done);
    w.second += v.first;
    v = WeakHashLen32WithSeeds(s + len - tail_done, v.first + z, v.second);
  }
  // At this point our 56 bytes of state should contain more than
  // enough information for a strong 128-bit hash.  We use two
  // different 56-byte-to-8-byte hashes to get a 16-byte final result.
  x = HashLen16(x, v.first);
  y = HashLen16(y + z, w.first);
  return uint128(HashLen16(x + v.second, w.second) + y,
                 HashLen16(x + w.second, y + v.second));
}

uint128 CityHash128(const char *s, size_t len) {
  if (len >= 16) {
    return CityHash128WithSeed(s + 16,
                               len - 16,
                               uint128(Fetch64(s) ^ k3,
                                       Fetch64(s + 8)));
  } else if (len >= 8) {
    return CityHash128WithSeed(NULL,
                               0,
                               uint128(Fetch64(s) ^ (len * k0),
                                       Fetch64(s + len - 8) ^ k1));
  } else {
    return CityHash128WithSeed(s, len, uint128(k0, k1));
  }
}

#if defined(__SSE4_2__) && defined(__x86_64__)
#include <nmmintrin.h>

// Requires len >= 240.
static void CityHashCrc256Long(const char *s, size_t len,
                               uint32 seed, uint64 *result) {
  uint64 a = Fetch64(s + 56) + k0;
  uint64 b = Fetch64(s + 96) + k0;
  uint64 c = result[0] = HashLen16(b, len);
  uint64 d = result[1] = Fetch64(s + 120) * k0 + len;
  uint64 e = Fetch64(s + 184) + seed;
  uint64 f = seed;
  uint64 g = 0;
  uint64 h = 0;
  uint64 i = 0;
  uint64 j = 0;
  uint64 t = c + d;

  // 240 bytes of input per iter.
  size_t iters = len / 240;
  len -= iters * 240;
  do {
#define CHUNK(multiplier, z)                                    \
    {                                                           \
      uint64 old_a = a;                                         \
      a = Rotate(b, 41 ^ z) * multiplier + Fetch64(s);          \
      b = Rotate(c, 27 ^ z) * multiplier + Fetch64(s + 8);      \
      c = Rotate(d, 41 ^ z) * multiplier + Fetch64(s + 16);     \
      d = Rotate(e, 33 ^ z) * multiplier + Fetch64(s + 24);     \
      e = Rotate(t, 25 ^ z) * multiplier + Fetch64(s + 32);     \
      t = old_a;                                                \
    }                                                           \
    f = _mm_crc32_u64(f, a);                                    \
    g = _mm_crc32_u64(g, b);                                    \
    h = _mm_crc32_u64(h, c);                                    \
    i = _mm_crc32_u64(i, d);                                    \
    j = _mm_crc32_u64(j, e);                                    \
    s += 40

    CHUNK(1, 1); CHUNK(k0, 0);
    CHUNK(1, 1); CHUNK(k0, 0);
    CHUNK(1, 1); CHUNK(k0, 0);
  } while (--iters > 0);

  while (len >= 40) {
    CHUNK(k0, 0);
    len -= 40;
  }
  if (len > 0) {
    s = s + len - 40;
    CHUNK(k0, 0);
  }
  j += i << 32;
  a = HashLen16(a, j);
  h += g << 32;
  b += h;
  c = HashLen16(c, f) + i;
  d = HashLen16(d, e + result[0]);
  j += e;
  i += HashLen16(h, t);
  e = HashLen16(a, d) + j;
  f = HashLen16(b, c) + a;
  g = HashLen16(j, i) + c;
  result[0] = e + f + g + h;
  a = ShiftMix((a + g) * k0) * k0 + b;
  result[1] += a + result[0];
  a = ShiftMix(a * k0) * k0 + c;
  result[2] = a + result[1];
  a = ShiftMix((a + e) * k0) * k0;
  result[3] = a + result[2];
}

// Requires len < 240.
static void CityHashCrc256Short(const char *s, size_t len, uint64 *result) {
  char buf[240];
  memcpy(buf, s, len);
  memset(buf + len, 0, 240 - len);
  CityHashCrc256Long(buf, 240, ~static_cast<uint32>(len), result);
}

void CityHashCrc256(const char *s, size_t len, uint64 *result) {
  if (LIKELY(len >= 240)) {
    CityHashCrc256Long(s, len, 0, result);
  } else {
    CityHashCrc256Short(s, len, result);
  }
}

uint128 CityHashCrc128WithSeed(const char *s, size_t len, uint128 seed) {
  if (len <= 900) {
    return CityHash128WithSeed(s, len, seed);
  } else {
    uint64 result[4];
    CityHashCrc256(s, len, result);
    uint64 u = Uint128High64(seed) + result[0];
    uint64 v = Uint128Low64(seed) + result[1];
    return uint128(HashLen16(u, v + result[2]),
                   HashLen16(Rotate(v, 32), u * k0 + result[3]));
  }
}

uint128 CityHashCrc128(const char *s, size_t len) {
  if (len <= 900) {
    return CityHash128(s, len);
  } else {
    uint64 result[4];
    CityHashCrc256(s, len, result);
    return uint128(result[2], result[3]);
  }
}

#endif


================================================
FILE: benchmarks/City.h
================================================
// Copyright (c) 2011 Google, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
//
// CityHash, by Geoff Pike and Jyrki Alakuijala
//
// This file provides a few functions for hashing strings. On x86-64
// hardware in 2011, CityHash64() is faster than other high-quality
// hash functions, such as Murmur.  This is largely due to higher
// instruction-level parallelism.  CityHash64() and CityHash128() also perform
// well on hash-quality tests.
//
// CityHash128() is optimized for relatively long strings and returns
// a 128-bit hash.  For strings more than about 2000 bytes it can be
// faster than CityHash64().
//
// Functions in the CityHash family are not suitable for cryptography.
//
// WARNING: This code has not been tested on big-endian platforms!
// It is known to work well on little-endian platforms that have a small penalty
// for unaligned reads, such as current Intel and AMD moderate-to-high-end CPUs.
//
// By the way, for some hash functions, given strings a and b, the hash
// of a+b is easily derived from the hashes of a and b.  This property
// doesn't hold for any hash functions in this file.

#ifndef CITY_HASH_H_
#define CITY_HASH_H_

#include <stdlib.h>  // for size_t.
#include <utility>

// Microsoft Visual Studio may not have stdint.h.
#if defined(_MSC_VER) && (_MSC_VER < 1600)
typedef unsigned char uint8_t;
typedef unsigned int uint32_t;
typedef unsigned __int64 uint64_t;
#else  // defined(_MSC_VER)
#include <stdint.h>
#endif // !defined(_MSC_VER)

typedef uint8_t uint8;
typedef uint32_t uint32;
typedef uint64_t uint64;
typedef std::pair<uint64, uint64> uint128;

inline uint64 Uint128Low64(const uint128& x) { return x.first; }
inline uint64 Uint128High64(const uint128& x) { return x.second; }

// Hash function for a byte array.
uint64 CityHash64(const char *buf, size_t len);

// Hash function for a byte array.  For convenience, a 64-bit seed is also
// hashed into the result.
uint64 CityHash64WithSeed(const char *buf, size_t len, uint64 seed);

// Hash function for a byte array.  For convenience, two seeds are also
// hashed into the result.
uint64 CityHash64WithSeeds(const char *buf, size_t len,
                           uint64 seed0, uint64 seed1);

// Hash function for a byte array.
uint128 CityHash128(const char *s, size_t len);

// Hash function for a byte array.  For convenience, a 128-bit seed is also
// hashed into the result.
uint128 CityHash128WithSeed(const char *s, size_t len, uint128 seed);

// Hash 128 input bits down to 64 bits of output.
// This is intended to be a reasonably good hash function.
inline uint64 Hash128to64(const uint128& x) {
  // Murmur-inspired hashing.
  const uint64 kMul = 0x9ddfea08eb382d69ULL;
  uint64 a = (Uint128Low64(x) ^ Uint128High64(x)) * kMul;
  a ^= (a >> 47);
  uint64 b = (Uint128High64(x) ^ a) * kMul;
  b ^= (b >> 47);
  b *= kMul;
  return b;
}

// Conditionally include declarations for versions of City that require SSE4.2
// instructions to be available.
#if defined(__SSE4_2__) && defined(__x86_64__)

// Hash function for a byte array.
uint128 CityHashCrc128(const char *s, size_t len);

// Hash function for a byte array.  For convenience, a 128-bit seed is also
// hashed into the result.
uint128 CityHashCrc128WithSeed(const char *s, size_t len, uint128 seed);

// Hash function for a byte array.  Sets result[0] ... result[3].
void CityHashCrc256(const char *s, size_t len, uint64 *result);

#endif  // __SSE4_2__

#endif  // CITY_HASH_H_


================================================
FILE: benchmarks/SpookyV2.cpp
================================================
// Spooky Hash
// A 128-bit noncryptographic hash, for checksums and table lookup
// By Bob Jenkins.  Public domain.
//   Oct 31 2010: published framework, disclaimer ShortHash isn't right
//   Nov 7 2010: disabled ShortHash
//   Oct 31 2011: replace End, ShortMix, ShortEnd, enable ShortHash again
//   April 10 2012: buffer overflow on platforms without unaligned reads
//   July 12 2012: was passing out variables in final to in/out in short
//   July 30 2012: I reintroduced the buffer overflow
//   August 5 2012: SpookyV2: d = should be d += in short hash, and remove extra mix from long hash

#include <memory.h>
#include "SpookyV2.h"

#define ALLOW_UNALIGNED_READS 1

//
// short hash ... it could be used on any message, 
// but it's used by Spooky just for short messages.
//
void SpookyHash::Short(
    const void *message,
    size_t length,
    uint64 *hash1,
    uint64 *hash2)
{
    uint64 buf[2*sc_numVars];
    union 
    { 
        const uint8 *p8; 
        uint32 *p32;
        uint64 *p64; 
        size_t i; 
    } u;

    u.p8 = (const uint8 *)message;
    
    if (!ALLOW_UNALIGNED_READS && (u.i & 0x7))
    {
        memcpy(buf, message, length);
        u.p64 = buf;
    }

    size_t remainder = length%32;
    uint64 a=*hash1;
    uint64 b=*hash2;
    uint64 c=sc_const;
    uint64 d=sc_const;

    if (length > 15)
    {
        const uint64 *end = u.p64 + (length/32)*4;
        
        // handle all complete sets of 32 bytes
        for (; u.p64 < end; u.p64 += 4)
        {
            c += u.p64[0];
            d += u.p64[1];
            ShortMix(a,b,c,d);
            a += u.p64[2];
            b += u.p64[3];
        }
        
        //Handle the case of 16+ remaining bytes.
        if (remainder >= 16)
        {
            c += u.p64[0];
            d += u.p64[1];
            ShortMix(a,b,c,d);
            u.p64 += 2;
            remainder -= 16;
        }
    }
    
    // Handle the last 0..15 bytes, and its length
    d += ((uint64)length) << 56;
    switch (remainder)
    {
    case 15:
    d += ((uint64)u.p8[14]) << 48;
    case 14:
        d += ((uint64)u.p8[13]) << 40;
    case 13:
        d += ((uint64)u.p8[12]) << 32;
    case 12:
        d += u.p32[2];
        c += u.p64[0];
        break;
    case 11:
        d += ((uint64)u.p8[10]) << 16;
    case 10:
        d += ((uint64)u.p8[9]) << 8;
    case 9:
        d += (uint64)u.p8[8];
    case 8:
        c += u.p64[0];
        break;
    case 7:
        c += ((uint64)u.p8[6]) << 48;
    case 6:
        c += ((uint64)u.p8[5]) << 40;
    case 5:
        c += ((uint64)u.p8[4]) << 32;
    case 4:
        c += u.p32[0];
        break;
    case 3:
        c += ((uint64)u.p8[2]) << 16;
    case 2:
        c += ((uint64)u.p8[1]) << 8;
    case 1:
        c += (uint64)u.p8[0];
        break;
    case 0:
        c += sc_const;
        d += sc_const;
    }
    ShortEnd(a,b,c,d);
    *hash1 = a;
    *hash2 = b;
}


// do the whole hash in one call
void SpookyHash::Hash128(
    const void *message, 
    size_t length, 
    uint64 *hash1, 
    uint64 *hash2)
{
    if (length < sc_bufSize)
    {
        Short(message, length, hash1, hash2);
        return;
    }

    uint64 h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,h10,h11;
    uint64 buf[sc_numVars];
    uint64 *end;
    union 
    { 
        const uint8 *p8; 
        uint64 *p64; 
        size_t i; 
    } u;
    size_t remainder;
    
    h0=h3=h6=h9  = *hash1;
    h1=h4=h7=h10 = *hash2;
    h2=h5=h8=h11 = sc_const;
    
    u.p8 = (const uint8 *)message;
    end = u.p64 + (length/sc_blockSize)*sc_numVars;

    // handle all whole sc_blockSize blocks of bytes
    if (ALLOW_UNALIGNED_READS || ((u.i & 0x7) == 0))
    {
        while (u.p64 < end)
        { 
            Mix(u.p64, h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,h10,h11);
	    u.p64 += sc_numVars;
        }
    }
    else
    {
        while (u.p64 < end)
        {
            memcpy(buf, u.p64, sc_blockSize);
            Mix(buf, h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,h10,h11);
	    u.p64 += sc_numVars;
        }
    }

    // handle the last partial block of sc_blockSize bytes
    remainder = (length - ((const uint8 *)end-(const uint8 *)message));
    memcpy(buf, end, remainder);
    memset(((uint8 *)buf)+remainder, 0, sc_blockSize-remainder);
    ((uint8 *)buf)[sc_blockSize-1] = remainder;
    
    // do some final mixing 
    End(buf, h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,h10,h11);
    *hash1 = h0;
    *hash2 = h1;
}


// init spooky state
void SpookyHash::Init(uint64 seed1, uint64 seed2)
{
    m_length = 0;
    m_remainder = 0;
    m_state[0] = seed1;
    m_state[1] = seed2;
}


// add a message fragment to the state
void SpookyHash::Update(const void *message, size_t length)
{
    uint64 h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,h10,h11;
    size_t newLength = length + m_remainder;
    uint8  remainder;
    union 
    { 
        const uint8 *p8; 
        uint64 *p64; 
        size_t i; 
    } u;
    const uint64 *end;
    
    // Is this message fragment too short?  If it is, stuff it away.
    if (newLength < sc_bufSize)
    {
        memcpy(&((uint8 *)m_data)[m_remainder], message, length);
        m_length = length + m_length;
        m_remainder = (uint8)newLength;
        return;
    }
    
    // init the variables
    if (m_length < sc_bufSize)
    {
        h0=h3=h6=h9  = m_state[0];
        h1=h4=h7=h10 = m_state[1];
        h2=h5=h8=h11 = sc_const;
    }
    else
    {
        h0 = m_state[0];
        h1 = m_state[1];
        h2 = m_state[2];
        h3 = m_state[3];
        h4 = m_state[4];
        h5 = m_state[5];
        h6 = m_state[6];
        h7 = m_state[7];
        h8 = m_state[8];
        h9 = m_state[9];
        h10 = m_state[10];
        h11 = m_state[11];
    }
    m_length = length + m_length;
    
    // if we've got anything stuffed away, use it now
    if (m_remainder)
    {
        uint8 prefix = sc_bufSize-m_remainder;
        memcpy(&(((uint8 *)m_data)[m_remainder]), message, prefix);
        u.p64 = m_data;
        Mix(u.p64, h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,h10,h11);
        Mix(&u.p64[sc_numVars], h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,h10,h11);
        u.p8 = ((const uint8 *)message) + prefix;
        length -= prefix;
    }
    else
    {
        u.p8 = (const uint8 *)message;
    }
    
    // handle all whole blocks of sc_blockSize bytes
    end = u.p64 + (length/sc_blockSize)*sc_numVars;
    remainder = (uint8)(length-((const uint8 *)end-u.p8));
    if (ALLOW_UNALIGNED_READS || (u.i & 0x7) == 0)
    {
        while (u.p64 < end)
        { 
            Mix(u.p64, h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,h10,h11);
	    u.p64 += sc_numVars;
        }
    }
    else
    {
        while (u.p64 < end)
        { 
            memcpy(m_data, u.p8, sc_blockSize);
            Mix(m_data, h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,h10,h11);
	    u.p64 += sc_numVars;
        }
    }

    // stuff away the last few bytes
    m_remainder = remainder;
    memcpy(m_data, end, remainder);
    
    // stuff away the variables
    m_state[0] = h0;
    m_state[1] = h1;
    m_state[2] = h2;
    m_state[3] = h3;
    m_state[4] = h4;
    m_state[5] = h5;
    m_state[6] = h6;
    m_state[7] = h7;
    m_state[8] = h8;
    m_state[9] = h9;
    m_state[10] = h10;
    m_state[11] = h11;
}


// report the hash for the concatenation of all message fragments so far
void SpookyHash::Final(uint64 *hash1, uint64 *hash2)
{
    // init the variables
    if (m_length < sc_bufSize)
    {
        *hash1 = m_state[0];
        *hash2 = m_state[1];
        Short( m_data, m_length, hash1, hash2);
        return;
    }
    
    const uint64 *data = (const uint64 *)m_data;
    uint8 remainder = m_remainder;
    
    uint64 h0 = m_state[0];
    uint64 h1 = m_state[1];
    uint64 h2 = m_state[2];
    uint64 h3 = m_state[3];
    uint64 h4 = m_state[4];
    uint64 h5 = m_state[5];
    uint64 h6 = m_state[6];
    uint64 h7 = m_state[7];
    uint64 h8 = m_state[8];
    uint64 h9 = m_state[9];
    uint64 h10 = m_state[10];
    uint64 h11 = m_state[11];

    if (remainder >= sc_blockSize)
    {
        // m_data can contain two blocks; handle any whole first block
        Mix(data, h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,h10,h11);
        data += sc_numVars;
        remainder -= sc_blockSize;
    }

    // mix in the last partial block, and the length mod sc_blockSize
    memset(&((uint8 *)data)[remainder], 0, (sc_blockSize-remainder));

    ((uint8 *)data)[sc_blockSize-1] = remainder;
    
    // do some final mixing
    End(data, h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,h10,h11);

    *hash1 = h0;
    *hash2 = h1;
}


================================================
FILE: benchmarks/SpookyV2.h
================================================
//
// SpookyHash: a 128-bit noncryptographic hash function
// By Bob Jenkins, public domain
//   Oct 31 2010: alpha, framework + SpookyHash::Mix appears right
//   Oct 31 2011: alpha again, Mix only good to 2^^69 but rest appears right
//   Dec 31 2011: beta, improved Mix, tested it for 2-bit deltas
//   Feb  2 2012: production, same bits as beta
//   Feb  5 2012: adjusted definitions of uint* to be more portable
//   Mar 30 2012: 3 bytes/cycle, not 4.  Alpha was 4 but wasn't thorough enough.
//   August 5 2012: SpookyV2 (different results)
// 
// Up to 3 bytes/cycle for long messages.  Reasonably fast for short messages.
// All 1 or 2 bit deltas achieve avalanche within 1% bias per output bit.
//
// This was developed for and tested on 64-bit x86-compatible processors.
// It assumes the processor is little-endian.  There is a macro
// controlling whether unaligned reads are allowed (by default they are).
// This should be an equally good hash on big-endian machines, but it will
// compute different results on them than on little-endian machines.
//
// Google's CityHash has similar specs to SpookyHash, and CityHash is faster
// on new Intel boxes.  MD4 and MD5 also have similar specs, but they are orders
// of magnitude slower.  CRCs are two or more times slower, but unlike 
// SpookyHash, they have nice math for combining the CRCs of pieces to form 
// the CRCs of wholes.  There are also cryptographic hashes, but those are even 
// slower than MD5.
//

#include <stddef.h>

#ifdef _MSC_VER
# define INLINE __forceinline
  typedef  unsigned __int64 uint64;
  typedef  unsigned __int32 uint32;
  typedef  unsigned __int16 uint16;
  typedef  unsigned __int8  uint8;
#else
# include <stdint.h>
# define INLINE inline
  typedef  uint64_t  uint64;
  typedef  uint32_t  uint32;
  typedef  uint16_t  uint16;
  typedef  uint8_t   uint8;
#endif


class SpookyHash
{
public:
    //
    // SpookyHash: hash a single message in one call, produce 128-bit output
    //
    static void Hash128(
        const void *message,  // message to hash
        size_t length,        // length of message in bytes
        uint64 *hash1,        // in/out: in seed 1, out hash value 1
        uint64 *hash2);       // in/out: in seed 2, out hash value 2

    //
    // Hash64: hash a single message in one call, return 64-bit output
    //
    static uint64 Hash64(
        const void *message,  // message to hash
        size_t length,        // length of message in bytes
        uint64 seed)          // seed
    {
        uint64 hash1 = seed;
        Hash128(message, length, &hash1, &seed);
        return hash1;
    }

    //
    // Hash32: hash a single message in one call, produce 32-bit output
    //
    static uint32 Hash32(
        const void *message,  // message to hash
        size_t length,        // length of message in bytes
        uint32 seed)          // seed
    {
        uint64 hash1 = seed, hash2 = seed;
        Hash128(message, length, &hash1, &hash2);
        return (uint32)hash1;
    }

    //
    // Init: initialize the context of a SpookyHash
    //
    void Init(
        uint64 seed1,       // any 64-bit value will do, including 0
        uint64 seed2);      // different seeds produce independent hashes
    
    //
    // Update: add a piece of a message to a SpookyHash state
    //
    void Update(
        const void *message,  // message fragment
        size_t length);       // length of message fragment in bytes


    //
    // Final: compute the hash for the current SpookyHash state
    //
    // This does not modify the state; you can keep updating it afterward
    //
    // The result is the same as if SpookyHash() had been called with
    // all the pieces concatenated into one message.
    //
    void Final(
        uint64 *hash1,    // out only: first 64 bits of hash value.
        uint64 *hash2);   // out only: second 64 bits of hash value.

    //
    // left rotate a 64-bit value by k bytes
    //
    static INLINE uint64 Rot64(uint64 x, int k)
    {
        return (x << k) | (x >> (64 - k));
    }

    //
    // This is used if the input is 96 bytes long or longer.
    //
    // The internal state is fully overwritten every 96 bytes.
    // Every input bit appears to cause at least 128 bits of entropy
    // before 96 other bytes are combined, when run forward or backward
    //   For every input bit,
    //   Two inputs differing in just that input bit
    //   Where "differ" means xor or subtraction
    //   And the base value is random
    //   When run forward or backwards one Mix
    // I tried 3 pairs of each; they all differed by at least 212 bits.
    //
    static INLINE void Mix(
        const uint64 *data, 
        uint64 &s0, uint64 &s1, uint64 &s2, uint64 &s3,
        uint64 &s4, uint64 &s5, uint64 &s6, uint64 &s7,
        uint64 &s8, uint64 &s9, uint64 &s10,uint64 &s11)
    {
      s0 += data[0];    s2 ^= s10;    s11 ^= s0;    s0 = Rot64(s0,11);    s11 += s1;
      s1 += data[1];    s3 ^= s11;    s0 ^= s1;    s1 = Rot64(s1,32);    s0 += s2;
      s2 += data[2];    s4 ^= s0;    s1 ^= s2;    s2 = Rot64(s2,43);    s1 += s3;
      s3 += data[3];    s5 ^= s1;    s2 ^= s3;    s3 = Rot64(s3,31);    s2 += s4;
      s4 += data[4];    s6 ^= s2;    s3 ^= s4;    s4 = Rot64(s4,17);    s3 += s5;
      s5 += data[5];    s7 ^= s3;    s4 ^= s5;    s5 = Rot64(s5,28);    s4 += s6;
      s6 += data[6];    s8 ^= s4;    s5 ^= s6;    s6 = Rot64(s6,39);    s5 += s7;
      s7 += data[7];    s9 ^= s5;    s6 ^= s7;    s7 = Rot64(s7,57);    s6 += s8;
      s8 += data[8];    s10 ^= s6;    s7 ^= s8;    s8 = Rot64(s8,55);    s7 += s9;
      s9 += data[9];    s11 ^= s7;    s8 ^= s9;    s9 = Rot64(s9,54);    s8 += s10;
      s10 += data[10];    s0 ^= s8;    s9 ^= s10;    s10 = Rot64(s10,22);    s9 += s11;
      s11 += data[11];    s1 ^= s9;    s10 ^= s11;    s11 = Rot64(s11,46);    s10 += s0;
    }

    //
    // Mix all 12 inputs together so that h0, h1 are a hash of them all.
    //
    // For two inputs differing in just the input bits
    // Where "differ" means xor or subtraction
    // And the base value is random, or a counting value starting at that bit
    // The final result will have each bit of h0, h1 flip
    // For every input bit,
    // with probability 50 +- .3%
    // For every pair of input bits,
    // with probability 50 +- 3%
    //
    // This does not rely on the last Mix() call having already mixed some.
    // Two iterations was almost good enough for a 64-bit result, but a
    // 128-bit result is reported, so End() does three iterations.
    //
    static INLINE void EndPartial(
        uint64 &h0, uint64 &h1, uint64 &h2, uint64 &h3,
        uint64 &h4, uint64 &h5, uint64 &h6, uint64 &h7, 
        uint64 &h8, uint64 &h9, uint64 &h10,uint64 &h11)
    {
        h11+= h1;    h2 ^= h11;   h1 = Rot64(h1,44);
        h0 += h2;    h3 ^= h0;    h2 = Rot64(h2,15);
        h1 += h3;    h4 ^= h1;    h3 = Rot64(h3,34);
        h2 += h4;    h5 ^= h2;    h4 = Rot64(h4,21);
        h3 += h5;    h6 ^= h3;    h5 = Rot64(h5,38);
        h4 += h6;    h7 ^= h4;    h6 = Rot64(h6,33);
        h5 += h7;    h8 ^= h5;    h7 = Rot64(h7,10);
        h6 += h8;    h9 ^= h6;    h8 = Rot64(h8,13);
        h7 += h9;    h10^= h7;    h9 = Rot64(h9,38);
        h8 += h10;   h11^= h8;    h10= Rot64(h10,53);
        h9 += h11;   h0 ^= h9;    h11= Rot64(h11,42);
        h10+= h0;    h1 ^= h10;   h0 = Rot64(h0,54);
    }

    static INLINE void End(
        const uint64 *data, 
        uint64 &h0, uint64 &h1, uint64 &h2, uint64 &h3,
        uint64 &h4, uint64 &h5, uint64 &h6, uint64 &h7, 
        uint64 &h8, uint64 &h9, uint64 &h10,uint64 &h11)
    {
        h0 += data[0];   h1 += data[1];   h2 += data[2];   h3 += data[3];
        h4 += data[4];   h5 += data[5];   h6 += data[6];   h7 += data[7];
        h8 += data[8];   h9 += data[9];   h10 += data[10]; h11 += data[11];
        EndPartial(h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,h10,h11);
        EndPartial(h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,h10,h11);
        EndPartial(h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,h10,h11);
    }

    //
    // The goal is for each bit of the input to expand into 128 bits of 
    //   apparent entropy before it is fully overwritten.
    // n trials both set and cleared at least m bits of h0 h1 h2 h3
    //   n: 2   m: 29
    //   n: 3   m: 46
    //   n: 4   m: 57
    //   n: 5   m: 107
    //   n: 6   m: 146
    //   n: 7   m: 152
    // when run forwards or backwards
    // for all 1-bit and 2-bit diffs
    // with diffs defined by either xor or subtraction
    // with a base of all zeros plus a counter, or plus another bit, or random
    //
    static INLINE void ShortMix(uint64 &h0, uint64 &h1, uint64 &h2, uint64 &h3)
    {
        h2 = Rot64(h2,50);  h2 += h3;  h0 ^= h2;
        h3 = Rot64(h3,52);  h3 += h0;  h1 ^= h3;
        h0 = Rot64(h0,30);  h0 += h1;  h2 ^= h0;
        h1 = Rot64(h1,41);  h1 += h2;  h3 ^= h1;
        h2 = Rot64(h2,54);  h2 += h3;  h0 ^= h2;
        h3 = Rot64(h3,48);  h3 += h0;  h1 ^= h3;
        h0 = Rot64(h0,38);  h0 += h1;  h2 ^= h0;
        h1 = Rot64(h1,37);  h1 += h2;  h3 ^= h1;
        h2 = Rot64(h2,62);  h2 += h3;  h0 ^= h2;
        h3 = Rot64(h3,34);  h3 += h0;  h1 ^= h3;
        h0 = Rot64(h0,5);   h0 += h1;  h2 ^= h0;
        h1 = Rot64(h1,36);  h1 += h2;  h3 ^= h1;
    }

    //
    // Mix all 4 inputs together so that h0, h1 are a hash of them all.
    //
    // For two inputs differing in just the input bits
    // Where "differ" means xor or subtraction
    // And the base value is random, or a counting value starting at that bit
    // The final result will have each bit of h0, h1 flip
    // For every input bit,
    // with probability 50 +- .3% (it is probably better than that)
    // For every pair of input bits,
    // with probability 50 +- .75% (the worst case is approximately that)
    //
    static INLINE void ShortEnd(uint64 &h0, uint64 &h1, uint64 &h2, uint64 &h3)
    {
        h3 ^= h2;  h2 = Rot64(h2,15);  h3 += h2;
        h0 ^= h3;  h3 = Rot64(h3,52);  h0 += h3;
        h1 ^= h0;  h0 = Rot64(h0,26);  h1 += h0;
        h2 ^= h1;  h1 = Rot64(h1,51);  h2 += h1;
        h3 ^= h2;  h2 = Rot64(h2,28);  h3 += h2;
        h0 ^= h3;  h3 = Rot64(h3,9);   h0 += h3;
        h1 ^= h0;  h0 = Rot64(h0,47);  h1 += h0;
        h2 ^= h1;  h1 = Rot64(h1,54);  h2 += h1;
        h3 ^= h2;  h2 = Rot64(h2,32);  h3 += h2;
        h0 ^= h3;  h3 = Rot64(h3,25);  h0 += h3;
        h1 ^= h0;  h0 = Rot64(h0,63);  h1 += h0;
    }
    
private:

    //
    // Short is used for messages under 192 bytes in length
    // Short has a low startup cost, the normal mode is good for long
    // keys, the cost crossover is at about 192 bytes.  The two modes were
    // held to the same quality bar.
    // 
    static void Short(
        const void *message,  // message (array of bytes, not necessarily aligned)
        size_t length,        // length of message (in bytes)
        uint64 *hash1,        // in/out: in the seed, out the hash value
        uint64 *hash2);       // in/out: in the seed, out the hash value

    // number of uint64's in internal state
    static const size_t sc_numVars = 12;

    // size of the internal state
    static const size_t sc_blockSize = sc_numVars*8;

    // size of buffer of unhashed data, in bytes
    static const size_t sc_bufSize = 2*sc_blockSize;

    //
    // sc_const: a constant which:
    //  * is not zero
    //  * is odd
    //  * is a not-very-regular mix of 1's and 0's
    //  * does not need any other special mathematical properties
    //
    static const uint64 sc_const = 0xdeadbeefdeadbeefLL;

    uint64 m_data[2*sc_numVars];   // unhashed data, for partial messages
    uint64 m_state[sc_numVars];  // internal state of the hash
    size_t m_length;             // total length of the input so far
    uint8  m_remainder;          // length of unhashed data stashed in m_data
};


================================================
FILE: benchmarks/bbs-prng.h
================================================
/* Copyright (c) 2016 Vladimir Makarov <vmakarov@gcc.gnu.org>

   Permission is hereby granted, free of charge, to any person
   obtaining a copy of this software and associated documentation
   files (the "Software"), to deal in the Software without
   restriction, including without limitation the rights to use, copy,
   modify, merge, publish, distribute, sublicense, and/or sell copies
   of the Software, and to permit persons to whom the Software is
   furnished to do so, subject to the following conditions:

   The above copyright notice and this permission notice shall be
   included in all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
   SOFTWARE.
*/

/* Blum-Blum-Shub Pseudo Random Number Generator (PRNG).  It is a
   crypto level PRNG: asymptotically the prediction of the next
   generated number is NP-complete task as finding the solution is
   equivalent to solving quadratic residue modulo N problem.

   The PRNG equation is simple x[n+1] = (x[n] * x[n]) mod N, where N
   is a product of two large prime numbers.

   The PRNG finds the two prime numbers during the PRNG
   initialization.  The prime number search is a probabilistic one.
   The numbers might be composite but the probability of this is very
   small 4^(-100).

   Working with big numbers are implemented by GMP.  So you need link
   a GMP Library (-lgmp) if you are going to use the PRNG.

   To use a generator call `init_bbs_prng` first, then call
   `get_bbs_prn` as much as you want to get a new PRN.  At the end of
   the PRNG use, call `finish_bbs_prng`.  You can change the default
   seed by calling set_bbs_seed.

   The PRNG passes NIST Statistical Test Suite for Random and
   Pseudorandom Number Generators for Cryptographic Applications
   (version 2.2.1) with 1000 bitstreams each containing 1M bits.

   The generation of a new number takes about 73K CPU cycles on x86_64
   (Intel 4.2GHz i7-4790K), or speed of the generation is about 58K
   numbers per sec. */

#ifndef __BBS_PRNG__
#define __BBS_PRNG__

#ifdef _MSC_VER
typedef unsigned __int32 uint32_t;
typedef unsigned __int64 uint64_t;
#else
#include <stdint.h>
#endif

#include <stdlib.h>
#include <gmp.h>

/* What size numbers should we use to make the next number prediction
   hard and how hard the prediction can be for a given number is a
   tricky question.  Please, read the Blum Blum Shub article and
   numerous discussion on the Internet.  I believe the default value
   is good for my purposes.  */
#ifndef BBS_PRIME_BITS
#define BBS_PRIME_BITS 512
#endif

/* BBS N which is a factor of two primes and the last generated pseudo
   random number which also can be considered as the current state of
   the PRNG.  */
static MP_INT _BBS_N, _BBS_xn;

/* Set up _BBS_N and _BBS_xn. */
static inline void
init_bbs_prng (void) {
  int i, n;
  MP_INT start;
  
  mpz_init (&start);
  mpz_init (&_BBS_N);
  mpz_init (&_BBS_xn);
  for (n = 0; n != 3;) {
    mpz_set_ui (&start, 0);
    for (i = 0; i < BBS_PRIME_BITS; i++) {
      mpz_mul_ui (&start, &start, 2);
      mpz_add_ui (&start, &start, rand () % 2);
    }
    if (mpz_probab_prime_p (&start, 100) == 0
	/* The following is BBS requirement to have only one solution
	   for the quadratic residue equation.  */
	|| mpz_tdiv_ui (&start, 4) != 3)
      continue;
    if (n == 0) mpz_set (&_BBS_N, &start);
    else if (n == 1) mpz_mul (&_BBS_N, &_BBS_N, &start);
    else mpz_set (&_BBS_xn, &start);
    n++;
  }
  mpz_clear (&start);
}

/* Make _BBS_xn equal to SEED.  */
static inline void
set_bbs_seed (uint32_t seed) {
  mpz_set_ui (&_BBS_xn, seed);
}


/* Update _BBS_xn.  */
static inline void
_update_bbs_prng (void) {
  mpz_mul (&_BBS_xn, &_BBS_xn, &_BBS_xn);
  mpz_tdiv_r (&_BBS_xn, &_BBS_xn, &_BBS_N);
}

/* Return the next pseudo random number.  */
static inline uint64_t
get_bbs_prn (void) {
  int i;
  uint64_t res = 0;
  
  for (i = 0; i < 32; i++) {
    _update_bbs_prng ();
    /* We conservatively use only 2 LS bits.  Depending of
       BBS_PRIME_BITS value it could be more.  */
    res = res << 2 | mpz_tdiv_ui (&_BBS_xn, 4);
  }
  return res;
}

/* Finish work with the PRNG.  */
static inline void
finish_bbs_prng (void) {
  mpz_clear (&_BBS_N);
  mpz_clear (&_BBS_xn);
}

#endif


================================================
FILE: benchmarks/bench-crypto.c
================================================
#if defined(SHA2)

#include "sha512.h"
void sha512_test (const void *msg, int len, void *out) {
  sha512_ctx ctx;
  
  rhash_sha512_init (&ctx);
  rhash_sha512_update (&ctx, msg, len);
  rhash_sha512_final (&ctx, out);
}

#define test sha512_test

#elif defined(SHA3)

#include "sha3.h"
void sha3_512_test (const void *msg, int len, void *out) {
  sha3_ctx ctx;
  
  rhash_sha3_512_init (&ctx);
  rhash_sha3_update (&ctx, msg, len);
  rhash_sha3_final (&ctx, out);
}

#define test sha3_512_test

#elif defined(BLAKE2B)

#include "blake2.h"
void blake2b_test (const void *msg, int len, void *out) {
  static const uint64_t key[4] = {0, 0, 0, 0};
  blake2b ((uint8_t *) out, msg, key, 64, len, 32);
}

#define test blake2b_test

#elif defined(MUM512)

#include "mum512.h"

void mum512_test (const void *msg, int len, void *out) {
  mum512_hash (msg, len, out);
}

#define test mum512_test

#else
#error "I don't know what to test"
#endif

static const char msg[] =
"SATURDAY morning was come, and all the summer world was bright and\n\
fresh, and brimming with life. There was a song in every heart; and if\n\
the heart was young the music issued at the lips. There was cheer in\n\
every face and a spring in every step. The locust-trees were in bloom\n\
and the fragrance of the blossoms filled the air. Cardiff Hill, beyond\n\
the village and above it, was green with vegetation and it lay just far\n\
enough away to seem a Delectable Land, dreamy, reposeful, and inviting.\n\
\n\
Tom appeared on the sidewalk with a bucket of whitewash and a\n\
long-handled brush. He surveyed the fence, and all gladness left him and\n\
a deep melancholy settled down upon his spirit. Thirty yards of board\n\
fence nine feet high. Life to him seemed hollow, and existence but a\n\
burden. Sighing, he dipped his brush and passed it along the topmost\n\
plank; repeated the operation; did it again; compared the insignificant\n\
whitewashed streak with the far-reaching continent of unwhitewashed\n\
fence, and sat down on a tree-box discouraged. Jim came skipping out at\n\
the gate with a tin pail, and singing Buffalo Gals. Bringing water from\n\
the town pump had always been hateful work in Tom's eyes, before, but\n\
now it did not strike him so. He remembered that there was company at\n\
the pump. White, mulatto, and negro boys and girls were always there\n\
waiting their turns, resting, trading playthings, quarrelling, fighting,\n\
skylarking. And he remembered that although the pump was only a hundred\n\
and fifty yards off, Jim never got back with a bucket of water under an\n\
hour--and even then somebody generally had to go after him. Tom said:\n\
\n\
\"Say, Jim, I'll fetch the water if you'll whitewash some.\"\n\
\n\
Jim shook his head and said:\n\
\n\
\"Can't, Mars Tom. Ole missis, she tole me I got to go an' git dis water\n\
an' not stop foolin' roun' wid anybody. She say she spec' Mars Tom gwine\n\
to ax me to whitewash, an' so she tole me go 'long an' 'tend to my own\n\
business--she 'lowed _she'd_ 'tend to de whitewashin'.\"\n\
\n\
\"Oh, never you mind what she said, Jim. That's the way she always talks.\n\
Gimme the bucket--I won't be gone only a a minute. _She_ won't ever\n\
know.\"\n\
\n\
\"Oh, I dasn't, Mars Tom. Ole missis she'd take an' tar de head off'n me.\n\
'Deed she would.\"\n\
\n\
\"_She_! She never licks anybody--whacks 'em over the head with her\n\
thimble--and who cares for that, I'd like to know. She talks awful, but\n\
talk don't hurt--anyways it don't if she don't cry. Jim, I'll give you a\n\
marvel. I'll give you a white alley!\"\n\
\n\
Jim began to waver.\n\
\n\
\"White alley, Jim! And it's a bully taw.\"\n\
\n\
\"My! Dat's a mighty gay marvel, I tell you! But Mars Tom I's powerful\n\
'fraid ole missis--\"\n\
\n\
\"And besides, if you will I'll show you my sore toe.\"\n\
\n\
Jim was only human--this attraction was too much for him. He put down\n\
his pail, took the white alley, and bent over the toe with absorbing\n\
interest while the bandage was being unwound. In another moment he\n\
was flying down the street with his pail and a tingling rear, Tom was\n\
whitewashing with vigor, and Aunt Polly was retiring from the field with\n\
a slipper in her hand and triumph in her eye.\n\
\n\
But Tom's energy did not last. He began to think of the fun he had\n\
planned for this day, and his sorrows multiplied. Soon the free boys\n\
would come tripping along on all sorts of delicious expeditions, and\n\
they would make a world of fun of him for having to work--the very\n\
thought of it burnt him like fire. He got out his worldly wealth and\n\
examined it--bits of toys, marbles, and trash; enough to buy an exchange\n\
of _work_, maybe, but not half enough to buy so much as half an hour\n\
of pure freedom. So he returned his straitened means to his pocket, and\n\
gave up the idea of trying to buy the boys. At this dark and hopeless\n\
moment an inspiration burst upon him! Nothing less than a great,\n\
magnificent inspiration.\n\
\n\
He took up his brush and went tranquilly to work. Ben Rogers hove in\n\
sight presently--the very boy, of all boys, whose ridicule he had been\n\
dreading. Ben's gait was the hop-skip-and-jump--proof enough that his\n\
heart was light and his anticipations high. He was eating an apple, and\n\
giving a long, melodious whoop, at intervals, followed by a deep-toned\n\
ding-dong-dong, ding-dong-dong, for he was personating a steamboat. As\n\
he drew near, he slackened speed, took the middle of the street, leaned\n\
far over to starboard and rounded to ponderously and with laborious pomp\n\
and circumstance--for he was personating the Big Missouri, and considered\n\
himself to be drawing nine feet of water. He was boat and captain and\n\
engine-bells combined, so he had to imagine himself standing on his own\n\
hurricane-deck giving the orders and executing them:\n\
\n\
\"Stop her, sir! Ting-a-ling-ling!\" The headway ran almost out, and he\n\
drew up slowly toward the sidewalk.\n\
\n\
\"Ship up to back! Ting-a-ling-ling!\" His arms straightened and stiffened\n\
down his sides.\n\
\n\
\"Set her back on the stabboard! Ting-a-ling-ling! Chow! ch-chow-wow!\n\
Chow!\" His right hand, mean-time, describing stately circles--for it was\n\
representing a forty-foot wheel.\n\
\n\
\"Let her go back on the labboard! Ting-a-ling-ling! Chow-ch-chow-chow!\"\n\
The left hand began to describe circles.\n\
\n\
\"Stop the stabboard! Ting-a-ling-ling! Stop the labboard! Come ahead on\n\
the stabboard! Stop her! Let your outside turn over slow! Ting-a-ling-ling!\n\
Chow-ow-ow! Get out that head-line! _lively_ now! Come--out with\n\
your spring-line--what're you about there! Take a turn round that stump\n\
with the bight of it! Stand by that stage, now--let her go! Done with\n\
the engines, sir! Ting-a-ling-ling! SH'T! S'H'T! SH'T!\" (trying the\n\
gauge-cocks).\n\
\n\
Tom went on whitewashing--paid no attention to the steamboat. Ben stared\n\
a moment and then said: \"_Hi-Yi! You're_ up a stump, ain't you!\"\n\
\n\
No answer. Tom surveyed his last touch with the eye of an artist, then\n\
he gave his brush another gentle sweep and surveyed the result, as\n\
before. Ben ranged up alongside of him. Tom's mouth watered for the\n\
apple, but he stuck to his work. Ben said:\n\
\n\
\"Hello, old chap, you got to work, hey?\"\n\
\n\
Tom wheeled suddenly and said:\n\
\n\
\"Why, it's you, Ben! I warn't noticing.\"\n\
\n\
\"Say--I'm going in a-swimming, I am. Don't you wish you could? But of\n\
course you'd druther _work_--wouldn't you? Course you would!\"\n\
\n\
Tom contemplated the boy a bit, and said:\n\
\n\
\"What do you call work?\"\n\
\n\
\"Why, ain't _that_ work?\"\n\
\n\
Tom resumed his whitewashing, and answered carelessly:\n\
\n\
\"Well, maybe it is, and maybe it ain't. All I know, is, it suits Tom\n\
Sawyer.\"\n\
\n\
\"Oh come, now, you don't mean to let on that you _like_ it?\"\n\
\n\
The brush continued to move.\n\
\n\
\"Like it? Well, I don't see why I oughtn't to like it. Does a boy get a\n\
chance to whitewash a fence every day?\"\n\
\n\
That put the thing in a new light. Ben stopped nibbling his apple.\n\
Tom swept his brush daintily back and forth--stepped back to note the\n\
effect--added a touch here and there--criticised the effect again--Ben\n\
watching every move and getting more and more interested, more and more\n\
absorbed. Presently he said:\n\
\n\
\"Say, Tom, let _me_ whitewash a little.\"\n\
\n\
Tom considered, was about to consent; but he altered his mind:\n\
\n\
\"No--no--I reckon it wouldn't hardly do, Ben. You see, Aunt Polly's awful\n\
particular about this fence--right here on the street, you know--but if it\n\
was the back fence I wouldn't mind and _she_ wouldn't. Yes, she's awful\n\
particular about this fence; it's got to be done very careful; I reckon\n\
there ain't one boy in a thousand, maybe two thousand, that can do it\n\
the way it's got to be done.\"\n\
\n\
\"No--is that so? Oh come, now--lemme just try. Only just a little--I'd let\n\
_you_, if you was me, Tom.\"\n\
\n\
\"Ben, I'd like to, honest injun; but Aunt Polly--well, Jim wanted to do\n\
it, but she wouldn't let him; Sid wanted to do it, and she wouldn't let\n\
Sid. Now don't you see how I'm fixed? If you was to tackle this fence\n\
and anything was to happen to it--\"\n\
\n\
\"Oh, shucks, I'll be just as careful. Now lemme try. Say--I'll give you\n\
the core of my apple.\"\n\
\n\
\"Well, here--No, Ben, now don't. I'm afeard--\"\n\
\n\
\"I'll give you _all_ of it!\"\n\
\n\
Tom gave up the brush with reluctance in his face, but alacrity in his\n\
heart. And while the late steamer Big Missouri worked and sweated in the\n\
sun, the retired artist sat on a barrel in the shade close by,\n\
dangled his legs, munched his apple, and planned the slaughter of more\n\
innocents. There was no lack of material; boys happened along every\n\
little while; they came to jeer, but remained to whitewash. By the time\n\
Ben was fagged out, Tom had traded the next chance to Billy Fisher for\n\
a kite, in good repair; and when he played out, Johnny Miller bought in\n\
for a dead rat and a string to swing it with--and so on, and so on, hour\n\
after hour. And when the middle of the afternoon came, from being a\n\
poor poverty-stricken boy in the morning, Tom was literally rolling in\n\
wealth. He had besides the things before mentioned, twelve marbles, part\n\
of a jews-harp, a piece of blue bottle-glass to look through, a spool\n\
cannon, a key that wouldn't unlock anything, a fragment of chalk, a\n\
glass stopper of a decanter, a tin soldier, a couple of tadpoles,\n\
six fire-crackers, a kitten with only one eye, a brass door-knob, a\n\
dog-collar--but no dog--the handle of a knife, four pieces of orange-peel,\n\
and a dilapidated old window sash.\n\
\n\
He had had a nice, good, idle time all the while--plenty of company--and\n\
the fence had three coats of whitewash on it! If he hadn't run out of\n\
whitewash he would have bankrupted every boy in the village.\n\
\n\
Tom said to himself that it was not such a hollow world, after all. He\n\
had discovered a great law of human action, without knowing it--namely,\n\
that in order to make a man or a boy covet a thing, it is only necessary\n\
to make the thing difficult to attain. If he had been a great and\n\
wise philosopher, like the writer of this book, he would now have\n\
comprehended that Work consists of whatever a body is _obliged_ to do,\n\
and that Play consists of whatever a body is not obliged to do. And\n\
this would help him to understand why constructing artificial flowers or\n\
performing on a tread-mill is work, while rolling ten-pins or climbing\n\
Mont Blanc is only amusement. There are wealthy gentlemen in England\n\
who drive four-horse passenger-coaches twenty or thirty miles on a\n\
daily line, in the summer, because the privilege costs them considerable\n\
money; but if they were offered wages for the service, that would turn\n\
it into work and then they would resign.\n\
\n\
The boy mused awhile over the substantial change which had taken place\n\
in his worldly circumstances, and then wended toward headquarters to\n\
report.\n\
";

#include <stdlib.h>
#include <stdio.h>
static void
print512 (unsigned char digest[64]) {
  int i;
  
  for (i = 0; i < 64; i++)
    printf ("%x", digest[i]);
  printf ("\n");
}

#ifdef SPEED1
int main () {
  int i; uint64_t k = rand (); unsigned char out[64];
  
  for (i = 0; i < 2000000; i++)
    test (msg, 10, &out);
  printf ("10 byte: %s:", (size_t)&k & 0x7 ? "unaligned" : "aligned");
  print512 (out);
  return 0;
}
#endif

#ifdef SPEED2
int main () {
  int i; uint64_t k = rand (); unsigned char out[64];
  
  for (i = 0; i < 2000000; i++)
    test (msg, 100, &out);
  printf ("100 byte: %s:", (size_t)&k & 0x7 ? "unaligned" : "aligned");
  print512 (out);
  return 0;
}
#endif

#ifdef SPEED3
int main () {
  int i; uint64_t k = rand (); unsigned char out[64];
  
  for (i = 0; i < 2000000; i++)
    test (msg, 1000, &out);
  printf ("1000 byte: %s:", (size_t)&k & 0x7 ? "unaligned" : "aligned");
  print512 (out);
  return 0;
}
#endif

#ifdef SPEED4
int main () {
int i; uint64_t k = rand (); unsigned char out[64];

  for (i = 0; i < 500000; i++)
    test (msg, 10000, &out);
  printf ("10000 byte: %s:", (size_t)&k & 0x7 ? "unaligned" : "aligned");
  print512 (out);
  return 0;
}
#endif


================================================
FILE: benchmarks/bench-crypto.sh
================================================
#!/bin/bash

# Benchmarking different crypto hash functions.

IFS='%'
temp=__temp


print() {
    s=`grep -E 'user[ 	]*[0-9]' $2 | sed s/.*user// | sed s/\\t//`
    echo $1 "$s"s
}

echo compiling sha512
gcc -O3 -w -c sha512.c byte_order.c || exit 1
echo compiling sha3
gcc -O3 -w -c sha3.c || exit 1

str86_64="`uname -a|grep x86_64`"
if test -n str86_64; then
    echo compiling blake2b
    gcc -O3 -w -c -I. -std=gnu99 blake2b.c || exit 1
fi

echo +++10-byte speed '(20M texts)':
gcc -DSPEED1 -O3 -w -DSHA2 sha512.o byte_order.o bench-crypto.c && (time -p ./a.out) >$temp 2>&1 && print "sha512:" $temp
gcc -DSPEED1 -O3 -w -DSHA3 sha3.o byte_order.o bench-crypto.c && (time -p ./a.out) >$temp 2>&1 && print "sha3  :" $temp
if test -n str86_64; then
    gcc -DSPEED1 -O3 -w -DBLAKE2B blake2b.o bench-crypto.c && (time -p ./a.out) >$temp 2>&1 && print "blake2:" $temp
fi
gcc -DSPEED1 -O3 -w -DMUM512 -I../ bench-crypto.c && (time -p ./a.out) >$temp 2>&1 && print "mum512:" $temp

echo +++100-byte speed '(20M texts)':
gcc -DSPEED2 -O3 -w -DSHA2 sha512.o byte_order.o bench-crypto.c && (time -p ./a.out) >$temp 2>&1 && print "sha512:" $temp
gcc -DSPEED2 -O3 -w -DSHA3 sha3.o byte_order.o bench-crypto.c && (time -p ./a.out) >$temp 2>&1 && print "sha3  :" $temp
if test -n str86_64; then
    gcc -DSPEED2 -O3 -w -DBLAKE2B blake2b.o bench-crypto.c && (time -p ./a.out) >$temp 2>&1 && print "blake2:" $temp
fi
gcc -DSPEED2 -O3 -w -DMUM512 -I../ bench-crypto.c && (time -p ./a.out) >$temp 2>&1 && print "mum512:" $temp

echo +++1000-byte speed '(20M texts)':
gcc -DSPEED3 -O3 -w -DSHA2 sha512.o byte_order.o bench-crypto.c && (time -p ./a.out) >$temp 2>&1 && print "sha512:" $temp
gcc -DSPEED3 -O3 -w -DSHA3 sha3.o byte_order.o bench-crypto.c && (time -p ./a.out) >$temp 2>&1 && print "sha3  :" $temp
if test -n str86_64; then
    gcc -DSPEED3 -O3 -w -DBLAKE2B blake2b.o bench-crypto.c && (time -p ./a.out) >$temp 2>&1 && print "blake2:" $temp
fi
gcc -DSPEED3 -O3 -w -DMUM512 -I../ bench-crypto.c && (time -p ./a.out) >$temp 2>&1 && print "mum512:" $temp

echo +++10000-byte speed '(5M texts)':
gcc -DSPEED4 -O3 -w -DSHA2 sha512.o byte_order.o bench-crypto.c && (time -p ./a.out) >$temp 2>&1 && print "sha512:" $temp
gcc -DSPEED4 -O3 -w -DSHA3 sha3.o byte_order.o bench-crypto.c && (time -p ./a.out) >$temp 2>&1 && print "sha3  :" $temp
if test -n str86_64; then
    gcc -DSPEED4 -O3 -w -DBLAKE2B blake2b.o bench-crypto.c && (time -p ./a.out) >$temp 2>&1 && print "blake2:" $temp
fi
gcc -DSPEED4 -O3 -w -DMUM512 -I../ bench-crypto.c && (time -p ./a.out) >$temp 2>&1 && print "mum512:" $temp

rm -rf ./a.out $temp sha512.o sha3.o byte_order.o blake2b.o


================================================
FILE: benchmarks/bench-prng.c
================================================
#define N1 100000
#if defined(BBS)
#include "bbs-prng.h"
#define N2 2
static void init_prng (void) { init_bbs_prng (); }
static uint64_t get_prn (void) { return get_bbs_prn (); }
static void finish_prng (void) { finish_bbs_prng (); }
#elif defined(CHACHA)
#include "chacha-prng.h"
#define N2 5000
static void init_prng (void) { init_chacha_prng (); }
static uint64_t get_prn (void) { return get_chacha_prn (); }
static void finish_prng (void) { finish_chacha_prng (); }
#elif defined(SIP24)
#include "sip24-prng.h"
#define N2 5000
static void init_prng (void) { init_sip24_prng (); }
static uint64_t get_prn (void) { return get_sip24_prn (); }
static void finish_prng (void) { finish_sip24_prng (); }
#elif defined(MUM)
#include "mum-prng.h"
#define N2 30000
static void init_prng (void) { mum_hash_randomize (0); init_mum_prng (); }
static uint64_t get_prn (void) { return get_mum_prn (); }
static void finish_prng (void) { finish_mum_prng (); }
#elif defined(MUM512)
#include "mum512-prng.h"
#define N2 2000
static void init_prng (void) { init_mum512_prng (); }
static uint64_t get_prn (void) { return get_mum512_prn (); }
static void finish_prng (void) { finish_mum512_prng (); }
#elif defined(XOROSHIRO128P)
#include "xoroshiro128plus.c"
#define N2 30000
static void init_prng (void) { s[0] = 0xe220a8397b1dcdaf; s[1] = 0x6e789e6aa1b965f4; }
static uint64_t get_prn (void) { return next (); }
static void finish_prng (void) { }
#elif defined(XOROSHIRO128STARSTAR)
#include "xoroshiro128starstar.c"
#define N2 30000
static void init_prng (void) { s[0] = 0xe220a8397b1dcdaf; s[1] = 0x6e789e6aa1b965f4; }
static uint64_t get_prn (void) { return next (); }
static void finish_prng (void) { }
#elif defined(XOSHIRO256P)
#include "xoshiro256plus.c"
#define N2 30000
static void init_prng (void) { s[0] = 0xe220a8397b1dcdaf; s[1] = 0x6e789e6aa1b965f4; s[2] = 0x6c45d188009454f; s[3] = 0xf88bb8a8724c81ec; }
static uint64_t get_prn (void) { return next (); }
static void finish_prng (void) { }
#elif defined(XOSHIRO256STARSTAR)
#include "xoshiro256starstar.c"
#define N2 30000
static void init_prng (void) { s[0] = 0xe220a8397b1dcdaf; s[1] = 0x6e789e6aa1b965f4; s[2] = 0x6c45d188009454f; s[3] = 0xf88bb8a8724c81ec; }
static uint64_t get_prn (void) { return next (); }
static void finish_prng (void) { }
#elif defined(XOSHIRO512P)
#include "xoshiro512plus.c"
#define N2 30000
static void init_prng (void) { s[0] = 0xe220a8397b1dcdaf; s[1] = 0x6e789e6aa1b965f4; s[2] = 0x6c45d188009454f; s[3] = 0xf88bb8a8724c81ec; s[4] = 0x1b39896a51a8749b; s[5] = 0x53cb9f0c747ea2ea; s[6] = 0x2c829abe1f4532e1; s[7] = 0xc584133ac916ab3c; }
static uint64_t get_prn (void) { return next (); }
static void finish_prng (void) { }
#elif defined(XOSHIRO512STARSTAR)
#include "xoshiro512starstar.c"
#define N2 30000
static void init_prng (void) { s[0] = 0xe220a8397b1dcdaf; s[1] = 0x6e789e6aa1b965f4; s[2] = 0x6c45d188009454f; s[3] = 0xf88bb8a8724c81ec; s[4] = 0x1b39896a51a8749b; s[5] = 0x53cb9f0c747ea2ea; s[6] = 0x2c829abe1f4532e1; s[7] = 0xc584133ac916ab3c; }
static uint64_t get_prn (void) { return next (); }
static void finish_prng (void) { }
#elif defined(RAND)
#include <stdlib.h>
#include <stdint.h>
#define N2 7000
static void init_prng (void) { }
static uint64_t get_prn (void) { return rand (); }
static void finish_prng (void) { }
#endif

uint64_t dummy;
#include <stdio.h>
#include <time.h>

#ifdef OUTPUT
#include <stdint.h>

int main(void)
{
    init_prng ();
    while (1) {
        uint64_t value = get_prn();
        fwrite((void*) &value, sizeof(value), 1, stdout);
    }
}

#else
int main (void) {
  int i, j; double d; uint64_t res = 0;
  clock_t t = clock ();
  
  init_prng ();
  for (i = 0; i < N2; i++)
    for (j = 0; j < N1; j++)
      res ^= get_prn ();
  finish_prng ();
  t = clock () - t;
  d = (N1 + 0.0) * N2 * CLOCKS_PER_SEC / t / 1000;
  if (d > 1000)
    printf ("%7.2fM prns/sec\n", d / 1000);
  else
    printf ("%7.2fK prns/sec\n", d);
  dummy = res;
  return 0;
}
#endif


================================================
FILE: benchmarks/bench-prng.sh
================================================
#!/bin/bash

# Benchmarking different Pseudo Random Generators

echo +++pseudo random number generation speed '(PRNs/sec)':
if test x${MUM_ONLY} == x; then
    gcc -DBBS -O3 -w bench-prng.c -lgmp && echo -n 'BBS           : ' && ./a.out 2>&1
    gcc -DCHACHA -O3 -w bench-prng.c && echo -n 'ChaCha        : ' && ./a.out 2>&1
    gcc -DSIP24 -O3 -w bench-prng.c && echo -n 'Sip24         : ' && ./a.out 2>&1
fi
gcc -DMUM512 -DMUM512_ROUNDS=2 -I../ -O3 -w bench-prng.c && echo -n 'MUM512        : ' && ./a.out 2>&1
gcc -DMUM -I../ -O3 -w bench-prng.c && echo -n 'MUM           : ' && ./a.out 2>&1
if test x${MUM_ONLY} == x; then
    gcc -DXOROSHIRO128STARSTAR -I../ -std=c99 -O3 -w bench-prng.c && echo -n 'XOROSHIRO128**: ' && ./a.out 2>&1
    gcc -DXOSHIRO256STARSTAR -I../ -std=c99 -O3 -w bench-prng.c && echo -n 'XOSHIRO256**  : ' && ./a.out 2>&1
    gcc -DXOSHIRO512STARSTAR -I../ -std=c99 -O3 -w bench-prng.c && echo -n 'XOSHIRO512**  : ' && ./a.out 2>&1
    gcc -DRAND -I../ -O3 -w bench-prng.c && echo -n 'RAND          : ' && ./a.out 2>&1
    gcc -DXOROSHIRO128P -I../ -std=c99 -O3 -w bench-prng.c && echo -n 'XOROSHIRO128+ : ' && ./a.out 2>&1
    gcc -DXOSHIRO256P -I../ -std=c99 -O3 -w bench-prng.c && echo -n 'XOSHIRO256+   : ' && ./a.out 2>&1
    gcc -DXOSHIRO512P -I../ -std=c99 -O3 -w bench-prng.c && echo -n 'XOSHIRO512+   : ' && ./a.out 2>&1
fi

rm -rf ./a.out


================================================
FILE: benchmarks/bench.c
================================================
#if defined(Spooky)

#include "SpookyV2.h"
static void SpookyHash64_test (const void *key, int len, uint32_t seed, void *out) {
  *(uint64_t *) out = SpookyHash::Hash64 (key, len, seed);
}

#define test SpookyHash64_test
#define test64 test

#elif defined(City)

#include "City.h"
static void CityHash64_test (const void *key, int len, uint32_t seed, void *out) {
  *(uint64 *) out = CityHash64WithSeed ((const char *) key, len, seed);
}

#define test CityHash64_test
#define test64 test

#elif defined(SipHash)

#include <stdint.h>

extern int siphash (uint8_t *out, const uint8_t *in, uint64_t inlen, const uint8_t *k);

static void siphash_test (const void *key, int len, uint32_t seed, void *out) {
  uint64_t s[2];

  s[0] = seed;
  s[1] = 0;
  siphash (out, (const uint8_t *) key, len, (const uint8_t *) s);
}

#define test siphash_test
#define test64 test

#elif defined(xxHash)

#ifdef _MSC_VER
typedef unsigned __int32 uint32_t;
typedef unsigned __int64 uint64_t;
#else
#include <stdint.h>
#endif

#include "xxhash.c"

static void xxHash64_test (const void *key, int len, uint32_t seed, void *out) {
  *(uint64_t *) out = XXH64 (key, len, seed);
}

#define test xxHash64_test
#define test64 test

#elif defined(xxh3)

#include "xxh3.h"

static void xxh3_test (const void *key, int len, uint32_t seed, void *out) {
  *(uint64_t *) out = XXH3_64bits (key, len);
}

#define test xxh3_test
#define test64 test

#elif defined(T1HA2)

#include "t1ha.h"
static void t1ha_test (const void *key, int len, uint32_t seed, void *out) {
  *(uint64_t *) out = t1ha2_atonce (key, len, seed);
}

#define test t1ha_test
#define test64 test

#elif defined(City)

#include "City.h"
static void CityHash64_test (const void *key, int len, uint32_t seed, void *out) {
  *(uint64 *) out = CityHash64WithSeed ((const char *) key, len, seed);
}

#define test CityHash64_test
#define test64 test

#elif defined(METRO)

#include "metrohash64.h"
static void metro_test (const void *key, int len, uint32_t seed, void *out) {
  MetroHash64::Hash ((const uint8_t *) key, len, (uint8_t *) out, seed);
}

#define test metro_test
#define test64 test

#elif defined(MeowHash)

#ifdef _MSC_VER
typedef unsigned __int32 uint32_t;
typedef unsigned __int64 uint64_t;
#else
#include <stdint.h>
#endif

#include "meow_intrinsics.h"
#include "meow_hash.h"

static void meowhash_test (const void *key, int len, uint32_t seed, void *out) {
  *(uint64_t *) out = MeowU64From (MeowHash_Accelerated (seed, len, key), 0);
}

#define test meowhash_test
#define test64 test

#elif defined(MUM)

#include "mum.h"
static void mum_test (const void *key, int len, uint32_t seed, void *out) {
  *(uint64_t *) out = mum_hash (key, len, seed);
}

static void mum_test64 (const void *key, int len, uint32_t seed, void *out) {
  *(uint64_t *) out = mum_hash64 (*(uint64_t *) key, seed);
}

#define test mum_test
#define test64 mum_test64

#elif defined(VMUM)

#include "vmum.h"
static void mum_test (const void *key, int len, uint32_t seed, void *out) {
  *(uint64_t *) out = vmum_hash (key, len, seed);
}

static void mum_test64 (const void *key, int len, uint32_t seed, void *out) {
  *(uint64_t *) out = vmum_hash64 (*(uint64_t *) key, seed);
}

#define test mum_test
#define test64 mum_test64

#elif defined(RAPID)

#include "rapidhash.h"
static void rapid_test (const void *key, int len, uint32_t seed, void *out) {
  *(uint64_t *) out = rapidhash_withSeed (key, len, seed);
}

#define test rapid_test
#define test64 rapid_test

#else
#error "I don't know what to test"
#endif

#if DATA_LEN == 0

#include <stdlib.h>
#include <stdio.h>
uint32_t arr[16 * 256 * 1024];
int main () {
  int i;
  uint64_t out;

  for (i = 0; i < 16 * 256 * 1024; i++) {
    arr[i] = rand ();
  }
  for (i = 0; i < 10000; i++) test (arr, 16 * 256 * 1024 * 4, 2, &out), arr[0] = out;
  printf ("%s:%llx\n", (size_t) arr & 0x7 ? "unaligned" : "aligned", out);
  return 0;
}

#else

int len = DATA_LEN;
uint64_t k[(DATA_LEN + 7) / 8];
#include <assert.h>
#include <stdlib.h>
#include <stdio.h>
/* We should use external to prevent optimizations for MUM after
   inlining.  Otherwise MUM results will be too good.  */
int main () {
  int i, j, n;
  uint64_t out;

  assert (len <= 1024);
  printf ("%d-byte: %s:\n", len, (size_t) k & 0x7 ? "unaligned" : "aligned");
  for (i = 0; i < sizeof (k) / sizeof (uint64_t); i++) k[i] = i;
  for (j = 0; j < 128; j++)
    for (n = i = 0; i < 10000000; i++) test (k, len, 2, &out), k[0] = out;
  printf ("%llx\n", out);
  return 0;
}

#endif


================================================
FILE: benchmarks/bench.sh
================================================
#!/bin/bash

# Benchmarking different hash functions.

temp=__hash-temp.out
temp2=__hash-temp2.out
temp3=__hash-temp3.out

COPTFLAGS=${COPTFLAGS:--O3}
if test `uname -m` == x86_64; then
    COPTFLAGS=`echo $COPTFLAGS -march=native`
elif test `uname -m` == ppc64; then
    COPTFLAGS=`echo $COPTFLAGS -mcpu=native`
fi
LTO=${LTO:--flto}
CC=${CC:-cc}
CXX=${CXX:-c++}

echo Using ${CC} and ${CXX}

if test x${MUM_ONLY} == x; then
    echo compiling Spooky
    ${CXX} ${COPTFLAGS} ${LTO} -w -c SpookyV2.cpp || exit 1
    echo compiling City
    ${CXX} ${COPTFLAGS} ${LTO} -w -c City.cpp || exit 1
    echo compiling t1ha
    ${CC} ${COPTFLAGS} ${LTO} -w -c t1ha/src/t1ha*.c || exit 1
    echo compiling metrohash64
    ${CXX} ${COPTFLAGS} ${LTO} -w -c metrohash64.cpp || exit 1
    echo compiling SipHash24
    ${CC} ${COPTFLAGS} ${LTO} -w -c siphash24.c || exit 1
fi

rm -f $temp3

percent () {
    val=`awk "BEGIN {if ($2==0) print \"Inf\"; else printf \"%.2f\n\", $1/$2;}"`
    echo "$val"
    echo "$3:$val" >>$temp3
}

skip () {
    l=$1
    n=$2
    while test $l -le $n; do echo -n " "; l=`expr $l + 1`; done
}

print_time() {
    title="$1"
    secs=$2
    printf '%-.2f %5.2fs|' `percent $base_time $secs "$title"` $secs
}

TASKSET=""
if type taskset >/dev/null 2>&1;then TASKSET="taskset -c 0";fi
echo $TASKSET

run () {
  title=$1
  program=$2
  flag=$3
  ok=
  if (time -p $TASKSET $program) >$temp 2>$temp2; then
      ok=y
      (time -p $TASKSET $program) >$temp 2>>$temp2
      (time -p $TASKSET $program) >$temp 2>>$temp2
  fi
  if test x$ok = x;then echo $program: FAILED; return 1; fi
  secs=`grep -E 'user[ 	]*[0-9]' $temp2 | grep -F -v : | sed s/.*user// | sed s/\\t// | sort -n | head -1`
  if test x$flag != x;then base_time=$secs;fi
  print_time "$title" $secs
}

mach=`uname -m`
check_meow=`(test $mach == x86_64 || test $mach == aarch64) && echo yes`

check_meow=
check_xxHash=

echo -n '| Length    |  VMUM-V2  |  VMUM-V1  |  MUM-V4   |  MUM-V3   |  Spooky   |   City    |'
if test "$check_xxHash" == yes; then echo -n '  xxHash   |';fi
if test "$check_rapid" == yes; then echo -n '  Rapidh   |';fi
echo -n '  xxHash3  |   t1ha2   | SipHash24 |   Metro   |'
if test "$check_meow" == yes; then echo ' MeowHash  |'; else echo; fi
echo -n '|:----------|:---------:|:---------:|:---------:|:---------:|:---------:|:---------:|'
if test "$check_xxHash" == yes; then echo -n ':---------:|';fi
if test "$check_rapid" == yes; then echo -n ':---------:|';fi
echo -n ':---------:|:---------:|:---------:|:---------:|'
if test "$check_meow" == yes; then echo ':---------:|'; else echo; fi

for i in 3 4 5 6 7 8 9 10 11 12 13 14 15 16 32 64 96 128 192 256 512 1024 0;do
    if test $i == 0; then echo -n '| Bulk      |'; else printf '|%4d bytes |' $i;fi
    ${CXX} -DDATA_LEN=$i ${COPTFLAGS} -w -fpermissive -DVMUM -I../ bench.c && run "00vMUM-V2" "./a.out" first
    ${CXX} -DDATA_LEN=$i ${COPTFLAGS} -w -fpermissive -DVMUM -DVMUM_V1 -I../ bench.c && run "01vMUM-V1" "./a.out"
    ${CXX} -DDATA_LEN=$i ${COPTFLAGS} -w -fpermissive -DMUM -I../ bench.c && run "02MUM-V4" "./a.out"
    ${CXX} -DDATA_LEN=$i ${COPTFLAGS} -w -fpermissive -DMUM -DMUM_V3 -I../ bench.c && run "03MUM-V3" "./a.out"
    if test x${MUM_ONLY} == x; then
	${CXX} -DDATA_LEN=$i ${COPTFLAGS} ${LTO} -w -fpermissive -DSpooky SpookyV2.o bench.c && run "04Spooky" "./a.out"
	${CXX} -DDATA_LEN=$i ${COPTFLAGS} ${LTO} -w -fpermissive -DCity City.o bench.c && run "05City" "./a.out"
	if test "$check_xxHash" == yes;then
	    ${CXX} -DDATA_LEN=$i ${COPTFLAGS} ${LTO} -w -fpermissive -DxxHash bench.c && run "06xxHash" "./a.out"
	fi
	if test "$check_rapid" == yes;then
            ${CXX} -DDATA_LEN=$i ${COPTFLAGS} -w -fpermissive -DRAPID -I../ bench.c && run "07vRAPID" "./a.out"
	fi
	${CXX} -DDATA_LEN=$i ${COPTFLAGS} ${LTO} -w -fpermissive -Dxxh3 bench.c && run "08xxh3" "./a.out"
	${CC} -DDATA_LEN=$i ${COPTFLAGS} ${LTO} -w -fpermissive -It1ha -DT1HA2 t1ha*.o bench.c && run "09t1ha2" "./a.out"
	${CC} -DDATA_LEN=$i ${COPTFLAGS} ${LTO} -w -fpermissive -DSipHash siphash24.o bench.c && run "10Siphash24" "./a.out"
	${CXX} -DDATA_LEN=$i ${COPTFLAGS} ${LTO} -w -fpermissive -DMETRO metrohash64.o -I../ bench.c && run "11Metro" "./a.out"
	if test "$check_meow" == yes && test $mach == x86_64; then
	    ${CXX} -DDATA_LEN=$i ${COPTFLAGS} -w -mavx2 -maes -fpermissive -DMeowHash -I../ bench.c && run "12Meowhash" "./a.out"
	elif test "$check_meow" == yes && test $mach == aarch64; then
	    ${CXX} -DDATA_LEN=$i ${COPTFLAGS} -w -march=native -fpermissive -DMeowHash -I../ bench.c && run "13Meowhash" "./a.out"
	fi
    fi
    echo
done

echo -n '| Average   |'
for i in `awk -F: '{print $1}' $temp3|sort|uniq`; do
    printf '%-10.2f |' `awk -F: -v name="$i" 'name==$1 {f = f + $2; n++} END {printf "%0.2f\n", f / n}' $temp3`
done
echo

echo -n '| Geomean   |'
for i in `awk -F: '{print $1}' $temp3|sort|uniq`; do
    printf '%-10.2f |' `awk -F: -v name="$i" 'BEGIN{f=1.0} name==$1 {f = f * $2; n++} END {printf "%0.2f\n", exp (log(f)/n)}' $temp3`
done
echo

rm -rf ./a.out $temp $temp2 $temp3 SpookyV2.o City.o siphash24.o t1ha*.o


================================================
FILE: benchmarks/blake2-config.h
================================================
/*
   BLAKE2 reference source code package - optimized C implementations

   Copyright 2012, Samuel Neves <sneves@dei.uc.pt>.  You may use this under the
   terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
   your option.  The terms of these licenses can be found at:

   - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
   - OpenSSL license   : https://www.openssl.org/source/license.html
   - Apache 2.0        : http://www.apache.org/licenses/LICENSE-2.0

   More information about the BLAKE2 hash function can be found at
   https://blake2.net.
*/
#pragma once
#ifndef __BLAKE2_CONFIG_H__
#define __BLAKE2_CONFIG_H__

/* These don't work everywhere */
#if defined(__SSE2__) || defined(__x86_64__) || defined(__amd64__)
#define HAVE_SSE2
#endif

#if defined(__SSSE3__)
#define HAVE_SSSE3
#endif

#if defined(__SSE4_1__)
#define HAVE_SSE41
#endif

#if defined(__AVX__)
#define HAVE_AVX
#endif

#if defined(__XOP__)
#define HAVE_XOP
#endif


#ifdef HAVE_AVX2
#ifndef HAVE_AVX
#define HAVE_AVX
#endif
#endif

#ifdef HAVE_XOP
#ifndef HAVE_AVX
#define HAVE_AVX
#endif
#endif

#ifdef HAVE_AVX
#ifndef HAVE_SSE41
#define HAVE_SSE41
#endif
#endif

#ifdef HAVE_SSE41
#ifndef HAVE_SSSE3
#define HAVE_SSSE3
#endif
#endif

#ifdef HAVE_SSSE3
#define HAVE_SSE2
#endif

#if !defined(HAVE_SSE2)
#error "This code requires at least SSE2."
#endif

#endif


================================================
FILE: benchmarks/blake2-impl.h
================================================
/*
   BLAKE2 reference source code package - optimized C implementations
  
   Copyright 2012, Samuel Neves <sneves@dei.uc.pt>.  You may use this under the
   terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
   your option.  The terms of these licenses can be found at:
  
   - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
   - OpenSSL license   : https://www.openssl.org/source/license.html
   - Apache 2.0        : http://www.apache.org/licenses/LICENSE-2.0
  
   More information about the BLAKE2 hash function can be found at
   https://blake2.net.
*/
#pragma once
#ifndef __BLAKE2_IMPL_H__
#define __BLAKE2_IMPL_H__

#include <stdint.h>
#include <string.h>

BLAKE2_LOCAL_INLINE(uint32_t) load32( const void *src )
{
#if defined(NATIVE_LITTLE_ENDIAN)
  uint32_t w;
  memcpy(&w, src, sizeof w);
  return w;
#else
  const uint8_t *p = ( const uint8_t * )src;
  uint32_t w = *p++;
  w |= ( uint32_t )( *p++ ) <<  8;
  w |= ( uint32_t )( *p++ ) << 16;
  w |= ( uint32_t )( *p++ ) << 24;
  return w;
#endif
}

BLAKE2_LOCAL_INLINE(uint64_t) load64( const void *src )
{
#if defined(NATIVE_LITTLE_ENDIAN)
  uint64_t w;
  memcpy(&w, src, sizeof w);
  return w;
#else
  const uint8_t *p = ( const uint8_t * )src;
  uint64_t w = *p++;
  w |= ( uint64_t )( *p++ ) <<  8;
  w |= ( uint64_t )( *p++ ) << 16;
  w |= ( uint64_t )( *p++ ) << 24;
  w |= ( uint64_t )( *p++ ) << 32;
  w |= ( uint64_t )( *p++ ) << 40;
  w |= ( uint64_t )( *p++ ) << 48;
  w |= ( uint64_t )( *p++ ) << 56;
  return w;
#endif
}

BLAKE2_LOCAL_INLINE(void) store32( void *dst, uint32_t w )
{
#if defined(NATIVE_LITTLE_ENDIAN)
  memcpy(dst, &w, sizeof w);
#else
  uint8_t *p = ( uint8_t * )dst;
  *p++ = ( uint8_t )w; w >>= 8;
  *p++ = ( uint8_t )w; w >>= 8;
  *p++ = ( uint8_t )w; w >>= 8;
  *p++ = ( uint8_t )w;
#endif
}

BLAKE2_LOCAL_INLINE(void) store64( void *dst, uint64_t w )
{
#if defined(NATIVE_LITTLE_ENDIAN)
  memcpy(dst, &w, sizeof w);
#else
  uint8_t *p = ( uint8_t * )dst;
  *p++ = ( uint8_t )w; w >>= 8;
  *p++ = ( uint8_t )w; w >>= 8;
  *p++ = ( uint8_t )w; w >>= 8;
  *p++ = ( uint8_t )w; w >>= 8;
  *p++ = ( uint8_t )w; w >>= 8;
  *p++ = ( uint8_t )w; w >>= 8;
  *p++ = ( uint8_t )w; w >>= 8;
  *p++ = ( uint8_t )w;
#endif
}

BLAKE2_LOCAL_INLINE(uint64_t) load48( const void *src )
{
  const uint8_t *p = ( const uint8_t * )src;
  uint64_t w = *p++;
  w |= ( uint64_t )( *p++ ) <<  8;
  w |= ( uint64_t )( *p++ ) << 16;
  w |= ( uint64_t )( *p++ ) << 24;
  w |= ( uint64_t )( *p++ ) << 32;
  w |= ( uint64_t )( *p++ ) << 40;
  return w;
}

BLAKE2_LOCAL_INLINE(void) store48( void *dst, uint64_t w )
{
  uint8_t *p = ( uint8_t * )dst;
  *p++ = ( uint8_t )w; w >>= 8;
  *p++ = ( uint8_t )w; w >>= 8;
  *p++ = ( uint8_t )w; w >>= 8;
  *p++ = ( uint8_t )w; w >>= 8;
  *p++ = ( uint8_t )w; w >>= 8;
  *p++ = ( uint8_t )w;
}

BLAKE2_LOCAL_INLINE(uint32_t) rotl32( const uint32_t w, const unsigned c )
{
  return ( w << c ) | ( w >> ( 32 - c ) );
}

BLAKE2_LOCAL_INLINE(uint64_t) rotl64( const uint64_t w, const unsigned c )
{
  return ( w << c ) | ( w >> ( 64 - c ) );
}

BLAKE2_LOCAL_INLINE(uint32_t) rotr32( const uint32_t w, const unsigned c )
{
  return ( w >> c ) | ( w << ( 32 - c ) );
}

BLAKE2_LOCAL_INLINE(uint64_t) rotr64( const uint64_t w, const unsigned c )
{
  return ( w >> c ) | ( w << ( 64 - c ) );
}

/* prevents compiler optimizing out memset() */
BLAKE2_LOCAL_INLINE(void) secure_zero_memory(void *v, size_t n)
{
  static void *(*const volatile memset_v)(void *, int, size_t) = &memset;
  memset_v(v, 0, n);
}

#endif


================================================
FILE: benchmarks/blake2.h
================================================
/*
   BLAKE2 reference source code package - reference C implementations
  
   Copyright 2012, Samuel Neves <sneves@dei.uc.pt>.  You may use this under the
   terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
   your option.  The terms of these licenses can be found at:
  
   - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
   - OpenSSL license   : https://www.openssl.org/source/license.html
   - Apache 2.0        : http://www.apache.org/licenses/LICENSE-2.0
  
   More information about the BLAKE2 hash function can be found at
   https://blake2.net.
*/
#pragma once
#ifndef __BLAKE2_H__
#define __BLAKE2_H__

#include <stddef.h>
#include <stdint.h>

#ifdef BLAKE2_NO_INLINE
#define BLAKE2_LOCAL_INLINE(type) static type
#endif

#ifndef BLAKE2_LOCAL_INLINE
#define BLAKE2_LOCAL_INLINE(type) static inline type
#endif

#if defined(__cplusplus)
extern "C" {
#endif

  enum blake2s_constant
  {
    BLAKE2S_BLOCKBYTES = 64,
    BLAKE2S_OUTBYTES   = 32,
    BLAKE2S_KEYBYTES   = 32,
    BLAKE2S_SALTBYTES  = 8,
    BLAKE2S_PERSONALBYTES = 8
  };

  enum blake2b_constant
  {
    BLAKE2B_BLOCKBYTES = 128,
    BLAKE2B_OUTBYTES   = 64,
    BLAKE2B_KEYBYTES   = 64,
    BLAKE2B_SALTBYTES  = 16,
    BLAKE2B_PERSONALBYTES = 16
  };

  typedef struct __blake2s_state
  {
    uint32_t h[8];
    uint32_t t[2];
    uint32_t f[2];
    uint8_t  buf[2 * BLAKE2S_BLOCKBYTES];
    size_t   buflen;
    uint8_t  last_node;
  } blake2s_state;

  typedef struct __blake2b_state
  {
    uint64_t h[8];
    uint64_t t[2];
    uint64_t f[2];
    uint8_t  buf[2 * BLAKE2B_BLOCKBYTES];
    size_t   buflen;
    uint8_t  last_node;
  } blake2b_state;

  typedef struct __blake2sp_state
  {
    blake2s_state S[8][1];
    blake2s_state R[1];
    uint8_t buf[8 * BLAKE2S_BLOCKBYTES];
    size_t  buflen;
  } blake2sp_state;

  typedef struct __blake2bp_state
  {
    blake2b_state S[4][1];
    blake2b_state R[1];
    uint8_t buf[4 * BLAKE2B_BLOCKBYTES];
    size_t  buflen;
  } blake2bp_state;


#pragma pack(push, 1)
  typedef struct __blake2s_param
  {
    uint8_t  digest_length; /* 1 */
    uint8_t  key_length;    /* 2 */
    uint8_t  fanout;        /* 3 */
    uint8_t  depth;         /* 4 */
    uint32_t leaf_length;   /* 8 */
    uint8_t  node_offset[6];// 14
    uint8_t  node_depth;    /* 15 */
    uint8_t  inner_length;  /* 16 */
    /* uint8_t  reserved[0]; */
    uint8_t  salt[BLAKE2S_SALTBYTES]; /* 24 */
    uint8_t  personal[BLAKE2S_PERSONALBYTES];  /* 32 */
  } blake2s_param;

  typedef struct __blake2b_param
  {
    uint8_t  digest_length; /* 1 */
    uint8_t  key_length;    /* 2 */
    uint8_t  fanout;        /* 3 */
    uint8_t  depth;         /* 4 */
    uint32_t leaf_length;   /* 8 */
    uint64_t node_offset;   /* 16 */
    uint8_t  node_depth;    /* 17 */
    uint8_t  inner_length;  /* 18 */
    uint8_t  reserved[14];  /* 32 */
    uint8_t  salt[BLAKE2B_SALTBYTES]; /* 48 */
    uint8_t  personal[BLAKE2B_PERSONALBYTES];  /* 64 */
  } blake2b_param;
#pragma pack(pop)

  /* Streaming API */
  int blake2s_init( blake2s_state *S, const uint8_t outlen );
  int blake2s_init_key( blake2s_state *S, const uint8_t outlen, const void *key, const uint8_t keylen );
  int blake2s_init_param( blake2s_state *S, const blake2s_param *P );
  int blake2s_update( blake2s_state *S, const uint8_t *in, uint64_t inlen );
  int blake2s_final( blake2s_state *S, uint8_t *out, uint8_t outlen );

  int blake2b_init( blake2b_state *S, const uint8_t outlen );
  int blake2b_init_key( blake2b_state *S, const uint8_t outlen, const void *key, const uint8_t keylen );
  int blake2b_init_param( blake2b_state *S, const blake2b_param *P );
  int blake2b_update( blake2b_state *S, const uint8_t *in, uint64_t inlen );
  int blake2b_final( blake2b_state *S, uint8_t *out, uint8_t outlen );

  int blake2sp_init( blake2sp_state *S, const uint8_t outlen );
  int blake2sp_init_key( blake2sp_state *S, const uint8_t outlen, const void *key, const uint8_t keylen );
  int blake2sp_update( blake2sp_state *S, const uint8_t *in, uint64_t inlen );
  int blake2sp_final( blake2sp_state *S, uint8_t *out, uint8_t outlen );

  int blake2bp_init( blake2bp_state *S, const uint8_t outlen );
  int blake2bp_init_key( blake2bp_state *S, const uint8_t outlen, const void *key, const uint8_t keylen );
  int blake2bp_update( blake2bp_state *S, const uint8_t *in, uint64_t inlen );
  int blake2bp_final( blake2bp_state *S, uint8_t *out, uint8_t outlen );

  /* Simple API */
  int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen );
  int blake2b( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen );

  int blake2sp( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen );
  int blake2bp( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen );

  static inline int blake2( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen )
  {
    return blake2b( out, in, key, outlen, inlen, keylen );
  }

#if defined(__cplusplus)
}
#endif

#endif


================================================
FILE: benchmarks/blake2b-load-sse2.h
================================================
/*
   BLAKE2 reference source code package - optimized C implementations
  
   Copyright 2012, Samuel Neves <sneves@dei.uc.pt>.  You may use this under the
   terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
   your option.  The terms of these licenses can be found at:
  
   - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
   - OpenSSL license   : https://www.openssl.org/source/license.html
   - Apache 2.0        : http://www.apache.org/licenses/LICENSE-2.0
  
   More information about the BLAKE2 hash function can be found at
   https://blake2.net.
*/
#pragma once
#ifndef __BLAKE2B_LOAD_SSE2_H__
#define __BLAKE2B_LOAD_SSE2_H__

#define LOAD_MSG_0_1(b0, b1) b0 = _mm_set_epi64x(m2, m0); b1 = _mm_set_epi64x(m6, m4)
#define LOAD_MSG_0_2(b0, b1) b0 = _mm_set_epi64x(m3, m1); b1 = _mm_set_epi64x(m7, m5)
#define LOAD_MSG_0_3(b0, b1) b0 = _mm_set_epi64x(m10, m8); b1 = _mm_set_epi64x(m14, m12)
#define LOAD_MSG_0_4(b0, b1) b0 = _mm_set_epi64x(m11, m9); b1 = _mm_set_epi64x(m15, m13)
#define LOAD_MSG_1_1(b0, b1) b0 = _mm_set_epi64x(m4, m14); b1 = _mm_set_epi64x(m13, m9)
#define LOAD_MSG_1_2(b0, b1) b0 = _mm_set_epi64x(m8, m10); b1 = _mm_set_epi64x(m6, m15)
#define LOAD_MSG_1_3(b0, b1) b0 = _mm_set_epi64x(m0, m1); b1 = _mm_set_epi64x(m5, m11)
#define LOAD_MSG_1_4(b0, b1) b0 = _mm_set_epi64x(m2, m12); b1 = _mm_set_epi64x(m3, m7)
#define LOAD_MSG_2_1(b0, b1) b0 = _mm_set_epi64x(m12, m11); b1 = _mm_set_epi64x(m15, m5)
#define LOAD_MSG_2_2(b0, b1) b0 = _mm_set_epi64x(m0, m8); b1 = _mm_set_epi64x(m13, m2)
#define LOAD_MSG_2_3(b0, b1) b0 = _mm_set_epi64x(m3, m10); b1 = _mm_set_epi64x(m9, m7)
#define LOAD_MSG_2_4(b0, b1) b0 = _mm_set_epi64x(m6, m14); b1 = _mm_set_epi64x(m4, m1)
#define LOAD_MSG_3_1(b0, b1) b0 = _mm_set_epi64x(m3, m7); b1 = _mm_set_epi64x(m11, m13)
#define LOAD_MSG_3_2(b0, b1) b0 = _mm_set_epi64x(m1, m9); b1 = _mm_set_epi64x(m14, m12)
#define LOAD_MSG_3_3(b0, b1) b0 = _mm_set_epi64x(m5, m2); b1 = _mm_set_epi64x(m15, m4)
#define LOAD_MSG_3_4(b0, b1) b0 = _mm_set_epi64x(m10, m6); b1 = _mm_set_epi64x(m8, m0)
#define LOAD_MSG_4_1(b0, b1) b0 = _mm_set_epi64x(m5, m9); b1 = _mm_set_epi64x(m10, m2)
#define LOAD_MSG_4_2(b0, b1) b0 = _mm_set_epi64x(m7, m0); b1 = _mm_set_epi64x(m15, m4)
#define LOAD_MSG_4_3(b0, b1) b0 = _mm_set_epi64x(m11, m14); b1 = _mm_set_epi64x(m3, m6)
#define LOAD_MSG_4_4(b0, b1) b0 = _mm_set_epi64x(m12, m1); b1 = _mm_set_epi64x(m13, m8)
#define LOAD_MSG_5_1(b0, b1) b0 = _mm_set_epi64x(m6, m2); b1 = _mm_set_epi64x(m8, m0)
#define LOAD_MSG_5_2(b0, b1) b0 = _mm_set_epi64x(m10, m12); b1 = _mm_set_epi64x(m3, m11)
#define LOAD_MSG_5_3(b0, b1) b0 = _mm_set_epi64x(m7, m4); b1 = _mm_set_epi64x(m1, m15)
#define LOAD_MSG_5_4(b0, b1) b0 = _mm_set_epi64x(m5, m13); b1 = _mm_set_epi64x(m9, m14)
#define LOAD_MSG_6_1(b0, b1) b0 = _mm_set_epi64x(m1, m12); b1 = _mm_set_epi64x(m4, m14)
#define LOAD_MSG_6_2(b0, b1) b0 = _mm_set_epi64x(m15, m5); b1 = _mm_set_epi64x(m10, m13)
#define LOAD_MSG_6_3(b0, b1) b0 = _mm_set_epi64x(m6, m0); b1 = _mm_set_epi64x(m8, m9)
#define LOAD_MSG_6_4(b0, b1) b0 = _mm_set_epi64x(m3, m7); b1 = _mm_set_epi64x(m11, m2)
#define LOAD_MSG_7_1(b0, b1) b0 = _mm_set_epi64x(m7, m13); b1 = _mm_set_epi64x(m3, m12)
#define LOAD_MSG_7_2(b0, b1) b0 = _mm_set_epi64x(m14, m11); b1 = _mm_set_epi64x(m9, m1)
#define LOAD_MSG_7_3(b0, b1) b0 = _mm_set_epi64x(m15, m5); b1 = _mm_set_epi64x(m2, m8)
#define LOAD_MSG_7_4(b0, b1) b0 = _mm_set_epi64x(m4, m0); b1 = _mm_set_epi64x(m10, m6)
#define LOAD_MSG_8_1(b0, b1) b0 = _mm_set_epi64x(m14, m6); b1 = _mm_set_epi64x(m0, m11)
#define LOAD_MSG_8_2(b0, b1) b0 = _mm_set_epi64x(m9, m15); b1 = _mm_set_epi64x(m8, m3)
#define LOAD_MSG_8_3(b0, b1) b0 = _mm_set_epi64x(m13, m12); b1 = _mm_set_epi64x(m10, m1)
#define LOAD_MSG_8_4(b0, b1) b0 = _mm_set_epi64x(m7, m2); b1 = _mm_set_epi64x(m5, m4)
#define LOAD_MSG_9_1(b0, b1) b0 = _mm_set_epi64x(m8, m10); b1 = _mm_set_epi64x(m1, m7)
#define LOAD_MSG_9_2(b0, b1) b0 = _mm_set_epi64x(m4, m2); b1 = _mm_set_epi64x(m5, m6)
#define LOAD_MSG_9_3(b0, b1) b0 = _mm_set_epi64x(m9, m15); b1 = _mm_set_epi64x(m13, m3)
#define LOAD_MSG_9_4(b0, b1) b0 = _mm_set_epi64x(m14, m11); b1 = _mm_set_epi64x(m0, m12)
#define LOAD_MSG_10_1(b0, b1) b0 = _mm_set_epi64x(m2, m0); b1 = _mm_set_epi64x(m6, m4)
#define LOAD_MSG_10_2(b0, b1) b0 = _mm_set_epi64x(m3, m1); b1 = _mm_set_epi64x(m7, m5)
#define LOAD_MSG_10_3(b0, b1) b0 = _mm_set_epi64x(m10, m8); b1 = _mm_set_epi64x(m14, m12)
#define LOAD_MSG_10_4(b0, b1) b0 = _mm_set_epi64x(m11, m9); b1 = _mm_set_epi64x(m15, m13)
#define LOAD_MSG_11_1(b0, b1) b0 = _mm_set_epi64x(m4, m14); b1 = _mm_set_epi64x(m13, m9)
#define LOAD_MSG_11_2(b0, b1) b0 = _mm_set_epi64x(m8, m10); b1 = _mm_set_epi64x(m6, m15)
#define LOAD_MSG_11_3(b0, b1) b0 = _mm_set_epi64x(m0, m1); b1 = _mm_set_epi64x(m5, m11)
#define LOAD_MSG_11_4(b0, b1) b0 = _mm_set_epi64x(m2, m12); b1 = _mm_set_epi64x(m3, m7)


#endif


================================================
FILE: benchmarks/blake2b-load-sse41.h
================================================
/*
   BLAKE2 reference source code package - optimized C implementations
  
   Copyright 2012, Samuel Neves <sneves@dei.uc.pt>.  You may use this under the
   terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
   your option.  The terms of these licenses can be found at:
  
   - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
   - OpenSSL license   : https://www.openssl.org/source/license.html
   - Apache 2.0        : http://www.apache.org/licenses/LICENSE-2.0
  
   More information about the BLAKE2 hash function can be found at
   https://blake2.net.
*/
#pragma once
#ifndef __BLAKE2B_LOAD_SSE41_H__
#define __BLAKE2B_LOAD_SSE41_H__

#define LOAD_MSG_0_1(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m0, m1); \
b1 = _mm_unpacklo_epi64(m2, m3); \
} while(0)


#define LOAD_MSG_0_2(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m0, m1); \
b1 = _mm_unpackhi_epi64(m2, m3); \
} while(0)


#define LOAD_MSG_0_3(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m4, m5); \
b1 = _mm_unpacklo_epi64(m6, m7); \
} while(0)


#define LOAD_MSG_0_4(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m4, m5); \
b1 = _mm_unpackhi_epi64(m6, m7); \
} while(0)


#define LOAD_MSG_1_1(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m7, m2); \
b1 = _mm_unpackhi_epi64(m4, m6); \
} while(0)


#define LOAD_MSG_1_2(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m5, m4); \
b1 = _mm_alignr_epi8(m3, m7, 8); \
} while(0)


#define LOAD_MSG_1_3(b0, b1) \
do \
{ \
b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \
b1 = _mm_unpackhi_epi64(m5, m2); \
} while(0)


#define LOAD_MSG_1_4(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m6, m1); \
b1 = _mm_unpackhi_epi64(m3, m1); \
} while(0)


#define LOAD_MSG_2_1(b0, b1) \
do \
{ \
b0 = _mm_alignr_epi8(m6, m5, 8); \
b1 = _mm_unpackhi_epi64(m2, m7); \
} while(0)


#define LOAD_MSG_2_2(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m4, m0); \
b1 = _mm_blend_epi16(m1, m6, 0xF0); \
} while(0)


#define LOAD_MSG_2_3(b0, b1) \
do \
{ \
b0 = _mm_blend_epi16(m5, m1, 0xF0); \
b1 = _mm_unpackhi_epi64(m3, m4); \
} while(0)


#define LOAD_MSG_2_4(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m7, m3); \
b1 = _mm_alignr_epi8(m2, m0, 8); \
} while(0)


#define LOAD_MSG_3_1(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m3, m1); \
b1 = _mm_unpackhi_epi64(m6, m5); \
} while(0)


#define LOAD_MSG_3_2(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m4, m0); \
b1 = _mm_unpacklo_epi64(m6, m7); \
} while(0)


#define LOAD_MSG_3_3(b0, b1) \
do \
{ \
b0 = _mm_blend_epi16(m1, m2, 0xF0); \
b1 = _mm_blend_epi16(m2, m7, 0xF0); \
} while(0)


#define LOAD_MSG_3_4(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m3, m5); \
b1 = _mm_unpacklo_epi64(m0, m4); \
} while(0)


#define LOAD_MSG_4_1(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m4, m2); \
b1 = _mm_unpacklo_epi64(m1, m5); \
} while(0)


#define LOAD_MSG_4_2(b0, b1) \
do \
{ \
b0 = _mm_blend_epi16(m0, m3, 0xF0); \
b1 = _mm_blend_epi16(m2, m7, 0xF0); \
} while(0)


#define LOAD_MSG_4_3(b0, b1) \
do \
{ \
b0 = _mm_blend_epi16(m7, m5, 0xF0); \
b1 = _mm_blend_epi16(m3, m1, 0xF0); \
} while(0)


#define LOAD_MSG_4_4(b0, b1) \
do \
{ \
b0 = _mm_alignr_epi8(m6, m0, 8); \
b1 = _mm_blend_epi16(m4, m6, 0xF0); \
} while(0)


#define LOAD_MSG_5_1(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m1, m3); \
b1 = _mm_unpacklo_epi64(m0, m4); \
} while(0)


#define LOAD_MSG_5_2(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m6, m5); \
b1 = _mm_unpackhi_epi64(m5, m1); \
} while(0)


#define LOAD_MSG_5_3(b0, b1) \
do \
{ \
b0 = _mm_blend_epi16(m2, m3, 0xF0); \
b1 = _mm_unpackhi_epi64(m7, m0); \
} while(0)


#define LOAD_MSG_5_4(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m6, m2); \
b1 = _mm_blend_epi16(m7, m4, 0xF0); \
} while(0)


#define LOAD_MSG_6_1(b0, b1) \
do \
{ \
b0 = _mm_blend_epi16(m6, m0, 0xF0); \
b1 = _mm_unpacklo_epi64(m7, m2); \
} while(0)


#define LOAD_MSG_6_2(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m2, m7); \
b1 = _mm_alignr_epi8(m5, m6, 8); \
} while(0)


#define LOAD_MSG_6_3(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m0, m3); \
b1 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2)); \
} while(0)


#define LOAD_MSG_6_4(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m3, m1); \
b1 = _mm_blend_epi16(m1, m5, 0xF0); \
} while(0)


#define LOAD_MSG_7_1(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m6, m3); \
b1 = _mm_blend_epi16(m6, m1, 0xF0); \
} while(0)


#define LOAD_MSG_7_2(b0, b1) \
do \
{ \
b0 = _mm_alignr_epi8(m7, m5, 8); \
b1 = _mm_unpackhi_epi64(m0, m4); \
} while(0)


#define LOAD_MSG_7_3(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m2, m7); \
b1 = _mm_unpacklo_epi64(m4, m1); \
} while(0)


#define LOAD_MSG_7_4(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m0, m2); \
b1 = _mm_unpacklo_epi64(m3, m5); \
} while(0)


#define LOAD_MSG_8_1(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m3, m7); \
b1 = _mm_alignr_epi8(m0, m5, 8); \
} while(0)


#define LOAD_MSG_8_2(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m7, m4); \
b1 = _mm_alignr_epi8(m4, m1, 8); \
} while(0)


#define LOAD_MSG_8_3(b0, b1) \
do \
{ \
b0 = m6; \
b1 = _mm_alignr_epi8(m5, m0, 8); \
} while(0)


#define LOAD_MSG_8_4(b0, b1) \
do \
{ \
b0 = _mm_blend_epi16(m1, m3, 0xF0); \
b1 = m2; \
} while(0)


#define LOAD_MSG_9_1(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m5, m4); \
b1 = _mm_unpackhi_epi64(m3, m0); \
} while(0)


#define LOAD_MSG_9_2(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m1, m2); \
b1 = _mm_blend_epi16(m3, m2, 0xF0); \
} while(0)


#define LOAD_MSG_9_3(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m7, m4); \
b1 = _mm_unpackhi_epi64(m1, m6); \
} while(0)


#define LOAD_MSG_9_4(b0, b1) \
do \
{ \
b0 = _mm_alignr_epi8(m7, m5, 8); \
b1 = _mm_unpacklo_epi64(m6, m0); \
} while(0)


#define LOAD_MSG_10_1(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m0, m1); \
b1 = _mm_unpacklo_epi64(m2, m3); \
} while(0)


#define LOAD_MSG_10_2(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m0, m1); \
b1 = _mm_unpackhi_epi64(m2, m3); \
} while(0)


#define LOAD_MSG_10_3(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m4, m5); \
b1 = _mm_unpacklo_epi64(m6, m7); \
} while(0)


#define LOAD_MSG_10_4(b0, b1) \
do \
{ \
b0 = _mm_unpackhi_epi64(m4, m5); \
b1 = _mm_unpackhi_epi64(m6, m7); \
} while(0)


#define LOAD_MSG_11_1(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m7, m2); \
b1 = _mm_unpackhi_epi64(m4, m6); \
} while(0)


#define LOAD_MSG_11_2(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m5, m4); \
b1 = _mm_alignr_epi8(m3, m7, 8); \
} while(0)


#define LOAD_MSG_11_3(b0, b1) \
do \
{ \
b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \
b1 = _mm_unpackhi_epi64(m5, m2); \
} while(0)


#define LOAD_MSG_11_4(b0, b1) \
do \
{ \
b0 = _mm_unpacklo_epi64(m6, m1); \
b1 = _mm_unpackhi_epi64(m3, m1); \
} while(0)


#endif


================================================
FILE: benchmarks/blake2b-round.h
================================================
/*
   BLAKE2 reference source code package - optimized C implementations
  
   Copyright 2012, Samuel Neves <sneves@dei.uc.pt>.  You may use this under the
   terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
   your option.  The terms of these licenses can be found at:
  
   - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
   - OpenSSL license   : https://www.openssl.org/source/license.html
   - Apache 2.0        : http://www.apache.org/licenses/LICENSE-2.0
  
   More information about the BLAKE2 hash function can be found at
   https://blake2.net.
*/
#pragma once
#ifndef __BLAKE2B_ROUND_H__
#define __BLAKE2B_ROUND_H__

#define LOADU(p)  _mm_loadu_si128( (const __m128i *)(p) )
#define STOREU(p,r) _mm_storeu_si128((__m128i *)(p), r)

#define TOF(reg) _mm_castsi128_ps((reg))
#define TOI(reg) _mm_castps_si128((reg))

#define LIKELY(x) __builtin_expect((x),1)


/* Microarchitecture-specific macros */
#ifndef HAVE_XOP
#ifdef HAVE_SSSE3
#define _mm_roti_epi64(x, c) \
    (-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1))  \
    : (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \
    : (-(c) == 16) ? _mm_shuffle_epi8((x), r16) \
    : (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_add_epi64((x), (x)))  \
    : _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-(c))))
#else
#define _mm_roti_epi64(r, c) _mm_xor_si128(_mm_srli_epi64( (r), -(c) ),_mm_slli_epi64( (r), 64-(-(c)) ))
#endif
#else
/* ... */
#endif


#define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
  \
  row4l = _mm_xor_si128(row4l, row1l); \
  row4h = _mm_xor_si128(row4h, row1h); \
  \
  row4l = _mm_roti_epi64(row4l, -32); \
  row4h = _mm_roti_epi64(row4h, -32); \
  \
  row3l = _mm_add_epi64(row3l, row4l); \
  row3h = _mm_add_epi64(row3h, row4h); \
  \
  row2l = _mm_xor_si128(row2l, row3l); \
  row2h = _mm_xor_si128(row2h, row3h); \
  \
  row2l = _mm_roti_epi64(row2l, -24); \
  row2h = _mm_roti_epi64(row2h, -24); \
 
#define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
  \
  row4l = _mm_xor_si128(row4l, row1l); \
  row4h = _mm_xor_si128(row4h, row1h); \
  \
  row4l = _mm_roti_epi64(row4l, -16); \
  row4h = _mm_roti_epi64(row4h, -16); \
  \
  row3l = _mm_add_epi64(row3l, row4l); \
  row3h = _mm_add_epi64(row3h, row4h); \
  \
  row2l = _mm_xor_si128(row2l, row3l); \
  row2h = _mm_xor_si128(row2h, row3h); \
  \
  row2l = _mm_roti_epi64(row2l, -63); \
  row2h = _mm_roti_epi64(row2h, -63); \
 
#if defined(HAVE_SSSE3)
#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
  t0 = _mm_alignr_epi8(row2h, row2l, 8); \
  t1 = _mm_alignr_epi8(row2l, row2h, 8); \
  row2l = t0; \
  row2h = t1; \
  \
  t0 = row3l; \
  row3l = row3h; \
  row3h = t0;    \
  \
  t0 = _mm_alignr_epi8(row4h, row4l, 8); \
  t1 = _mm_alignr_epi8(row4l, row4h, 8); \
  row4l = t1; \
  row4h = t0;

#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
  t0 = _mm_alignr_epi8(row2l, row2h, 8); \
  t1 = _mm_alignr_epi8(row2h, row2l, 8); \
  row2l = t0; \
  row2h = t1; \
  \
  t0 = row3l; \
  row3l = row3h; \
  row3h = t0; \
  \
  t0 = _mm_alignr_epi8(row4l, row4h, 8); \
  t1 = _mm_alignr_epi8(row4h, row4l, 8); \
  row4l = t1; \
  row4h = t0;
#else

#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
  t0 = row4l;\
  t1 = row2l;\
  row4l = row3l;\
  row3l = row3h;\
  row3h = row4l;\
  row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); \
  row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); \
  row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); \
  row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1))

#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
  t0 = row3l;\
  row3l = row3h;\
  row3h = t0;\
  t0 = row2l;\
  t1 = row4l;\
  row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); \
  row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); \
  row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); \
  row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1))

#endif

#if defined(HAVE_SSE41)
#include "blake2b-load-sse41.h"
#else
#include "blake2b-load-sse2.h"
#endif

#define ROUND(r) \
  LOAD_MSG_ ##r ##_1(b0, b1); \
  G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
  LOAD_MSG_ ##r ##_2(b0, b1); \
  G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
  DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
  LOAD_MSG_ ##r ##_3(b0, b1); \
  G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
  LOAD_MSG_ ##r ##_4(b0, b1); \
  G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
  UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);

#endif


================================================
FILE: benchmarks/blake2b.c
================================================
/*
   BLAKE2 reference source code package - optimized C implementations
  
   Copyright 2012, Samuel Neves <sneves@dei.uc.pt>.  You may use this under the
   terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
   your option.  The terms of these licenses can be found at:
  
   - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
   - OpenSSL license   : https://www.openssl.org/source/license.html
   - Apache 2.0        : http://www.apache.org/licenses/LICENSE-2.0
  
   More information about the BLAKE2 hash function can be found at
   https://blake2.net.
*/

#include <stdint.h>
#include <string.h>
#include <stdio.h>

#include "blake2.h"
#include "blake2-impl.h"

#include "blake2-config.h"

#ifdef _MSC_VER
#include <intrin.h> /* for _mm_set_epi64x */
#endif
#include <emmintrin.h>
#if defined(HAVE_SSSE3)
#include <tmmintrin.h>
#endif
#if defined(HAVE_SSE41)
#include <smmintrin.h>
#endif
#if defined(HAVE_AVX)
#include <immintrin.h>
#endif
#if defined(HAVE_XOP)
#include <x86intrin.h>
#endif

#include "blake2b-round.h"

static const uint64_t blake2b_IV[8] =
{
  0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
  0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
  0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
  0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
};

static const uint8_t blake2b_sigma[12][16] =
{
  {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 } ,
  { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 } ,
  { 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 } ,
  {  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 } ,
  {  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 } ,
  {  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 } ,
  { 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 } ,
  { 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 } ,
  {  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 } ,
  { 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13 , 0 } ,
  {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 } ,
  { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 }
};


/* Some helper functions, not necessarily useful */
BLAKE2_LOCAL_INLINE(int) blake2b_set_lastnode( blake2b_state *S )
{
  S->f[1] = -1;
  return 0;
}

BLAKE2_LOCAL_INLINE(int) blake2b_clear_lastnode( blake2b_state *S )
{
  S->f[1] = 0;
  return 0;
}

BLAKE2_LOCAL_INLINE(int) blake2b_is_lastblock( const blake2b_state *S )
{
  return S->f[0] != 0;
}

BLAKE2_LOCAL_INLINE(int) blake2b_set_lastblock( blake2b_state *S )
{
  if( S->last_node ) blake2b_set_lastnode( S );

  S->f[0] = -1;
  return 0;
}

BLAKE2_LOCAL_INLINE(int) blake2b_clear_lastblock( blake2b_state *S )
{
  if( S->last_node ) blake2b_clear_lastnode( S );

  S->f[0] = 0;
  return 0;
}


BLAKE2_LOCAL_INLINE(int) blake2b_increment_counter( blake2b_state *S, const uint64_t inc )
{
#if __x86_64__
  /* ADD/ADC chain */
  __uint128_t t = ( ( __uint128_t )S->t[1] << 64 ) | S->t[0];
  t += inc;
  S->t[0] = ( uint64_t )( t >>  0 );
  S->t[1] = ( uint64_t )( t >> 64 );
#else
  S->t[0] += inc;
  S->t[1] += ( S->t[0] < inc );
#endif
  return 0;
}


/* Parameter-related functions */
BLAKE2_LOCAL_INLINE(int) blake2b_param_set_digest_length( blake2b_param *P, const uint8_t digest_length )
{
  P->digest_length = digest_length;
  return 0;
}

BLAKE2_LOCAL_INLINE(int) blake2b_param_set_fanout( blake2b_param *P, const uint8_t fanout )
{
  P->fanout = fanout;
  return 0;
}

BLAKE2_LOCAL_INLINE(int) blake2b_param_set_max_depth( blake2b_param *P, const uint8_t depth )
{
  P->depth = depth;
  return 0;
}

BLAKE2_LOCAL_INLINE(int) blake2b_param_set_leaf_length( blake2b_param *P, const uint32_t leaf_length )
{
  P->leaf_length = leaf_length;
  return 0;
}

BLAKE2_LOCAL_INLINE(int) blake2b_param_set_node_offset( blake2b_param *P, const uint64_t node_offset )
{
  P->node_offset = node_offset;
  return 0;
}

BLAKE2_LOCAL_INLINE(int) blake2b_param_set_node_depth( blake2b_param *P, const uint8_t node_depth )
{
  P->node_depth = node_depth;
  return 0;
}

BLAKE2_LOCAL_INLINE(int) blake2b_param_set_inner_length( blake2b_param *P, const uint8_t inner_length )
{
  P->inner_length = inner_length;
  return 0;
}

BLAKE2_LOCAL_INLINE(int) blake2b_param_set_salt( blake2b_param *P, const uint8_t salt[BLAKE2B_SALTBYTES] )
{
  memcpy( P->salt, salt, BLAKE2B_SALTBYTES );
  return 0;
}

BLAKE2_LOCAL_INLINE(int) blake2b_param_set_personal( blake2b_param *P, const uint8_t personal[BLAKE2B_PERSONALBYTES] )
{
  memcpy( P->personal, personal, BLAKE2B_PERSONALBYTES );
  return 0;
}

BLAKE2_LOCAL_INLINE(int) blake2b_init0( blake2b_state *S )
{
  memset( S, 0, sizeof( blake2b_state ) );

  for( int i = 0; i < 8; ++i ) S->h[i] = blake2b_IV[i];

  return 0;
}

/* init xors IV with input parameter block */
int blake2b_init_param( blake2b_state *S, const blake2b_param *P )
{
  /*blake2b_init0( S ); */
  const uint8_t * v = ( const uint8_t * )( blake2b_IV );
  const uint8_t * p = ( const uint8_t * )( P );
  uint8_t * h = ( uint8_t * )( S->h );
  /* IV XOR ParamBlock */
  memset( S, 0, sizeof( blake2b_state ) );

  for( int i = 0; i < BLAKE2B_OUTBYTES; ++i ) h[i] = v[i] ^ p[i];

  return 0;
}


/* Some sort of default parameter block initialization, for sequential blake2b */
int blake2b_init( blake2b_state *S, const uint8_t outlen )
{
  const blake2b_param P =
  {
    outlen,
    0,
    1,
    1,
    0,
    0,
    0,
    0,
    {0},
    {0},
    {0}
  };

  if ( ( !outlen ) || ( outlen > BLAKE2B_OUTBYTES ) ) return -1;

  return blake2b_init_param( S, &P );
}

int blake2b_init_key( blake2b_state *S, const uint8_t outlen, const void *key, const uint8_t keylen )
{
  const blake2b_param P =
  {
    outlen,
    keylen,
    1,
    1,
    0,
    0,
    0,
    0,
    {0},
    {0},
    {0}
  };

  if ( ( !outlen ) || ( outlen > BLAKE2B_OUTBYTES ) ) return -1;

  if ( ( !keylen ) || keylen > BLAKE2B_KEYBYTES ) return -1;

  if( blake2b_init_param( S, &P ) < 0 )
    return 0;

  {
    uint8_t block[BLAKE2B_BLOCKBYTES];
    memset( block, 0, BLAKE2B_BLOCKBYTES );
    memcpy( block, key, keylen );
    blake2b_update( S, block, BLAKE2B_BLOCKBYTES );
    secure_zero_memory( block, BLAKE2B_BLOCKBYTES ); /* Burn the key from stack */
  }
  return 0;
}

BLAKE2_LOCAL_INLINE(int) blake2b_compress( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] )
{
  __m128i row1l, row1h;
  __m128i row2l, row2h;
  __m128i row3l, row3h;
  __m128i row4l, row4h;
  __m128i b0, b1;
  __m128i t0, t1;
#if defined(HAVE_SSSE3) && !defined(HAVE_XOP)
  const __m128i r16 = _mm_setr_epi8( 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9 );
  const __m128i r24 = _mm_setr_epi8( 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10 );
#endif
#if defined(HAVE_SSE41)
  const __m128i m0 = LOADU( block + 00 );
  const __m128i m1 = LOADU( block + 16 );
  const __m128i m2 = LOADU( block + 32 );
  const __m128i m3 = LOADU( block + 48 );
  const __m128i m4 = LOADU( block + 64 );
  const __m128i m5 = LOADU( block + 80 );
  const __m128i m6 = LOADU( block + 96 );
  const __m128i m7 = LOADU( block + 112 );
#else
  const uint64_t  m0 = ( ( uint64_t * )block )[ 0];
  const uint64_t  m1 = ( ( uint64_t * )block )[ 1];
  const uint64_t  m2 = ( ( uint64_t * )block )[ 2];
  const uint64_t  m3 = ( ( uint64_t * )block )[ 3];
  const uint64_t  m4 = ( ( uint64_t * )block )[ 4];
  const uint64_t  m5 = ( ( uint64_t * )block )[ 5];
  const uint64_t  m6 = ( ( uint64_t * )block )[ 6];
  const uint64_t  m7 = ( ( uint64_t * )block )[ 7];
  const uint64_t  m8 = ( ( uint64_t * )block )[ 8];
  const uint64_t  m9 = ( ( uint64_t * )block )[ 9];
  const uint64_t m10 = ( ( uint64_t * )block )[10];
  const uint64_t m11 = ( ( uint64_t * )block )[11];
  const uint64_t m12 = ( ( uint64_t * )block )[12];
  const uint64_t m13 = ( ( uint64_t * )block )[13];
  const uint64_t m14 = ( ( uint64_t * )block )[14];
  const uint64_t m15 = ( ( uint64_t * )block )[15];
#endif
  row1l = LOADU( &S->h[0] );
  row1h = LOADU( &S->h[2] );
  row2l = LOADU( &S->h[4] );
  row2h = LOADU( &S->h[6] );
  row3l = LOADU( &blake2b_IV[0] );
  row3h = LOADU( &blake2b_IV[2] );
  row4l = _mm_xor_si128( LOADU( &blake2b_IV[4] ), LOADU( &S->t[0] ) );
  row4h = _mm_xor_si128( LOADU( &blake2b_IV[6] ), LOADU( &S->f[0] ) );
  ROUND( 0 );
  ROUND( 1 );
  ROUND( 2 );
  ROUND( 3 );
  ROUND( 4 );
  ROUND( 5 );
  ROUND( 6 );
  ROUND( 7 );
  ROUND( 8 );
  ROUND( 9 );
  ROUND( 10 );
  ROUND( 11 );
  row1l = _mm_xor_si128( row3l, row1l );
  row1h = _mm_xor_si128( row3h, row1h );
  STOREU( &S->h[0], _mm_xor_si128( LOADU( &S->h[0] ), row1l ) );
  STOREU( &S->h[2], _mm_xor_si128( LOADU( &S->h[2] ), row1h ) );
  row2l = _mm_xor_si128( row4l, row2l );
  row2h = _mm_xor_si128( row4h, row2h );
  STOREU( &S->h[4], _mm_xor_si128( LOADU( &S->h[4] ), row2l ) );
  STOREU( &S->h[6], _mm_xor_si128( LOADU( &S->h[6] ), row2h ) );
  return 0;
}


int blake2b_update( blake2b_state *S, const uint8_t *in, uint64_t inlen )
{
  while( inlen > 0 )
  {
    size_t left = S->buflen;
    size_t fill = 2 * BLAKE2B_BLOCKBYTES - left;

    if( inlen > fill )
    {
      memcpy( S->buf + left, in, fill ); /* Fill buffer */
      S->buflen += fill;
      blake2b_increment_counter( S, BLAKE2B_BLOCKBYTES );
      blake2b_compress( S, S->buf ); /* Compress */
      memcpy( S->buf, S->buf + BLAKE2B_BLOCKBYTES, BLAKE2B_BLOCKBYTES ); /* Shift buffer left */
      S->buflen -= BLAKE2B_BLOCKBYTES;
      in += fill;
      inlen -= fill;
    }
    else /* inlen <= fill */
    {
      memcpy( S->buf + left, in, inlen );
      S->buflen += inlen; /* Be lazy, do not compress */
      in += inlen;
      inlen -= inlen;
    }
  }

  return 0;
}


int blake2b_final( blake2b_state *S, uint8_t *out, uint8_t outlen )
{
  if( outlen > BLAKE2B_OUTBYTES )
    return -1;

  if( blake2b_is_lastblock( S ) )
    return -1;

  if( S->buflen > BLAKE2B_BLOCKBYTES )
  {
    blake2b_increment_counter( S, BLAKE2B_BLOCKBYTES );
    blake2b_compress( S, S->buf );
    S->buflen -= BLAKE2B_BLOCKBYTES;
    memcpy( S->buf, S->buf + BLAKE2B_BLOCKBYTES, S->buflen );
  }

  blake2b_increment_counter( S, S->buflen );
  blake2b_set_lastblock( S );
  memset( S->buf + S->buflen, 0, 2 * BLAKE2B_BLOCKBYTES - S->buflen ); /* Padding */
  blake2b_compress( S, S->buf );
  memcpy( out, &S->h[0], outlen );
  return 0;
}


int blake2b( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen )
{
  blake2b_state S[1];

  /* Verify parameters */
  if ( NULL == in && inlen > 0 ) return -1;

  if ( NULL == out ) return -1;

  if( NULL == key && keylen > 0 ) return -1;

  if( !outlen || outlen > BLAKE2B_OUTBYTES ) return -1;

  if( keylen > BLAKE2B_KEYBYTES ) return -1;

  if( keylen )
  {
    if( blake2b_init_key( S, outlen, key, keylen ) < 0 ) return -1;
  }
  else
  {
    if( blake2b_init( S, outlen ) < 0 ) return -1;
  }

  blake2b_update( S, ( const uint8_t * )in, inlen );
  blake2b_final( S, out, outlen );
  return 0;
}

#if defined(SUPERCOP)
int crypto_hash( unsigned char *out, unsigned char *in, unsigned long long inlen )
{
  return blake2b( out, in, NULL, BLAKE2B_OUTBYTES, inlen, 0 );
}
#endif

#if defined(BLAKE2B_SELFTEST)
#include <string.h>
#include "blake2-kat.h"
int main( int argc, char **argv )
{
  uint8_t key[BLAKE2B_KEYBYTES];
  uint8_t buf[KAT_LENGTH];

  for( size_t i = 0; i < BLAKE2B_KEYBYTES; ++i )
    key[i] = ( uint8_t )i;

  for( size_t i = 0; i < KAT_LENGTH; ++i )
    buf[i] = ( uint8_t )i;

  for( size_t i = 0; i < KAT_LENGTH; ++i )
  {
    uint8_t hash[BLAKE2B_OUTBYTES];
    blake2b( hash, buf, key, BLAKE2B_OUTBYTES, i, BLAKE2B_KEYBYTES );

    if( 0 != memcmp( hash, blake2b_keyed_kat[i], BLAKE2B_OUTBYTES ) )
    {
      puts( "error" );
      return -1;
    }
  }

  puts( "ok" );
  return 0;
}
#endif


================================================
FILE: benchmarks/byte_order.c
================================================
/* byte_order.c - byte order related platform dependent routines,
 *
 * Copyright: 2008-2012 Aleksey Kravchenko <rhash.admin@gmail.com>
 *
 * Permission is hereby granted,  free of charge,  to any person  obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction,  including without limitation
 * the rights to  use, copy, modify,  merge, publish, distribute, sublicense,
 * and/or sell copies  of  the Software,  and to permit  persons  to whom the
 * Software is furnished to do so.
 *
 * This program  is  distributed  in  the  hope  that it will be useful,  but
 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE.  Use this program  at  your own risk!
 */
#include "byte_order.h"

#if !(__GNUC__ >= 4 || (__GNUC__ ==3 && __GNUC_MINOR__ >= 4)) /* if !GCC or GCC < 4.3 */

#  if _MSC_VER >= 1300 && (_M_IX86 || _M_AMD64 || _M_IA64) /* if MSVC++ >= 2002 on x86/x64 */
#  include <intrin.h>
#  pragma intrinsic(_BitScanForward)

/**
 * Returns index of the trailing bit of x.
 *
 * @param x the number to process
 * @return zero-based index of the trailing bit
 */
unsigned rhash_ctz(unsigned x)
{
	unsigned long index;
	unsigned char isNonzero = _BitScanForward(&index, x); /* MSVC intrinsic */
	return (isNonzero ? (unsigned)index : 0);
}
#  else /* _MSC_VER >= 1300... */

/**
 * Returns index of the trailing bit of a 32-bit number.
 * This is a plain C equivalent for GCC __builtin_ctz() bit scan.
 *
 * @param x the number to process
 * @return zero-based index of the trailing bit
 */
unsigned rhash_ctz(unsigned x)
{
	/* array for conversion to bit position */
	static unsigned char bit_pos[32] =  {
		0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
		31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
	};

	/* The De Bruijn bit-scan was devised in 1997, according to Donald Knuth
	 * by Martin Lauter. The constant 0x077CB531UL is a De Bruijn sequence,
	 * which produces a unique pattern of bits into the high 5 bits for each
	 * possible bit position that it is multiplied against.
	 * See http://graphics.stanford.edu/~seander/bithacks.html
	 * and http://chessprogramming.wikispaces.com/BitScan */
	return (unsigned)bit_pos[((uint32_t)((x & -x) * 0x077CB531U)) >> 27];
}
#  endif /* _MSC_VER >= 1300... */
#endif /* !(GCC >= 4.3) */

/**
 * Copy a memory block with simultaneous exchanging byte order.
 * The byte order is changed from little-endian 32-bit integers
 * to big-endian (or vice-versa).
 *
 * @param to the pointer where to copy memory block
 * @param index the index to start writing from
 * @param from  the source block to copy
 * @param length length of the memory block
 */
void rhash_swap_copy_str_to_u32(void* to, int index, const void* from, size_t length)
{
	/* if all pointers and length are 32-bits aligned */
	if ( 0 == (( (int)((char*)to - (char*)0) | ((char*)from - (char*)0) | index | length ) & 3) ) {
		/* copy memory as 32-bit words */
		const uint32_t* src = (const uint32_t*)from;
		const uint32_t* end = (const uint32_t*)((const char*)src + length);
		uint32_t* dst = (uint32_t*)((char*)to + index);
		while (src < end) *(dst++) = bswap_32( *(src++) );
	} else {
		const char* src = (const char*)from;
		for (length += index; (size_t)index < length; index++) ((char*)to)[index ^ 3] = *(src++);
	}
}

/**
 * Copy a memory block with changed byte order.
 * The byte order is changed from little-endian 64-bit integers
 * to big-endian (or vice-versa).
 *
 * @param to     the pointer where to copy memory block
 * @param index  the index to start writing from
 * @param from   the source block to copy
 * @param length length of the memory block
 */
void rhash_swap_copy_str_to_u64(void* to, int index, const void* from, size_t length)
{
	/* if all pointers and length are 64-bits aligned */
	if ( 0 == (( (int)((char*)to - (char*)0) | ((char*)from - (char*)0) | index | length ) & 7) ) {
		/* copy aligned memory block as 64-bit integers */
		const uint64_t* src = (const uint64_t*)from;
		const uint64_t* end = (const uint64_t*)((const char*)src + length);
		uint64_t* dst = (uint64_t*)((char*)to + index);
		while (src < end) *(dst++) = bswap_64( *(src++) );
	} else {
		const char* src = (const char*)from;
		for (length += index; (size_t)index < length; index++) ((char*)to)[index ^ 7] = *(src++);
	}
}

/**
 * Copy data from a sequence of 64-bit words to a binary string of given length,
 * while changing byte order.
 *
 * @param to     the binary string to receive data
 * @param from   the source sequence of 64-bit words
 * @param length the size in bytes of the data being copied
 */
void rhash_swap_copy_u64_to_str(void* to, const void* from, size_t length)
{
	/* if all pointers and length are 64-bits aligned */
	if ( 0 == (( (int)((char*)to - (char*)0) | ((char*)from - (char*)0) | length ) & 7) ) {
		/* copy aligned memory block as 64-bit integers */
		const uint64_t* src = (const uint64_t*)from;
		const uint64_t* end = (const uint64_t*)((const char*)src + length);
		uint64_t* dst = (uint64_t*)to;
		while (src < end) *(dst++) = bswap_64( *(src++) );
	} else {
		size_t index;
		char* dst = (char*)to;
		for (index = 0; index < length; index++) *(dst++) = ((char*)from)[index ^ 7];
	}
}

/**
 * Exchange byte order in the given array of 32-bit integers.
 *
 * @param arr    the array to process
 * @param length array length
 */
void rhash_u32_mem_swap(unsigned *arr, int length)
{
	unsigned* end = arr + length;
	for (; arr < end; arr++) {
		*arr = bswap_32(*arr);
	}
}


================================================
FILE: benchmarks/byte_order.h
================================================
/* byte_order.h */
#ifndef BYTE_ORDER_H
#define BYTE_ORDER_H
#include "ustd.h"
#include <stdlib.h>

#ifdef IN_RHASH
#include "config.h"
#endif

#ifdef __GLIBC__
# include <endian.h>
#endif

#ifdef __cplusplus
extern "C" {
#endif

/* if x86 compatible cpu */
#if defined(i386) || defined(__i386__) || defined(__i486__) || \
	defined(__i586__) || defined(__i686__) || defined(__pentium__) || \
	defined(__pentiumpro__) || defined(__pentium4__) || \
	defined(__nocona__) || defined(prescott) || defined(__core2__) || \
	defined(__k6__) || defined(__k8__) || defined(__athlon__) || \
	defined(__amd64) || defined(__amd64__) || \
	defined(__x86_64) || defined(__x86_64__) || defined(_M_IX86) || \
	defined(_M_AMD64) || defined(_M_IA64) || defined(_M_X64)
/* detect if x86-64 instruction set is supported */
# if defined(_LP64) || defined(__LP64__) || defined(__x86_64) || \
	defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64)
#  define CPU_X64
# else
#  define CPU_IA32
# endif
#endif


/* detect CPU endianness */
#if (defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && \
		__BYTE_ORDER == __LITTLE_ENDIAN) || \
	defined(CPU_IA32) || defined(CPU_X64) || \
	defined(__ia64) || defined(__ia64__) || defined(__alpha__) || defined(_M_ALPHA) || \
	defined(vax) || defined(MIPSEL) || defined(_ARM_) || defined(__arm__)
# define CPU_LITTLE_ENDIAN
# define IS_BIG_ENDIAN 0
# define IS_LITTLE_ENDIAN 1
#elif (defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && \
		__BYTE_ORDER == __BIG_ENDIAN) || \
	defined(__sparc) || defined(__sparc__) || defined(sparc) || \
	defined(_ARCH_PPC) || defined(_ARCH_PPC64) || defined(_POWER) || \
	defined(__POWERPC__) || defined(POWERPC) || defined(__powerpc) || \
	defined(__powerpc__) || defined(__powerpc64__) || defined(__ppc__) || \
	defined(__hpux)  || defined(_MIPSEB) || defined(mc68000) || \
	defined(__s390__) || defined(__s390x__) || defined(sel)
# define CPU_BIG_ENDIAN
# define IS_BIG_ENDIAN 1
# define IS_LITTLE_ENDIAN 0
#else
# error "Can't detect CPU architechture"
#endif

#define IS_ALIGNED_32(p) (0 == (3 & ((const char*)(p) - (const char*)0)))
#define IS_ALIGNED_64(p) (0 == (7 & ((const char*)(p) - (const char*)0)))

#if defined(_MSC_VER)
#define ALIGN_ATTR(n) __declspec(align(n))
#elif defined(__GNUC__)
#define ALIGN_ATTR(n) __attribute__((aligned (n)))
#else
#define ALIGN_ATTR(n) /* nothing */
#endif


#if defined(_MSC_VER) || defined(__BORLANDC__)
#define I64(x) x##ui64
#else
#define I64(x) x##LL
#endif

/* convert a hash flag to index */
#if __GNUC__ >= 4 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) /* GCC < 3.4 */
# define rhash_ctz(x) __builtin_ctz(x)
#else
unsigned rhash_ctz(unsigned); /* define as function */
#endif

void rhash_swap_copy_str_to_u32(void* to, int index, const void* from, size_t length);
void rhash_swap_copy_str_to_u64(void* to, int index, const void* from, size_t length);
void rhash_swap_copy_u64_to_str(void* to, const void* from, size_t length);
void rhash_u32_mem_swap(unsigned *p, int length_in_u32);

/* define bswap_32 */
#if defined(__GNUC__) && defined(CPU_IA32) && !defined(__i386__)
/* for intel x86 CPU */
static inline uint32_t bswap_32(uint32_t x) {
	__asm("bswap\t%0" : "=r" (x) : "0" (x));
	return x;
}
#elif defined(__GNUC__)  && (__GNUC__ >= 4) && (__GNUC__ > 4 || __GNUC_MINOR__ >= 3)
/* for GCC >= 4.3 */
# define bswap_32(x) __builtin_bswap32(x)
#elif (_MSC_VER > 1300) && (defined(CPU_IA32) || defined(CPU_X64)) /* MS VC */
# define bswap_32(x) _byteswap_ulong((unsigned long)x)
#elif !defined(__STRICT_ANSI__)
/* general bswap_32 definition */
static inline uint32_t bswap_32(uint32_t x) {
	x = ((x << 8) & 0xFF00FF00) | ((x >> 8) & 0x00FF00FF);
	return (x >> 16) | (x << 16);
}
#else
#define bswap_32(x) ((((x) & 0xff000000) >> 24) | (((x) & 0x00ff0000) >>  8) | \
	(((x) & 0x0000ff00) <<  8) | (((x) & 0x000000ff) << 24))
#endif /* bswap_32 */

#if defined(__GNUC__) && (__GNUC__ >= 4) && (__GNUC__ > 4 || __GNUC_MINOR__ >= 3)
# define bswap_64(x) __builtin_bswap64(x)
#elif (_MSC_VER > 1300) && (defined(CPU_IA32) || defined(CPU_X64)) /* MS VC */
# define bswap_64(x) _byteswap_uint64((__int64)x)
#elif !defined(__STRICT_ANSI__)
static inline uint64_t bswap_64(uint64_t x) {
	union {
		uint64_t ll;
		uint32_t l[2];
	} w, r;
	w.ll = x;
	r.l[0] = bswap_32(w.l[1]);
	r.l[1] = bswap_32(w.l[0]);
	return r.ll;
}
#else
#error "bswap_64 unsupported"
#endif

#ifdef CPU_BIG_ENDIAN
# define be2me_32(x) (x)
# define be2me_64(x) (x)
# define le2me_32(x) bswap_32(x)
# define le2me_64(x) bswap_64(x)

# define be32_copy(to, index, from, length) memcpy((to) + (index), (from), (length))
# define le32_copy(to, index, from, length) rhash_swap_copy_str_to_u32((to), (index), (from), (length))
# define be64_copy(to, index, from, length) memcpy((to) + (index), (from), (length))
# define le64_copy(to, index, from, length) rhash_swap_copy_str_to_u64((to), (index), (from), (length))
# define me64_to_be_str(to, from, length) memcpy((to), (from), (length))
# define me64_to_le_str(to, from, length) rhash_swap_copy_u64_to_str((to), (from), (length))

#else /* CPU_BIG_ENDIAN */
# define be2me_32(x) bswap_32(x)
# define be2me_64(x) bswap_64(x)
# define le2me_32(x) (x)
# define le2me_64(x) (x)

# define be32_copy(to, index, from, length) rhash_swap_copy_str_to_u32((to), (index), (from), (length))
# define le32_copy(to, index, from, length) memcpy((to) + (index), (from), (length))
# define be64_copy(to, index, from, length) rhash_swap_copy_str_to_u64((to), (index), (from), (length))
# define le64_copy(to, index, from, length) memcpy((to) + (index), (from), (length))
# define me64_to_be_str(to, from, length) rhash_swap_copy_u64_to_str((to), (from), (length))
# define me64_to_le_str(to, from, length) memcpy((to), (from), (length))
#endif /* CPU_BIG_ENDIAN */

/* ROTL/ROTR macros rotate a 32/64-bit word left/right by n bits */
#define ROTL32(dword, n) ((dword) << (n) ^ ((dword) >> (32 - (n))))
#define ROTR32(dword, n) ((dword) >> (n) ^ ((dword) << (32 - (n))))
#define ROTL64(qword, n) ((qword) << (n) ^ ((qword) >> (64 - (n))))
#define ROTR64(qword, n) ((qword) >> (n) ^ ((qword) << (64 - (n))))

#ifdef __cplusplus
} /* extern "C" */
#endif /* __cplusplus */

#endif /* BYTE_ORDER_H */


================================================
FILE: benchmarks/chacha-prng.h
================================================
/* Copyright (c) 2016 Vladimir Makarov <vmakarov@gcc.gnu.org>

   Permission is hereby granted, free of charge, to any person
   obtaining a copy of this software and associated documentation
   files (the "Software"), to deal in the Software without
   restriction, including without limitation the rights to use, copy,
   modify, merge, publish, distribute, sublicense, and/or sell copies
   of the Software, and to permit persons to whom the Software is
   furnished to do so, subject to the following conditions:

   The above copyright notice and this permission notice shall be
   included in all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
   SOFTWARE.
*/

/* Pseudo Random Number Generator (PRNG) based on ChaCha stream cipher
   designed by D.J. Bernstein.  The code is ChaCha reference code
   (please see https://cr.yp.to/chacha.html) adapted for our purposes.

   It is a crypto level PRNG as the stream cypher on which it is
   based.

   To use a generator call `init_chacha_prng` first, then call
   `get_chacha_prn` as much as you want to get a new PRN.  At the end
   of the PRNG use, call `finish_chacha_prng`.  You can change the
   default seed by calling `set_chacha_prng_seed`.

   The PRNG passes NIST Statistical Test Suite for Random and
   Pseudorandom Number Generators for Cryptographic Applications
   (version 2.2.1) with 1000 bitstreams each containing 1M bits.

   The generation of a new number takes about 40 CPU cycles on x86_64
   (Intel 4.2GHz i7-4790K), or speed of the generation is about 106M
   numbers per sec.  */

#ifndef __CHACHA_PRNG__
#define __CHACHA_PRNG__

#ifdef _MSC_VER
typedef unsigned __int32 uint32_t;
typedef unsigned __int64 uint64_t;
#else
#include <stdint.h>
#endif

#include <stdlib.h>

static inline uint32_t
_chacha_prng_rotl (uint32_t v, int c) {
  return (v << c) | (v >> (32 - c));
}

/* ChaCha state transformation step.  */
static inline void
_chacha_prng_quarter_round (uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d) {
  *a += *b; *d = _chacha_prng_rotl (*d ^ *a, 16);
  *c += *d; *b = _chacha_prng_rotl (*b ^ *c, 12);
  *a += *b; *d = _chacha_prng_rotl (*d ^ *a, 8);
  *c += *d; *b = _chacha_prng_rotl (*b ^ *c, 7);
}

/* Major ChaCha state transformation.  */
static inline void
_chacha_prng_salsa20 (uint32_t output[16], const uint32_t input[16]) {
  int i;

  for (i = 0; i < 16; i++)
    output[i] = input[i];
  for (i = 8; i > 0; i -= 2) {
    _chacha_prng_quarter_round (&output[0], &output[4], &output[8],&output[12]);
    _chacha_prng_quarter_round (&output[1], &output[5], &output[9],&output[13]);
    _chacha_prng_quarter_round (&output[2], &output[6],&output[10],&output[14]);
    _chacha_prng_quarter_round (&output[3], &output[7],&output[11],&output[15]);
    _chacha_prng_quarter_round (&output[0], &output[5],&output[10],&output[15]);
    _chacha_prng_quarter_round (&output[1], &output[6],&output[11],&output[12]);
    _chacha_prng_quarter_round (&output[2], &output[7], &output[8],&output[13]);
    _chacha_prng_quarter_round (&output[3], &output[4], &output[9],&output[14]);
  }
  for (i = 0; i < 16; i++)
    output[i] += input[i];
}

/* Internal state of the PRNG.  */
static struct {
  int ind; /* position in the output */
  /* The current PRNG state and parts of the recently generated
     numbers.  */
  uint32_t input[16], output[16];
} _chacha_prng_state;

/* Some random prime numbers.  */
static const uint32_t chacha_prng_primes[4] = {0xfa835867, 0x2086ca69, 0x1467c0fb, 0x638e2b99};

/* Internal function to set ChaCha PRNG seed by K and IV.  */
static inline void
_set_chacha_prng_key_iv (uint32_t k[8], uint32_t iv[2]) {
  int i;
  
  for (i = 0; i < 8; i++)
    _chacha_prng_state.input[i + 4] = k[i];
  for (i = 0; i < 4; i++)
    _chacha_prng_state.input[i] = chacha_prng_primes[i];
  _chacha_prng_state.input[12] = 0;
  _chacha_prng_state.input[13] = 0;
  _chacha_prng_state.input[14] = iv[0];
  _chacha_prng_state.input[15] = iv[1];
}

/* Internal function to initiate ChaCha PRNG with seed given by K and
   IV.  */
static inline void
_init_chacha_prng_with_key_iv (uint32_t k[8], uint32_t iv[2]) {
  _set_chacha_prng_key_iv (k, iv);
  _chacha_prng_state.ind = 16;
}

/* Initiate the PRNG with some random seed.  */
static inline void
init_chacha_prng (void) {
  int i;
  uint32_t k[8], iv[2];

  for (i = 0; i < 8; i++)
    k[i] = rand ();
  iv[0] = rand ();
  iv[1] = rand ();
  _init_chacha_prng_with_key_iv (k, iv);
}

/* Set ChaCha PRNG SEED.  */
static inline void
set_chacha_prng_seed (uint32_t seed) {
  static uint32_t k[8] = {0, 0, 0, 0};
  static uint32_t iv[2] = {0, 0};
  
  k[0] = seed;
  _set_chacha_prng_key_iv (k, iv);
}

/* Return the next pseudo-random number.  */
static inline uint64_t
get_chacha_prn (void)
{
  uint64_t res;

  for (;;) {
    if (_chacha_prng_state.ind < 16) {
      res = ((uint64_t) _chacha_prng_state.output[_chacha_prng_state.ind] << 32
	     | _chacha_prng_state.output[_chacha_prng_state.ind + 1]);
      _chacha_prng_state.ind += 2;
      return res;
    }
    _chacha_prng_state.ind = 0;
    _chacha_prng_salsa20 (_chacha_prng_state.output, _chacha_prng_state.input);
    _chacha_prng_state.input[12]++;
    if (! _chacha_prng_state.input[12])
      /* If it is becomming zero we produced too many numbers by
	 current PRNG.  */
      _chacha_prng_state.input[13]++;
  }
}

/* Empty function for our PRNGs interface.  */
static inline void
finish_chacha_prng (void) {
}

#endif


================================================
FILE: benchmarks/gen-table.rb
================================================
#!/usr/bin/ruby
# Take stdin and output the table
rows = []
cols = []
tab = {}
cur = ""
n = 0
STDIN.each_line do |line|
  puts line
  if md = /[+]+([0-9a-zA-Z-]+)/.match(line)
     rows.push(cur=line[md.begin(1)...md.end(1)])
     n += 1
  elsif md = /([0-9a-zA-Z-]+)\s*:\s*(\d+.\d+)s/.match(line)
     name=line[md.begin(1)...md.end(1)]
     cols.push(name) if n == 1
     tab[cur + name] = line[md.begin(2)...md.end(2)]
  end
end


def print_header(cols, mr, mc)
  print "|".ljust(mr, " ")
  cols.each do |e|
    print " | ", e.ljust(mc, " ")
  end
  print " |\n", ":".ljust(mr + 1, "-")
  cols.each do |e|
    print "|", ":".rjust(mc + 2, "-")
  end
  print "|\n"
end

mr = rows.map { |e| e.length}.max
mc = cols.map { |e| e.length}.max
mc = 11 if mc < 11

print_header(cols, mr, mc)

rows.each { |r|
  print "|", r.ljust(mr, " "), "| "
  min = 100000.0;
  cols.each { |c|
    min = tab[r + c].to_f if tab.has_key?(r + c) && tab[r + c].to_f < min
  }
  cols.each { |c|
    if ! tab.has_key?(r + c)
      print "-".ljust(mc - 4, " "), " | "
      continue
    end
    v = tab[r + c]
    print v.to_f == min ? "**" : "  "
    print v
    print v.to_f == min ? "**" : "  "
    print " ".ljust(mc - 4 - v.length, " ")
    print " | "
  }
  print "\n"
}


================================================
FILE: benchmarks/meow_hash.h
================================================
/* ========================================================================

   Meow - A Fast Non-cryptographic Hash
   (C) Copyright 2018 by Molly Rocket, Inc. (https://mollyrocket.com)
   
   See https://mollyrocket.com/meowhash for details.
   
   ========================================================================
   
   zlib License
   
   (C) Copyright 2018 Molly Rocket, Inc.
   
   This software is provided 'as-is', without any express or implied
   warranty.  In no event will the authors be held liable for any damages
   arising from the use of this software.
   
   Permission is granted to anyone to use this software for any purpose,
   including commercial applications, and to alter it and redistribute it
   freely, subject to the following restrictions:
   
   1. The origin of this software must not be misrepresented; you must not
      claim that you wrote the original software. If you use this software
      in a product, an acknowledgment in the product documentation would be
      appreciated but is not required.
   2. Altered source versions must be plainly marked as such, and must not be
      misrepresented as being the original software.
   3. This notice may not be removed or altered from any source distribution.
   
   ========================================================================
   
   FAQ
   
   Q: What is it?
   
   A: Meow is a 128-bit non-cryptographic hash that operates at high speeds
      on x64 and ARM processors that provide AES instructions.  It is
      designed to be truncatable to 64 and 32-bit hash values and still
      retain good collision resistance.
      
   Q: What is it GOOD for?
   
   A: Quickly hashing any amount of data for comparison purposes such as
      block deduplication or change detection.  It is extremely fast on
      all buffer sizes, from one byte to one gigabyte and up.
      
   Q: What is it BAD for?
   
   A: Anything security-related.  It should be assumed that it provides
      no protection from adversaries whatsoever.  It is also not particularly
      fast on processors that don't support AES instructions (eg., non-x64/ARM
      processors).
      
   Q: Why is it called the "Meow hash"?
   
   A: It is named after a character in Meow the Infinite
      (https://meowtheinfinite.com)
      
   Q: Who wrote it?
   
   A: CASEY MURATORI (https://caseymuratori.com) wrote the original
      implementation for use in processing large-footprint assets for
      the game 1935 (https://molly1935.com).
      
      After the initial version, the hash was refined via collaboration
      with several great programmers who contributed suggestions and
      modifications:
      
      JEFF ROBERTS (https://radgametools.com) provided a super slick
      way to handle the residual end-of-buffer bytes that dramatically
      improved Meow's small hash performance.
      
      MARTINS MOZEIKO (https://matrins.ninja) ported Meow to ARM and
      ANSI-C, and added the proper preprocessor dressing for clean
      compilation on a variety of compiler configurations.
      
      FABIAN GIESEN (https://fgiesen.wordpress.com) provided support
      for getting the benchmarking working properly across a number
      of platforms.
      
      ARAS PRANCKEVICIUS (https://aras-p.info) provided the allocation
      shim for compilation on Mac OS X.
      
   ========================================================================
   
   USAGE
   
   For a complete working example, see meow_example.cpp.  Briefly:
   
       // Include meow_intrinsics if you want it to detect platforms
       // and define types and intrinsics for you.  Omit it if you
       // want to define them yourself.
       #include "meow_intrinsics.h"
       
       // Include meow_hash for the Meow hash function
       #include "meow_hash.h"
       
       // Hash a block of data using CPU-specific acceleration
       meow_u128 MeowHash_Accelerated(u64 Seed, u64 Len, void *Source);
       
       // Check if two Meow hashes are the same
       // (returns zero if they aren't, non-zero if they are)
       int MeowHashesAreEqual(meow_u128 A, meow_u128 B)
       
       // Truncate a Meow hash to 64 bits
       meow_u64 MeowU64From(meow_u128 Hash);
       
       // Truncate a Meow hash to 32 bits
       meow_u32 MeowU32From(meow_u128 Hash);
       
   **** VERY IMPORTANT X64 COMPILATION NOTES ****
   
   On x64, Meow uses the AESDEC instruction, which comes in two flavors:
   SSE (aesdec) and AVX (vaesdec).  If you are compiling _with_ AVX support,
   your compiler will probably emit the AVX variant, which means your code
   WILL NOT RUN on computers that do not have AVX.  If you need to deploy
   this hash on computers that do not have AVX, you must take care to
   TURN OFF support for AVX in your compiler for the file that includes
   the Meow hash!
   
   ======================================================================== */

//
// NOTE(casey): This version is EXPERIMENTAL.  The Meow hash is still
// undergoing testing and finalization.
//
// **** EXPECT HASHES/APIs TO CHANGE UNTIL THE VERSION NUMBER HITS 1.0. ****
//
// You have been warned.
//

static const unsigned char MeowShiftAdjust[31] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128};
static const unsigned char MeowMaskLen[32] = {255,255,255,255, 255,255,255,255, 255,255,255,255, 255,255,255,255, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0};

// TODO(casey): These constants are loaded to initialize the lanes.  Jacob should
// give us some feedback on what they should _actually_ be set to.
#define MEOW_S0_INIT { 0, 1, 2, 3,  4, 5, 6, 7,  8, 9,10,11, 12,13,14,15}
#define MEOW_S1_INIT {16,17,18,19, 20,21,22,23, 24,25,26,27, 28,29,30,31}
#define MEOW_S2_INIT {32,33,34,35, 36,37,38,39, 40,41,42,43, 44,45,46,47}
#define MEOW_S3_INIT {48,49,50,51, 52,53,54,55, 56,57,58,59, 60,61,62,63}
static const unsigned char MeowS0Init[] = MEOW_S0_INIT;
static const unsigned char MeowS1Init[] = MEOW_S1_INIT;
static const unsigned char MeowS2Init[] = MEOW_S2_INIT;
static const unsigned char MeowS3Init[] = MEOW_S3_INIT;

//
// NOTE(casey): 128-wide AES-NI Meow (maximum of 16 bytes/clock single threaded)
//

static meow_hash
MeowHash_Accelerated(meow_u64 Seed, meow_u64 TotalLengthInBytes, void *SourceInit)
{
    //
    // NOTE(casey): Initialize the four AES streams and the mixer
    //
    
    meow_aes_128 S0 = Meow128_GetAESConstant(MeowS0Init);
    meow_aes_128 S1 = Meow128_GetAESConstant(MeowS1Init);
    meow_aes_128 S2 = Meow128_GetAESConstant(MeowS2Init);
    meow_aes_128 S3 = Meow128_GetAESConstant(MeowS3Init);
    
    meow_u128 Mixer = Meow128_Set64x2(Seed - TotalLengthInBytes,
                                      Seed + TotalLengthInBytes + 1);
    
    //
    // NOTE(casey): Handle as many full 256-byte blocks as possible
    //
    
    meow_u8 *Source = (meow_u8 *)SourceInit;
    meow_u64 Len = TotalLengthInBytes;
    int unsigned Len8 = Len & 15;
    int unsigned Len128 = Len & 48;
    
    while(Len >= 64)
    {
        S0 = Meow128_AESDEC_Mem(S0, Source);
        S1 = Meow128_AESDEC_Mem(S1, Source + 16);
        S2 = Meow128_AESDEC_Mem(S2, Source + 32);
        S3 = Meow128_AESDEC_Mem(S3, Source + 48);
        
        Len -= 64;
        Source += 64;
    }
    
    //
    // NOTE(casey): Overhanging individual bytes
    //
    
    if(Len8)
    {
        meow_u8 *Overhang = Source + Len128;
        int Align = ((int)(meow_umm)Overhang) & 15;
        if(Align)
        {
            int End = ((int)(meow_umm)Overhang) & (MEOW_PAGESIZE - 1);
        
            // NOTE(jeffr): If we are nowhere near the page end, use full unaligned load (cmov to set)
            if (End <= (MEOW_PAGESIZE - 16))
            {
                Align = 0;
            }
            
            // NOTE(jeffr): If we will read over the page end, use a full unaligned load (cmov to set)
            if ((End + Len8) > MEOW_PAGESIZE)
            {
                Align = 0;
            }
            
            meow_u128 Partial = Meow128_Shuffle_Mem(Overhang - Align, &MeowShiftAdjust[Align]);
            
            Partial = Meow128_And_Mem( Partial, &MeowMaskLen[16 - Len8] );
            S3 = Meow128_AESDEC(S3, Partial);
        }
        else
        {
            // NOTE(casey): We don't have to do Jeff's heroics when we know the
            // buffer is aligned, since we cannot span a memory page (by definition).
            meow_u128 Partial = Meow128_And_Mem(*(meow_u128 *)Overhang, &MeowMaskLen[16 - Len8]);
            S3 = Meow128_AESDEC(S3, Partial);
        }
    }
    
    //
    // NOTE(casey): Overhanging full 128-bit lanes
    //
    
    switch(Len128)
    {
        case 48: S2 = Meow128_AESDEC_Mem(S2, Source + 32);
        case 32: S1 = Meow128_AESDEC_Mem(S1, Source + 16);
        case 16: S0 = Meow128_AESDEC_Mem(S0, Source);
    }
    
    //
    // NOTE(casey): Mix the four lanes down to one 128-bit hash
    //
    
    S3 = Meow128_AESDEC(S3, Mixer);
    S2 = Meow128_AESDEC(S2, Mixer);
    S1 = Meow128_AESDEC(S1, Mixer);
    S0 = Meow128_AESDEC(S0, Mixer);
    
    S2 = Meow128_AESDEC(S2, Meow128_AESDEC_Finalize(S3));
    S0 = Meow128_AESDEC(S0, Meow128_AESDEC_Finalize(S1));
    
    S2 = Meow128_AESDEC(S2, Mixer);
    
    S0 = Meow128_AESDEC(S0, Meow128_AESDEC_Finalize(S2));
    S0 = Meow128_AESDEC(S0, Mixer);
    
    meow_hash Result;
    Meow128_CopyToHash(Meow128_AESDEC_Finalize(S0), Result);
    
    return(Result);
}


================================================
FILE: benchmarks/meow_intrinsics.h
================================================
/* ========================================================================

   meow_intrinsics.h
   (C) Copyright 2018 by Molly Rocket, Inc. (https://mollyrocket.com)
   
   See https://mollyrocket.com/meowhash for details.
   
   This is the default way to define all of the types and operations that
   meow_hash.h needs.  However, if you've got your _own_ equivalent type
   definitions and intrinsics, you can _omit_ this header file and just
   #define/typedef all the Meow ops to map to your own ops, keeping things
   nice and uniform in your codebase.
   
   ======================================================================== */

#if !defined(MEOW_HASH_INTRINSICS_H)

//
// NOTE(casey): Try to guess the source file for compiler intrinsics
//
#if _MSC_VER

#if _M_AMD64 || _M_IX86
#include <intrin.h>
#elif _M_ARM64
#include <arm64_neon.h>
#endif

#else

#if __x86_64__ || __i386__
#include <x86intrin.h>
#elif __aarch64__
#include <arm_neon.h>
#endif

#endif

//
// NOTE(casey): Set #define's to their defaults
//

#if !defined(MEOW_HASH_INTEL) || !defined(MEOW_HASH_ARMV8)
#if __x86_64__ || _M_AMD64
#define MEOW_HASH_INTEL 1
#define MEOW_64BIT 1
#define MEOW_PAGESIZE 4096
#elif __i386__  || _M_IX86
#define MEOW_HASH_INTEL 1
#define MEOW_64BIT 0
#define MEOW_PAGESIZE 4096
#elif __aarch64__ || _M_ARM64
#define MEOW_HASH_ARMV8 1
#define MEOW_64BIT 1
#define MEOW_PAGESIZE 4096
#else
#error Cannot determine architecture to use!
#endif
#endif

//
// NOTE(casey): Define basic types
//

#define meow_u8 char unsigned
#define meow_u16 short unsigned
#define meow_u32 int unsigned
#define meow_u64 long long unsigned

#if MEOW_64BIT
#define meow_umm long long unsigned
#else
#define meow_umm int unsigned
#endif

//
// NOTE(casey): Operations for x64 processors
//

#if MEOW_HASH_INTEL

#define meow_u128 __m128i
#define meow_aes_128 __m128i
#define meow_u256 __m256i
#define meow_aes_256 __m256i
#define meow_u512 __m512i
#define meow_aes_512 __m512i

#define MeowU32From(A, I) (_mm_extract_epi32((A), (I)))
#define MeowU64From(A, I) (_mm_extract_epi64((A), (I)))
#define MeowHashesAreEqual(A, B) (_mm_movemask_epi8(_mm_cmpeq_epi8((A), (B))) == 0xFFFF)

#define Meow128_AESDEC(Prior, Xor) _mm_aesdec_si128((Prior), (Xor))
#define Meow128_AESDEC_Mem(Prior, Xor) _mm_aesdec_si128((Prior), _mm_loadu_si128((meow_u128 *)(Xor)))
#define Meow128_AESDEC_Finalize(A) (A)
#define Meow128_Set64x2(Low64, High64) _mm_set_epi64x((High64), (Low64))
#define Meow128_Set64x2_State(Low64, High64) Meow128_Set64x2(Low64, High64)
#define Meow128_GetAESConstant(Ptr) (*(meow_u128 *)(Ptr))

#define Meow128_And_Mem(A,B) _mm_and_si128((A),_mm_loadu_si128((meow_u128 *)(B)))
#define Meow128_Shuffle_Mem(Mem,Control) _mm_shuffle_epi8(_mm_loadu_si128((meow_u128 *)(Mem)),_mm_loadu_si128((meow_u128 *)(Control)))

// TODO(casey): Not sure if this should actually be Meow128_Zero(A) ((A) = _mm_setzero_si128()), maybe
#define Meow128_Zero() _mm_setzero_si128()

#define Meow256_AESDEC(Prior, XOr) _mm256_aesdec_epi128((Prior), (XOr))
#define Meow256_AESDEC_Mem(Prior, XOr) _mm256_aesdec_epi128((Prior), *(meow_u256 *)(XOr))
#define Meow256_Zero() _mm256_setzero_si256()
#define Meow256_PartialLoad(A, B) _mm256_mask_loadu_epi8(_mm256_setzero_si256(), _cvtu32_mask32((1UL<<(B)) - 1), (A))
#define Meow128_FromLow(A) _mm256_extracti128_si256((A), 0)
#define Meow128_FromHigh(A) _mm256_extracti128_si256((A), 1)

#define Meow512_AESDEC(Prior, XOr) _mm512_aesdec_epi128((Prior), (XOr))
#define Meow512_AESDEC_Mem(Prior, XOr) _mm512_aesdec_epi128((Prior), *(meow_u512 *)(XOr))
#define Meow512_Zero() _mm512_setzero_si512()
#define Meow512_PartialLoad(A, B) _mm512_mask_loadu_epi8(_mm512_setzero_si512(), _cvtu64_mask64((1ULL<<(B)) - 1), (A))
#define Meow256_FromLow(A) _mm512_extracti64x4_epi64((A), 0)
#define Meow256_FromHigh(A) _mm512_extracti64x4_epi64((A), 1)

//
// NOTE(casey): Operations for ARM processors
//

#elif MEOW_HASH_ARMV8

#define meow_u128 uint8x16_t

// NOTE(mmozeiko): AES opcodes on ARMv8 work a bit differently than on Intel
// On Intel the "x = AESDEC(x, m)" does following:
//   x = InvMixColumns(SubBytes(ShiftRows(x))) ^ m
// But on ARMv8 the "x = AESDEC(x, m)" does following:
//   x = SubBytes(ShiftRows(x ^ m))
// Thus on ARMv8 it requires extra InvMixColumns call and delay on Xor operation.
// On iteration N it needs to use m[N-1] as input, and remeber m[N] for next iteration.
// This structure will store memory operand in member B which will be used in
// next AESDEC opcode. Remember to do one more XOR(A,B) when finishing AES
// operations in a loop.
typedef struct {
    meow_u128 A;
    meow_u128 B;
} meow_aes_128;

#define MeowU32From(A, I) (vgetq_lane_u32(vreinterpretq_u32_u8((A)), (I)))
#define MeowU64From(A, I) (vgetq_lane_u64(vreinterpretq_u64_u8((A)), (I)))

static int
MeowHashesAreEqualImpl(meow_u128 A, meow_u128 B)
{
    uint8x16_t Powers = {
        1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128,
    };

    uint8x16_t Input = vceqq_u8(A, B);
    uint64x2_t Mask = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8(Input, Powers))));

    meow_u16 Output;
    vst1q_lane_u8((meow_u8*)&Output + 0, vreinterpretq_u8_u64(Mask), 0);
    vst1q_lane_u8((meow_u8*)&Output + 1, vreinterpretq_u8_u64(Mask), 8);
    return Output == 0xFFFF;
}

#define MeowHashesAreEqual(A, B) MeowHashesAreEqualImpl((A), (B))

static meow_aes_128
Meow128_AESDEC(meow_aes_128 Prior, meow_u128 Xor)
{
    meow_aes_128 R;
    R.A = vaesimcq_u8(vaesdq_u8(Prior.A, Prior.B));
    R.B = Xor;
    return(R);
}

static meow_aes_128
Meow128_AESDEC_Mem(meow_aes_128 Prior, void *Xor)
{
    meow_aes_128 R;
    R.A = vaesimcq_u8(vaesdq_u8(Prior.A, Prior.B));
    R.B = vld1q_u8((meow_u8*)Xor);
    return(R);
}

static meow_u128
Meow128_AESDEC_Finalize(meow_aes_128 Value)
{
    meow_u128 R = veorq_u8(Value.A, Value.B);
    return(R);
}

static meow_u128
Meow128_Zero()
{
    meow_u128 R = vdupq_n_u8(0);
    return(R);
}

static meow_aes_128
Meow128_GetAESConstant(const meow_u8 *Ptr)
{
    meow_aes_128 R;
    R.A = vld1q_u8(Ptr);
    R.B = vdupq_n_u8(0);
    return(R);
}

static meow_u128
Meow128_Set64x2(meow_u64 Low64, meow_u64 High64)
{
   meow_u128 R = vreinterpretq_u8_u64(vcombine_u64(vcreate_u64(Low64), vcreate_u64(High64)));
   return(R);
}

static meow_aes_128
Meow128_Set64x2_State(meow_u64 Low64, meow_u64 High64)
{
   meow_aes_128 R;
   R.A = Meow128_Set64x2(Low64, High64);
   R.B = Meow128_Zero();
   return(R);
}

#define Meow128_And_Mem(A,B) vandq_u8((A), vld1q_u8((meow_u8 *)B))
#define Meow128_Shuffle_Mem(Mem,Control) vqtbl1q_u8(vld1q_u8((meow_u8 *)(Mem)),vld1q_u8((meow_u8 *)(Control)))

#endif

#define MEOW_HASH_VERSION 4
#define MEOW_HASH_VERSION_NAME "0.4/himalayan"

#if MEOW_INCLUDE_C

// NOTE(casey): Unfortunately, if you want an ANSI-C version, we have to slow everyone
// else down because you can't return 128-bit values by register anymore (in case the
// CPU doesn't support that)
union meow_hash
{
    meow_u128 u128;
    meow_u64 u64[2];
    meow_u32 u32[4];
};
#define Meow128_CopyToHash(A, B) ((B).u128 = (A))

#undef MeowU64From
#undef MeowU32From
#undef MeowHashesAreEqual
#define MeowU32From(A, I) ((A).u32[I])
#define MeowU64From(A, I) ((A).u64[I])
#define MeowHashesAreEqual(A, B) (((A).u32[0] == (B).u32[0]) && ((A).u32[1] == (B).u32[1]) && ((A).u32[2] == (B).u32[2]) && ((A).u32[3] == (B).u32[3]))

#else

typedef meow_u128 meow_hash;
#define Meow128_CopyToHash(A, B) ((B) = (A))

#endif

typedef struct meow_hash_state meow_hash_state;
typedef meow_hash meow_hash_implementation(meow_u64 Seed, meow_u64 Len, void *Source);
typedef void meow_absorb_implementation(struct meow_hash_state *State, meow_u64 Len, void *Source);

#define MEOW_HASH_INTRINSICS_H
#endif


================================================
FILE: benchmarks/metrohash64.cpp
================================================
// metrohash64.cpp
//
// Copyright 2015-2018 J. Andrew Rogers
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "platform.h"
#include "metrohash64.h"

#include <cstring>

const char * MetroHash64::test_string = "012345678901234567890123456789012345678901234567890123456789012";

const uint8_t MetroHash64::test_seed_0[8] =   { 0x6B, 0x75, 0x3D, 0xAE, 0x06, 0x70, 0x4B, 0xAD };
const uint8_t MetroHash64::test_seed_1[8] =   { 0x3B, 0x0D, 0x48, 0x1C, 0xF4, 0xB9, 0xB8, 0xDF };


MetroHash64::MetroHash64(const uint64_t seed)
{
    Initialize(seed);
}


void MetroHash64::Initialize(const uint64_t seed)
{
    vseed = (static_cast<uint64_t>(seed) + k2) * k0;

    // initialize internal hash registers
    state.v[0] = vseed;
    state.v[1] = vseed;
    state.v[2] = vseed;
    state.v[3] = vseed;

    // initialize total length of input
    bytes = 0;
}


void MetroHash64::Update(const uint8_t * const buffer, const uint64_t length)
{
    const uint8_t * ptr = reinterpret_cast<const uint8_t*>(buffer);
    const uint8_t * const end = ptr + length;

    // input buffer may be partially filled
    if (bytes % 32)
    {
        uint64_t fill = 32 - (bytes % 32);
        if (fill > length)
            fill = length;

        memcpy(input.b + (bytes % 32), ptr, static_cast<size_t>(fill));
        ptr   += fill;
        bytes += fill;
        
        // input buffer is still partially filled
        if ((bytes % 32) != 0) return;

        // process full input buffer
        state.v[0] += read_u64(&input.b[ 0]) * k0; state.v[0] = rotate_right(state.v[0],29) + state.v[2];
        state.v[1] += read_u64(&input.b[ 8]) * k1; state.v[1] = rotate_right(state.v[1],29) + state.v[3];
        state.v[2] += read_u64(&input.b[16]) * k2; state.v[2] = rotate_right(state.v[2],29) + state.v[0];
        state.v[3] += read_u64(&input.b[24]) * k3; state.v[3] = rotate_right(state.v[3],29) + state.v[1];
    }
    
    // bulk update
    bytes += static_cast<uint64_t>(end - ptr);
    while (ptr <= (end - 32))
    {
        // process directly from the source, bypassing the input buffer
        state.v[0] += read_u64(ptr) * k0; ptr += 8; state.v[0] = rotate_right(state.v[0],29) + state.v[2];
        state.v[1] += read_u64(ptr) * k1; ptr += 8; state.v[1] = rotate_right(state.v[1],29) + state.v[3];
        state.v[2] += read_u64(ptr) * k2; ptr += 8; state.v[2] = rotate_right(state.v[2],29) + state.v[0];
        state.v[3] += read_u64(ptr) * k3; ptr += 8; state.v[3] = rotate_right(state.v[3],29) + state.v[1];
    }
    
    // store remaining bytes in input buffer
    if (ptr < end)
        memcpy(input.b, ptr, static_cast<size_t>(end - ptr));
}


void MetroHash64::Finalize(uint8_t * const hash)
{
    // finalize bulk loop, if used
    if (bytes >= 32)
    {
        state.v[2] ^= rotate_right(((state.v[0] + state.v[3]) * k0) + state.v[1], 37) * k1;
        state.v[3] ^= rotate_right(((state.v[1] + state.v[2]) * k1) + state.v[0], 37) * k0;
        state.v[0] ^= rotate_right(((state.v[0] + state.v[2]) * k0) + state.v[3], 37) * k1;
        state.v[1] ^= rotate_right(((state.v[1] + state.v[3]) * k1) + state.v[2], 37) * k0;

        state.v[0] = vseed + (state.v[0] ^ state.v[1]);
    }
    
    // process any bytes remaining in the input buffer
    const uint8_t * ptr = reinterpret_cast<const uint8_t*>(input.b);
    const uint8_t * const end = ptr + (bytes % 32);
    
    if ((end - ptr) >= 16)
    {
        state.v[1]  = state.v[0] + (read_u64(ptr) * k2); ptr += 8; state.v[1] = rotate_right(state.v[1],29) * k3;
        state.v[2]  = state.v[0] + (read_u64(ptr) * k2); ptr += 8; state.v[2] = rotate_right(state.v[2],29) * k3;
        state.v[1] ^= rotate_right(state.v[1] * k0, 21) + state.v[2];
        state.v[2] ^= rotate_right(state.v[2] * k3, 21) + state.v[1];
        state.v[0] += state.v[2];
    }

    if ((end - ptr) >= 8)
    {
        state.v[0] += read_u64(ptr) * k3; ptr += 8;
        state.v[0] ^= rotate_right(state.v[0], 55) * k1;
    }

    if ((end - ptr) >= 4)
    {
        state.v[0] += read_u32(ptr) * k3; ptr += 4;
        state.v[0] ^= rotate_right(state.v[0], 26) * k1;
    }

    if ((end - ptr) >= 2)
    {
        state.v[0] += read_u16(ptr) * k3; ptr += 2;
        state.v[0] ^= rotate_right(state.v[0], 48) * k1;
    }

    if ((end - ptr) >= 1)
    {
        state.v[0] += read_u8 (ptr) * k3;
        state.v[0] ^= rotate_right(state.v[0], 37) * k1;
    }
    
    state.v[0] ^= rotate_right(state.v[0], 28);
    state.v[0] *= k0;
    state.v[0] ^= rotate_right(state.v[0], 29);

    bytes = 0;

    // do any endian conversion here

    memcpy(hash, state.v, 8);
}


void MetroHash64::Hash(const uint8_t * buffer, const uint64_t length, uint8_t * const hash, const uint64_t seed)
{
    const uint8_t * ptr = reinterpret_cast<const uint8_t*>(buffer);
    const uint8_t * const end = ptr + length;

    uint64_t h = (static_cast<uint64_t>(seed) + k2) * k0;

    if (length >= 32)
    {
        uint64_t v[4];
        v[0] = h;
        v[1] = h;
        v[2] = h;
        v[3] = h;

        do
        {
            v[0] += read_u64(ptr) * k0; ptr += 8; v[0] = rotate_right(v[0],29) + v[2];
            v[1] += read_u64(ptr) * k1; ptr += 8; v[1] = rotate_right(v[1],29) + v[3];
            v[2] += read_u64(ptr) * k2; ptr += 8; v[2] = rotate_right(v[2],29) + v[0];
            v[3] += read_u64(ptr) * k3; ptr += 8; v[3] = rotate_right(v[3],29) + v[1];
        }
        while (ptr <= (end - 32));

        v[2] ^= rotate_right(((v[0] + v[3]) * k0) + v[1], 37) * k1;
        v[3] ^= rotate_right(((v[1] + v[2]) * k1) + v[0], 37) * k0;
        v[0] ^= rotate_right(((v[0] + v[2]) * k0) + v[3], 37) * k1;
        v[1] ^= rotate_right(((v[1] + v[3]) * k1) + v[2], 37) * k0;
        h += v[0] ^ v[1];
    }

    if ((end - ptr) >= 16)
    {
        uint64_t v0 = h + (read_u64(ptr) * k2); ptr += 8; v0 = rotate_right(v0,29) * k3;
        uint64_t v1 = h + (read_u64(ptr) * k2); ptr += 8; v1 = rotate_right(v1,29) * k3;
        v0 ^= rotate_right(v0 * k0, 21) + v1;
        v1 ^= rotate_right(v1 * k3, 21) + v0;
        h += v1;
    }

    if ((end - ptr) >= 8)
    {
        h += read_u64(ptr) * k3; ptr += 8;
        h ^= rotate_right(h, 55) * k1;
    }

    if ((end - ptr) >= 4)
    {
        h += read_u32(ptr) * k3; ptr += 4;
        h ^= rotate_right(h, 26) * k1;
    }

    if ((end - ptr) >= 2)
    {
        h += read_u16(ptr) * k3; ptr += 2;
        h ^= rotate_right(h, 48) * k1;
    }

    if ((end - ptr) >= 1)
    {
        h += read_u8 (ptr) * k3;
        h ^= rotate_right(h, 37) * k1;
    }

    h ^= rotate_right(h, 28);
    h *= k0;
    h ^= rotate_right(h, 29);

    memcpy(hash, &h, 8);
}


bool MetroHash64::ImplementationVerified()
{
    uint8_t hash[8];
    const uint8_t * key = reinterpret_cast<const uint8_t *>(MetroHash64::test_string);

    // verify one-shot implementation
    MetroHash64::Hash(key, strlen(MetroHash64::test_string), hash, 0);
    if (memcmp(hash, MetroHash64::test_seed_0, 8) != 0) return false;

    MetroHash64::Hash(key, strlen(MetroHash64::test_string), hash, 1);
    if (memcmp(hash, MetroHash64::test_seed_1, 8) != 0) return false;

    // verify incremental implementation
    MetroHash64 metro;
    
    metro.Initialize(0);
    metro.Update(reinterpret_cast<const uint8_t *>(MetroHash64::test_string), strlen(MetroHash64::test_string));
    metro.Finalize(hash);
    if (memcmp(hash, MetroHash64::test_seed_0, 8) != 0) return false;

    metro.Initialize(1);
    metro.Update(reinterpret_cast<const uint8_t *>(MetroHash64::test_string), strlen(MetroHash64::test_string));
    metro.Finalize(hash);
    if (memcmp(hash, MetroHash64::test_seed_1, 8) != 0) return false;

    return true;
}


void metrohash64_1(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out)
{
    static const uint64_t k0 = 0xC83A91E1;
    static const uint64_t k1 = 0x8648DBDB;
    static const uint64_t k2 = 0x7BDEC03B;
    static const uint64_t k3 = 0x2F5870A5;

    const uint8_t * ptr = reinterpret_cast<const uint8_t*>(key);
    const uint8_t * const end = ptr + len;
    
    uint64_t hash = ((static_cast<uint64_t>(seed) + k2) * k0) + len;
    
    if (len >= 32)
    {
        uint64_t v[4];
        v[0] = hash;
        v[1] = hash;
        v[2] = hash;
        v[3] = hash;
        
        do
        {
            v[0] += read_u64(ptr) * k0; ptr += 8; v[0] = rotate_right(v[0],29) + v[2];
            v[1] += read_u64(ptr) * k1; ptr += 8; v[1] = rotate_right(v[1],29) + v[3];
            v[2] += read_u64(ptr) * k2; ptr += 8; v[2] = rotate_right(v[2],29) + v[0];
            v[3] += read_u64(ptr) * k3; ptr += 8; v[3] = rotate_right(v[3],29) + v[1];
        }
        while (ptr <= (end - 32));

        v[2] ^= rotate_right(((v[0] + v[3]) * k0) + v[1], 33) * k1;
        v[3] ^= rotate_right(((v[1] + v[2]) * k1) + v[0], 33) * k0;
        v[0] ^= rotate_right(((v[0] + v[2]) * k0) + v[3], 33) * k1;
        v[1] ^= rotate_right(((v[1] + v[3]) * k1) + v[2], 33) * k0;
        hash += v[0] ^ v[1];
    }
    
    if ((end - ptr) >= 16)
    {
        uint64_t v0 = hash + (read_u64(ptr) * k0); ptr += 8; v0 = rotate_right(v0,33) * k1;
        uint64_t v1 = hash + (read_u64(ptr) * k1); ptr += 8; v1 = rotate_right(v1,33) * k2;
        v0 ^= rotate_right(v0 * k0, 35) + v1;
        v1 ^= rotate_right(v1 * k3, 35) + v0;
        hash += v1;
    }
    
    if ((end - ptr) >= 8)
    {
        hash += read_u64(ptr) * k3; ptr += 8;
        hash ^= rotate_right(hash, 33) * k1;
        
    }
    
    if ((end - ptr) >= 4)
    {
        hash += read_u32(ptr) * k3; ptr += 4;
        hash ^= rotate_right(hash, 15) * k1;
    }
    
    if ((end - ptr) >= 2)
    {
        hash += read_u16(ptr) * k3; ptr += 2;
        hash ^= rotate_right(hash, 13) * k1;
    }
    
    if ((end - ptr) >= 1)
    {
        hash += read_u8 (ptr) * k3;
        hash ^= rotate_right(hash, 25) * k1;
    }
    
    hash ^= rotate_right(hash, 33);
    hash *= k0;
    hash ^= rotate_right(hash, 33);

    memcpy(out, &hash, 8);
}


void metrohash64_2(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out)
{
    static const uint64_t k0 = 0xD6D018F5;
    static const uint64_t k1 = 0xA2AA033B;
    static const uint64_t k2 = 0x62992FC1;
    static const uint64_t k3 = 0x30BC5B29; 

    const uint8_t * ptr = reinterpret_cast<const uint8_t*>(key);
    const uint8_t * const end = ptr + len;
    
    uint64_t hash = ((static_cast<uint64_t>(seed) + k2) * k0) + len;
    
    if (len >= 32)
    {
        uint64_t v[4];
        v[0] = hash;
        v[1] = hash;
        v[2] = hash;
        v[3] = hash;
        
        do
        {
            v[0] += read_u64(ptr) * k0; ptr += 8; v[0] = rotate_right(v[0],29) + v[2];
            v[1] += read_u64(ptr) * k1; ptr += 8; v[1] = rotate_right(v[1],29) + v[3];
            v[2] += read_u64(ptr) * k2; ptr += 8; v[2] = rotate_right(v[2],29) + v[0];
            v[3] += read_u64(ptr) * k3; ptr += 8; v[3] = rotate_right(v[3],29) + v[1];
        }
        while (ptr <= (end - 32));

        v[2] ^= rotate_right(((v[0] + v[3]) * k0) + v[1], 30) * k1;
        v[3] ^= rotate_right(((v[1] + v[2]) * k1) + v[0], 30) * k0;
        v[0] ^= rotate_right(((v[0] + v[2]) * k0) + v[3], 30) * k1;
        v[1] ^= rotate_right(((v[1] + v[3]) * k1) + v[2], 30) * k0;
        hash += v[0] ^ v[1];
    }
    
    if ((end - ptr) >= 16)
    {
        uint64_t v0 = hash + (read_u64(ptr) * k2); ptr += 8; v0 = rotate_right(v0,29) * k3;
        uint64_t v1 = hash + (read_u64(ptr) * k2); ptr += 8; v1 = rotate_right(v1,29) * k3;
        v0 ^= rotate_right(v0 * k0, 34) + v1;
        v1 ^= rotate_right(v1 * k3, 34) + v0;
        hash += v1;
    }
    
    if ((end - ptr) >= 8)
    {
        hash += read_u64(ptr) * k3; ptr += 8;
        hash ^= rotate_right(hash, 36) * k1;
    }
    
    if ((end - ptr) >= 4)
    {
        hash += read_u32(ptr) * k3; ptr += 4;
        hash ^= rotate_right(hash, 15) * k1;
    }
    
    if ((end - ptr) >= 2)
    {
        hash += read_u16(ptr) * k3; ptr += 2;
        hash ^= rotate_right(hash, 15) * k1;
    }
    
    if ((end - ptr) >= 1)
    {
        hash += read_u8 (ptr) * k3;
        hash ^= rotate_right(hash, 23) * k1;
    }
    
    hash ^= rotate_right(hash, 28);
    hash *= k0;
    hash ^= rotate_right(hash, 29);

    memcpy(out, &hash, 8);
}


================================================
FILE: benchmarks/metrohash64.h
================================================
// metrohash64.h
//
// Copyright 2015-2018 J. Andrew Rogers
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef METROHASH_METROHASH_64_H
#define METROHASH_METROHASH_64_H

#include <stdint.h>

class MetroHash64
{
public:
    static const uint32_t bits = 64;

    // Constructor initializes the same as Initialize()
    MetroHash64(const uint64_t seed=0);
    
    // Initializes internal state for new hash with optional seed
    void Initialize(const uint64_t seed=0);
    
    // Update the hash state with a string of bytes. If the length
    // is sufficiently long, the implementation switches to a bulk
    // hashing algorithm directly on the argument buffer for speed.
    void Update(const uint8_t * buffer, const uint64_t length);
    
    // Constructs the final hash and writes it to the argument buffer.
    // After a hash is finalized, this instance must be Initialized()-ed
    // again or the behavior of Update() and Finalize() is undefined.
    void Finalize(uint8_t * const hash);
    
    // A non-incremental function implementation. This can be significantly
    // faster than the incremental implementation for some usage patterns.
    static void Hash(const uint8_t * buffer, const uint64_t length, uint8_t * const hash, const uint64_t seed=0);

    // Does implementation correctly execute test vectors?
    static bool ImplementationVerified();

    // test vectors -- Hash(test_string, seed=0) => test_seed_0
    static const char * test_string;
    static const uint8_t test_seed_0[8];
    static const uint8_t test_seed_1[8];

private:
    static const uint64_t k0 = 0xD6D018F5;
    static const uint64_t k1 = 0xA2AA033B;
    static const uint64_t k2 = 0x62992FC1;
    static const uint64_t k3 = 0x30BC5B29;
    
    struct { uint64_t v[4]; } state;
    struct { uint8_t b[32]; } input;
    uint64_t bytes;
    uint64_t vseed;
};


// Legacy 64-bit hash functions -- do not use
void metrohash64_1(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out);
void metrohash64_2(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out);


#endif // #ifndef METROHASH_METROHASH_64_H


================================================
FILE: benchmarks/mum512-prng.h
================================================
/* Copyright (c) 2016 Vladimir Makarov <vmakarov@gcc.gnu.org>

   Permission is hereby granted, free of charge, to any person
   obtaining a copy of this software and associated documentation
   files (the "Software"), to deal in the Software without
   restriction, including without limitation the rights to use, copy,
   modify, merge, publish, distribute, sublicense, and/or sell copies
   of the Software, and to permit persons to whom the Software is
   furnished to do so, subject to the following conditions:

   The above copyright notice and this permission notice shall be
   included in all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
   SOFTWARE.
*/

/* Pseudo Random Number Generator (PRNG) based on MUM512 hash
   function.  It might be a crypto level PRNG.

   To use a generator call `init_mum512_prng` first, then call
   `get_mum512_prn` as much as you want to get a new PRN.  At the end
   of the PRNG use, call `finish_mum512_prng`.  You can change the
   default seed by calling set_mum512_seed.

   The PRNG passes NIST Statistical Test Suite for Random and
   Pseudorandom Number Generators for Cryptographic Applications
   (version 2.2.1) with 1000 bitstreams each containing 1M bits.

   The generation of a new number takes about 62 CPU cycles on x86_64
   (Intel 4.2GHz i7-4790K), or speed of the generation is about 67M
   numbers per sec. */

#ifndef __MUM512_PRNG__
#define __MUM512_PRNG__

#include "mum512.h"

/* Default seed (initial state).  */
static uint64_t mum512_start_seed[4][2] = {
    {0Xc0e484a76bbdbf2bULL, 0Xea0a397c33eb3fafULL},
    {0Xf445f7c55f41d3e6ULL, 0X3bd8dc401433c0ddULL},
    {0Xf35d8101c6625902ULL, 0X4dfa7e49a5c0fbf9ULL},
    {0X69695bb2eec9914bULL, 0Xac3cb4a350e58879ULL},
};     

static struct {
  /* An index in the output used to generate the current PRN. */
  int ind;
  /* count from which the current state generated.  */
  _mc_ti count;
  /* The current MUM512 state.  */
  _mc_ti state[4];
  /* The digest used to generate the next 8 PRNs.  */
  uint64_t output[8]; 
} mum512_prng_state;

static inline void
init_mum512_prng (void) {
  int i;

  mum512_prng_state.ind = 0; mum512_prng_state.count = _mc_const (0, 0);
  for (i = 0; i < 4; i++)
    mum512_prng_state.state[i] = _mc_get (mum512_start_seed[i]);
}

/* Make the state equal to {SEED, 0, ..., 0}.  */
static inline void
set_mum512_seed (uint32_t seed) {
  int i;
  
  mum512_prng_state.state[0] = _mc_const (seed, 0);
  for (i = 1; i < 4; i++)
    mum512_prng_state.state[i] = _mc_const (0, 0);
}


static inline uint64_t
get_mum512_prn (void) {
  uint64_t res;
  
  if (mum512_prng_state.ind == 0) {
    mum512_prng_state.count = _mc_add (mum512_prng_state.count, _mc_const (0, 0));
    _mc_hash_aligned (mum512_prng_state.state, &mum512_prng_state.count, 16);
    _mc_mix (mum512_prng_state.state, mum512_prng_state.output);
  }
  res = mum512_prng_state.output[mum512_prng_state.ind];
  mum512_prng_state.ind = (mum512_prng_state.ind + 1) & 0x7;
  return res;
}

static inline void
finish_mum512_prng (void) {
}

#endif


================================================
FILE: benchmarks/platform.h
================================================
// platform.h
//
// The MIT License (MIT)
//
// Copyright (c) 2015 J. Andrew Rogers
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
//

#ifndef METROHASH_PLATFORM_H
#define METROHASH_PLATFORM_H

#include <stdint.h>

// rotate right idiom recognized by most compilers
inline static uint64_t rotate_right(uint64_t v, unsigned k)
{
    return (v >> k) | (v << (64 - k));
}

// unaligned reads, fast and safe on Nehalem and later microarchitectures
inline static uint64_t read_u64(const void * const ptr)
{
    return static_cast<uint64_t>(*reinterpret_cast<const uint64_t*>(ptr));
}

inline static uint64_t read_u32(const void * const ptr)
{
    return static_cast<uint64_t>(*reinterpret_cast<const uint32_t*>(ptr));
}

inline static uint64_t read_u16(const void * const ptr)
{
    return static_cast<uint64_t>(*reinterpret_cast<const uint16_t*>(ptr));
}

inline static uint64_t read_u8 (const void * const ptr)
{
    return static_cast<uint64_t>(*reinterpret_cast<const uint8_t *>(ptr));
}


#endif // #ifndef METROHASH_PLATFORM_H


================================================
FILE: benchmarks/rapidhash.h
================================================
/*
 * rapidhash V3 - Very fast, high quality, platform-independent hashing algorithm.
 *
 * Based on 'wyhash', by Wang Yi <godspeed_china@yeah.net>
 * 
 * Copyright (C) 2025 Nicolas De Carli
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * You can contact the author at:
 *   - rapidhash source repository: https://github.com/Nicoshev/rapidhash
 */

 #pragma once
 
/*
 *  Includes.
 */
 #include <stdint.h>
 #include <string.h>
 #if defined(_MSC_VER)
 # include <intrin.h>
 # if defined(_M_X64) && !defined(_M_ARM64EC)
 #   pragma intrinsic(_umul128)
 # endif
 #endif
 
 /*
  *  C/C++ macros.
  */
 
 #ifdef _MSC_VER
 # define RAPIDHASH_ALWAYS_INLINE __forceinline
 #elif defined(__GNUC__)
 # define RAPIDHASH_ALWAYS_INLINE inline __attribute__((__always_inline__))
 #else
 # define RAPIDHASH_ALWAYS_INLINE inline
 #endif
 
 #ifdef __cplusplus
 # define RAPIDHASH_NOEXCEPT noexcept
 # define RAPIDHASH_CONSTEXPR constexpr
 # ifndef RAPIDHASH_INLINE
 #   define RAPIDHASH_INLINE RAPIDHASH_ALWAYS_INLINE
 # endif
 # if __cplusplus >= 201402L && !defined(_MSC_VER)
 #   define RAPIDHASH_INLINE_CONSTEXPR RAPIDHASH_ALWAYS_INLINE constexpr
 # else
 #   define RAPIDHASH_INLINE_CONSTEXPR RAPIDHASH_ALWAYS_INLINE
 # endif
 #else
 # define RAPIDHASH_NOEXCEPT
 # define RAPIDHASH_CONSTEXPR static const
 # ifndef RAPIDHASH_INLINE
 #   define RAPIDHASH_INLINE static RAPIDHASH_ALWAYS_INLINE
 # endif
 # define RAPIDHASH_INLINE_CONSTEXPR RAPIDHASH_INLINE
 #endif

 /*
  *  Unrolled macro.
  *  Improves large input speed, but increases code size and worsens small input speed.
  *
  *  RAPIDHASH_COMPACT: Normal behavior.
  *  RAPIDHASH_UNROLLED: 
  *
  */
  #ifndef RAPIDHASH_UNROLLED
  # define RAPIDHASH_COMPACT
  #elif defined(RAPIDHASH_COMPACT)
  # error "cannot define RAPIDHASH_COMPACT and RAPIDHASH_UNROLLED simultaneously."
  #endif
 
 /*
  *  Protection macro, alters behaviour of rapid_mum multiplication function.
  *
  *  RAPIDHASH_FAST: Normal behavior, max speed.
  *  RAPIDHASH_PROTECTED: Extra protection against entropy loss.
  */
 #ifndef RAPIDHASH_PROTECTED
 # define RAPIDHASH_FAST
 #elif defined(RAPIDHASH_FAST)
 # error "cannot define RAPIDHASH_PROTECTED and RAPIDHASH_FAST simultaneously."
 #endif
 
 /*
  *  Likely and unlikely macros.
  */
 #if defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__clang__)
 # define _likely_(x)  __builtin_expect(x,1)
 # define _unlikely_(x)  __builtin_expect(x,0)
 #else
 # define _likely_(x) (x)
 # define _unlikely_(x) (x)
 #endif
 
 /*
  *  Endianness macros.
  */
 #ifndef RAPIDHASH_LITTLE_ENDIAN
 # if defined(_WIN32) || defined(__LITTLE_ENDIAN__) || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
 #   define RAPIDHASH_LITTLE_ENDIAN
 # elif defined(__BIG_ENDIAN__) || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
 #   define RAPIDHASH_BIG_ENDIAN
 # else
 #   warning "could not determine endianness! Falling back to little endian."
 #   define RAPIDHASH_LITTLE_ENDIAN
 # endif
 #endif
 
 /*
  *  Default secret parameters.
  */
   RAPIDHASH_CONSTEXPR uint64_t rapid_secret[8] = {
     0x2d358dccaa6c78a5ull,
     0x8bb84b93962eacc9ull,
     0x4b33a62ed433d4a3ull,
     0x4d5a2da51de1aa47ull,
     0xa0761d6478bd642full,
     0xe7037ed1a0b428dbull,
     0x90ed1765281c388cull,
     0xaaaaaaaaaaaaaaaaull};
 
 /*
  *  64*64 -> 128bit multiply function.
  *
  *  @param A  Address of 64-bit number.
  *  @param B  Address of 64-bit number.
  *
  *  Calculates 128-bit C = *A * *B.
  *
  *  When RAPIDHASH_FAST is defined:
  *  Overwrites A contents with C's low 64 bits.
  *  Overwrites B contents with C's high 64 bits.
  *
  *  When RAPIDHASH_PROTECTED is defined:
  *  Xors and overwrites A contents with C's low 64 bits.
  *  Xors and overwrites B contents with C's high 64 bits.
  */
 RAPIDHASH_INLINE_CONSTEXPR void rapid_mum(uint64_t *A, uint64_t *B) RAPIDHASH_NOEXCEPT {
 #if defined(__SIZEOF_INT128__)
   __uint128_t r=*A; r*=*B;
   #ifdef RAPIDHASH_PROTECTED
   *A^=(uint64_t)r; *B^=(uint64_t)(r>>64);
   #else
   *A=(uint64_t)r; *B=(uint64_t)(r>>64);
   #endif
 #elif defined(_MSC_VER) && (defined(_WIN64) || defined(_M_HYBRID_CHPE_ARM64))
   #if defined(_M_X64)
     #ifdef RAPIDHASH_PROTECTED
     uint64_t a, b;
     a=_umul128(*A,*B,&b);
     *A^=a;  *B^=b;
     #else
     *A=_umul128(*A,*B,B);
     #endif
   #else
     #ifdef RAPIDHASH_PROTECTED
     uint64_t a, b;
     b = __umulh(*A, *B);
     a = *A * *B;
     *A^=a;  *B^=b;
     #else
     uint64_t c = __umulh(*A, *B);
     *A = *A * *B;
     *B = c;
     #endif
   #endif
 #else
   uint64_t ha=*A>>32, hb=*B>>32, la=(uint32_t)*A, lb=(uint32_t)*B;
   uint64_t rh=ha*hb, rm0=ha*lb, rm1=hb*la, rl=la*lb, t=rl+(rm0<<32), c=t<rl;
   uint64_t lo=t+(rm1<<32); 
   c+=lo<t; 
   uint64_t hi=rh+(rm0>>32)+(rm1>>32)+c;
   #ifdef RAPIDHASH_PROTECTED
   *A^=lo;  *B^=hi;
   #else
   *A=lo;  *B=hi;
   #endif
 #endif
 }
 
 /*
  *  Multiply and xor mix function.
  *
  *  @param A  64-bit number.
  *  @param B  64-bit number.
  *
  *  Calculates 128-bit C = A * B.
  *  Returns 64-bit xor between high and low 64 bits of C.
  */
  RAPIDHASH_INLINE_CONSTEXPR uint64_t rapid_mix(uint64_t A, uint64_t B) RAPIDHASH_NOEXCEPT { rapid_mum(&A,&B); return A^B; }
 
 /*
  *  Read functions.
  */
 #ifdef RAPIDHASH_LITTLE_ENDIAN
 RAPIDHASH_INLINE uint64_t rapid_read64(const uint8_t *p) RAPIDHASH_NOEXCEPT { uint64_t v; memcpy(&v, p, sizeof(uint64_t)); return v;}
 RAPIDHASH_INLINE uint64_t rapid_read32(const uint8_t *p) RAPIDHASH_NOEXCEPT { uint32_t v; memcpy(&v, p, sizeof(uint32_t)); return v;}
 #elif defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__clang__)
 RAPIDHASH_INLINE uint64_t rapid_read64(const uint8_t *p) RAPIDHASH_NOEXCEPT { uint64_t v; memcpy(&v, p, sizeof(uint64_t)); return __builtin_bswap64(v);}
 RAPIDHASH_INLINE uint64_t rapid_read32(const uint8_t *p) RAPIDHASH_NOEXCEPT { uint32_t v; memcpy(&v, p, sizeof(uint32_t)); return __builtin_bswap32(v);}
 #elif defined(_MSC_VER)
 RAPIDHASH_INLINE uint64_t rapid_read64(const uint8_t *p) RAPIDHASH_NOEXCEPT { uint64_t v; memcpy(&v, p, sizeof(uint64_t)); return _byteswap_uint64(v);}
 RAPIDHASH_INLINE uint64_t rapid_read32(const uint8_t *p) RAPIDHASH_NOEXCEPT { uint32_t v; memcpy(&v, p, sizeof(uint32_t)); return _byteswap_ulong(v);}
 #else
 RAPIDHASH_INLINE uint64_t rapid_read64(const uint8_t *p) RAPIDHASH_NOEXCEPT {
   uint64_t v; memcpy(&v, p, 8);
   return (((v >> 56) & 0xff)| ((v >> 40) & 0xff00)| ((v >> 24) & 0xff0000)| ((v >>  8) & 0xff000000)| ((v <<  8) & 0xff00000000)| ((v << 24) & 0xff0000000000)| ((v << 40) & 0xff000000000000)| ((v << 56) & 0xff00000000000000));
 }
 RAPIDHASH_INLINE uint64_t rapid_read32(const uint8_t *p) RAPIDHASH_NOEXCEPT {
   uint32_t v; memcpy(&v, p, 4);
   return (((v >> 24) & 0xff)| ((v >>  8) & 0xff00)| ((v <<  8) & 0xff0000)| ((v << 24) & 0xff000000));
 }
 #endif
 
 /*
  *  rapidhash main function.
  *
  *  @param key     Buffer to be hashed.
  *  @param len     @key length, in bytes.
  *  @param seed    64-bit seed used to alter the hash result predictably.
  *  @param secret  Triplet of 64-bit secrets used to alter hash result predictably.
  *
  *  Returns a 64-bit hash.
  */
RAPIDHASH_INLINE_CONSTEXPR uint64_t rapidhash_internal(const void *key, size_t len, uint64_t seed, const uint64_t* secret) RAPIDHASH_NOEXCEPT {
  const uint8_t *p=(const uint8_t *)key;
  seed ^= rapid_mix(seed ^ secret[2], secret[1]);
  uint64_t a=0, b=0;
  size_t i = len;
  if (_likely_(len <= 16)) {
    if (len >= 4) {
      seed ^= len;
      if (len >= 8) {
        const uint8_t* plast = p + len - 8;
        a = rapid_read64(p);
        b = rapid_read64(plast);
      } else {
        const uint8_t* plast = p + len - 4;
        a = rapid_read32(p);
        b = rapid_read32(plast);
      }
    } else if (len > 0) {
      a = (((uint64_t)p[0])<<45)|p[len-1];
      b = p[len>>1];
    } else
      a = b = 0;
  } else {
    if (len > 112) {
      uint64_t see1 = seed, see2 = seed;
      uint64_t see3 = seed, see4 = seed;
      uint64_t see5 = seed, see6 = seed;
#ifdef RAPIDHASH_COMPACT
      do {
        seed = rapid_mix(rapid_read64(p) ^ secret[0], rapid_read64(p + 8) ^ seed);
        see1 = rapid_mix(rapid_read64(p + 16) ^ secret[1], rapid_read64(p + 24) ^ see1);
        see2 = rapid_mix(rapid_read64(p + 32) ^ secret[2], rapid_read64(p + 40) ^ see2);
        see3 = rapid_mix(rapid_read64(p + 48) ^ secret[3], rapid_read64(p + 56) ^ see3);
        see4 = rapid_mix(rapid_read64(p + 64) ^ secret[4], rapid_read64(p + 72) ^ see4);
        see5 = rapid_mix(rapid_read64(p + 80) ^ secret[5], rapid_read64(p + 88) ^ see5);
        see6 = rapid_mix(rapid_read64(p + 96) ^ secret[6], rapid_read64(p + 104) ^ see6);
        p += 112;
        i -= 112;
      } while(i > 112);
#else
      while (i > 224) {
        seed = rapid_mix(rapid_read64(p) ^ secret[0], rapid_read64(p + 8) ^ seed);
        see1 = rapid_mix(rapid_read64(p + 16) ^ secret[1], rapid_read64(p + 24) ^ see1);
        see2 = rapid_mix(rapid_read64(p + 32) ^ secret[2], rapid_read64(p + 40) ^ see2);
        see3 = rapid_mix(rapid_read64(p + 48) ^ secret[3], rapid_read64(p + 56) ^ see3);
        see4 = rapid_mix(rapid_read64(p + 64) ^ secret[4], rapid_read64(p + 72) ^ see4);
        see5 = rapid_mix(rapid_read64(p + 80) ^ secret[5], rapid_read64(p + 88) ^ see5);
        see6 = rapid_mix(rapid_read64(p + 96) ^ secret[6], rapid_read64(p + 104) ^ see6);
        seed = rapid_mix(rapid_read64(p + 112) ^ secret[0], rapid_read64(p + 120) ^ seed);
        see1 = rapid_mix(rapid_read64(p + 128) ^ secret[1], rapid_read64(p + 136) ^ see1);
        see2 = rapid_mix(rapid_read64(p + 144) ^ secret[2], rapid_read64(p + 152) ^ see2);
        see3 = rapid_mix(rapid_read64(p + 160) ^ secret[3], rapid_read64(p + 168) ^ see3);
        see4 = rapid_mix(rapid_read64(p + 176) ^ secret[4], rapid_read64(p + 184) ^ see4);
        see5 = rapid_mix(rapid_read64(p + 192) ^ secret[5], rapid_read64(p + 200) ^ see5);
        see6 = rapid_mix(rapid_read64(p + 208) ^ secret[6], rapid_read64(p + 216) ^ see6);
        p += 224;
        i -= 224;
      }
      if (i > 112) {
        seed = rapid_mix(rapid_read64(p) ^ secret[0], rapid_read64(p + 8) ^ seed);
        see1 = rapid_mix(rapid_read64(p + 16) ^ secret[1], rapid_read64(p + 24) ^ see1);
        see2 = rapid_mix(rapid_read64(p + 32) ^ secret[2], rapid_read64(p + 40) ^ see2);
        see3 = rapid_mix(rapid_read64(p + 48) ^ secret[3], rapid_read64(p + 56) ^ see3);
        see4 = rapid_mix(rapid_read64(p + 64) ^ secret[4], rapid_read64(p + 72) ^ see4);
        see5 = rapid_mix(rapid_read64(p + 80) ^ secret[5], rapid_read64(p + 88) ^ see5);
        see6 = rapid_mix(rapid_read64(p + 96) ^ secret[6], rapid_read64(p + 104) ^ see6);
        p += 112;
        i -= 112;
      }
#endif
      seed ^= see1;
      see2 ^= see3;
      see4 ^= see5;
      seed ^= see6;
      see2 ^= see4;
      seed ^= see2;
    }
    if (i > 16) {
      seed = rapid_mix(rapid_read64(p) ^ secret[2], rapid_read64(p + 8) ^ seed);
      if (i > 32) {
          seed = rapid_mix(rapid_read64(p + 16) ^ secret[2], rapid_read64(p + 24) ^ seed);
          if (i > 48) {
              seed = rapid_mix(rapid_read64(p + 32) ^ secret[1], rapid_read64(p + 40) ^ seed);
              if (i > 64) {
                  seed = rapid_mix(rapid_read64(p + 48) ^ secret[1], rapid_read64(p + 56) ^ seed);
                  if (i > 80) {
                      seed = rapid_mix(rapid_read64(p + 64) ^ secret[2], rapid_read64(p + 72) ^ seed);
                      if (i > 96) {
                          seed = rapid_mix(rapid_read64(p + 80) ^ secret[1], rapid_read64(p + 88) ^ seed);
                      }
                  }
              }
          }
      }
    }
    a=rapid_read64(p+i-16) ^ i;  b=rapid_read64(p+i-8);
  }
  a ^= secret[1];
  b ^= seed;
  rapid_mum(&a, &b);
  return rapid_mix(a ^ secret[7], b ^ secret[1] ^ i);
}

 /*
  *  rapidhashMicro main function.
  *
  *  @param key     Buffer to be hashed.
  *  @param len     @key length, in bytes.
  *  @param seed    64-bit seed used to alter the hash result predictably.
  *  @param secret  Triplet of 64-bit secrets used to alter hash result predictably.
  *
  *  Returns a 64-bit hash.
  */
  RAPIDHASH_INLINE_CONSTEXPR uint64_t rapidhashMicro_internal(const void *key, size_t len, uint64_t seed, const uint64_t* secret) RAPIDHASH_NOEXCEPT {
    const uint8_t *p=(const uint8_t *)key;
    seed ^= rapid_mix(seed ^ secret[2], secret[1]);
    uint64_t a=0, b=0;
    size_t i = len;
    if (_likely_(len <= 16)) {
      if (len >= 4) {
        seed ^= len;
        if (len >= 8) {
          const uint8_t* plast = p + len - 8;
          a = rapid_read64(p);
          b = rapid_read64(plast);
        } else {
          const uint8_t* plast = p + len - 4;
          a = rapid_read32(p);
          b = rapid_read32(plast);
        }
      } else if (len > 0) {
        a = (((uint64_t)p[0])<<45)|p[len-1];
        b = p[len>>1];
      } else
        a = b = 0;
    } else {
      if (i > 80) {
        uint64_t see1 = seed, see2 = seed;
        uint64_t see3 = seed, see4 = seed;
        do {
          seed = rapid_mix(rapid_read64(p) ^ secret[0], rapid_read64(p + 8) ^ seed);
          see1 = rapid_mix(rapid_read64(p + 16) ^ secret[1], rapid_read64(p + 24) ^ see1);
          see2 = rapid_mix(rapid_read64(p + 32) ^ secret[2], rapid_read64(p + 40) ^ see2);
          see3 = rapid_mix(rapid_read64(p + 48) ^ secret[3], rapid_read64(p + 56) ^ see3);
          see4 = rapid_mix(rapid_read64(p + 64) ^ secret[4], rapid_read64(p + 72) ^ see4);
          p += 80;
          i -= 80;
        } while(i > 80);
        seed ^= see1;
        see2 ^= see3;
        seed ^= see4;
        seed ^= see2;
      }
      if (i > 16) {
        seed = rapid_mix(rapid_read64(p) ^ secret[2], rapid_read64(p + 8) ^ seed);
        if (i > 32) {
            seed = rapid_mix(rapid_read64(p + 16) ^ secret[2], rapid_read64(p + 24) ^ seed);
            if (i > 48) {
                seed = rapid_mix(rapid_read64(p + 32) ^ secret[1], rapid_read64(p + 40) ^ seed);
                if (i > 64) {
                    seed = rapid_mix(rapid_read64(p + 48) ^ secret[1], rapid_read64(p + 56) ^ seed);
                }
            }
        }
      }
      a=rapid_read64(p+i-16) ^ i;  b=rapid_read64(p+i-8);
    }
    a ^= secret[1];
    b ^= seed;
    rapid_mum(&a, &b);
    return rapid_mix(a ^ secret[7], b ^ secret[1] ^ i);
  }

  /*
  *  rapidhashNano main function.
  *
  *  @param key     Buffer to be hashed.
  *  @param len     @key length, in bytes.
  *  @param seed    64-bit seed used to alter the hash result predictably.
  *  @param secret  Triplet of 64-bit secrets used to alter hash result predictably.
  *
  *  Returns a 64-bit hash.
  */
  RAPIDHASH_INLINE_CONSTEXPR uint64_t rapidhashNano_internal(const void *key, size_t len, uint64_t seed, const uint64_t* secret) RAPIDHASH_NOEXCEPT {
    const uint8_t *p=(const uint8_t *)key;
    seed ^= rapid_mix(seed ^ secret[2], secret[1]);
    uint64_t a=0, b=0;
    size_t i = len;
    if (_likely_(len <= 16)) {
      if (len >= 4) {
        seed ^= len;
        if (len >= 8) {
          const uint8_t* plast = p + len - 8;
          a = rapid_read64(p);
          b = rapid_read64(plast);
        } else {
          const uint8_t* plast = p + len - 4;
          a = rapid_read32(p);
          b = rapid_read32(plast);
        }
      } else if (len > 0) {
        a = (((uint64_t)p[0])<<45)|p[len-1];
        b = p[len>>1];
      } else
        a = b = 0;
    } else {
      if (i > 48) {
        uint64_t see1 = seed, see2 = seed;
        do {
          seed = rapid_mix(rapid_read64(p) ^ secret[0], rapid_read64(p + 8) ^ seed);
          see1 = rapid_mix(rapid_read64(p + 16) ^ secret[1], rapid_read64(p + 24) ^ see1);
          see2 = rapid_mix(rapid_read64(p + 32) ^ secret[2], rapid_read64(p + 40) ^ see2);
          p += 48;
          i -= 48;
        } while(i > 48);
        seed ^= see1;
        seed ^= see2;
      }
      if (i > 16) {
        seed = rapid_mix(rapid_read64(p) ^ secret[2], rapid_read64(p + 8) ^ seed);
        if (i > 32) {
            seed = rapid_mix(rapid_read64(p + 16) ^ secret[2], rapid_read64(p + 24) ^ seed);
        }
      }
      a=rapid_read64(p+i-16) ^ i;  b=rapid_read64(p+i-8);
    }
    a ^= secret[1];
    b ^= seed;
    rapid_mum(&a, &b);
    return rapid_mix(a ^ secret[7], b ^ secret[1] ^ i);
  }
 
/*
 *  rapidhash seeded hash function.
 *
 *  @param key     Buffer to be hashed.
 *  @param len     @key length, in bytes.
 *  @param seed    64-bit seed used to alter the hash result predictably.
 *
 *  Calls rapidhash_internal using provided parameters and default secrets.
 *
 *  Returns a 64-bit hash.
 */
RAPIDHASH_INLINE_CONSTEXPR uint64_t rapidhash_withSeed(const void *key, size_t len, uint64_t seed) RAPIDHASH_NOEXCEPT {
  return rapidhash_internal(key, len, seed, rapid_secret);
}
 
/*
 *  rapidhash general purpose hash function.
 *
 *  @param key     Buffer to be hashed.
 *  @param len     @key length, in bytes.
 *
 *  Calls rapidhash_withSeed using provided parameters and the default seed.
 *
 *  Returns a 64-bit hash.
 */
RAPIDHASH_INLINE_CONSTEXPR uint64_t rapidhash(const void *key, size_t len) RAPIDHASH_NOEXCEPT {
  return rapidhash_withSeed(key, len, 0);
}

/*
 *  rapidhashMicro seeded hash function.
 *
 *  Designed for HPC and server applications, where cache misses make a noticeable performance detriment.
 *  Clang-18+ compiles it to ~140 instructions without stack usage, both on x86-64 and aarch64.
 *  Faster for sizes up to 512 bytes, just 15%-20% slower for inputs above 1kb.
 *
 *  @param key     Buffer to be hashed.
 *  @param len     @key length, in bytes.
 *  @param seed    64-bit seed used to alter the hash result predictably.
 *
 *  Calls rapidhash_internal using provided parameters and default secrets.
 *
 *  Returns a 64-bit hash.
 */
 RAPIDHASH_INLINE_CONSTEXPR uint64_t rapidhashMicro_withSeed(const void *key, size_t len, uint64_t seed) RAPIDHASH_NOEXCEPT {
  return rapidhashMicro_internal(key, len, seed, rapid_secret);
}
 
/*
 *  rapidhashMicro hash function.
 *
 *  @param key     Buffer to be hashed.
 *  @param len     @key length, in bytes.
 *
 *  Calls rapidhash_withSeed using provided parameters and the default seed.
 *
 *  Returns a 64-bit hash.
 */
RAPIDHASH_INLINE_CONSTEXPR uint64_t rapidhashMicro(const void *key, size_t len) RAPIDHASH_NOEXCEPT {
  return rapidhashMicro_withSeed(key, len, 0);
}

/*
 *  rapidhashNano seeded hash function.
 *
 *  @param key     Buffer to be hashed.
 *  @param len     @key length, in bytes.
 *  @param seed    64-bit seed used to alter the hash result predictably.
 *
 *  Calls rapidhash_internal using provided parameters and default secrets.
 *
 *  Returns a 64-bit hash.
 */
 RAPIDHASH_INLINE_CONSTEXPR uint64_t rapidhashNano_withSeed(const void *key, size_t len, uint64_t seed) RAPIDHASH_NOEXCEPT {
  return rapidhashNano_internal(key, len, seed, rapid_secret);
}
 
/*
 *  rapidhashNano hash function.
 *
 *  Designed for Mobile and embedded applications, where keeping a small code size is a top priority.
 *  Clang-18+ compiles it to less than 100 instructions without stack usage, both on x86-64 and aarch64.
 *  The fastest for sizes up to 48 bytes, but may be considerably slower for larger inputs.
 *
 *  @param key     Buffer to be hashed.
 *  @param len     @key length, in bytes.
 *
 *  Calls rapidhash_withSeed using provided parameters and the default seed.
 *
 *  Returns a 64-bit hash.
 */
RAPIDHASH_INLINE_CONSTEXPR uint64_t rapidhashNano(const void *key, size_t len) RAPIDHASH_NOEXCEPT {
  return rapidhashNano_withSeed(key, len, 0);
}


================================================
FILE: benchmarks/sha3.c
================================================
/* sha3.c - an implementation of Secure Hash Algorithm 3 (Keccak).
 * based on the
 * The Keccak SHA-3 submission. Submission to NIST (Round 3), 2011
 * by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche
 *
 * Copyright: 2013 Aleksey Kravchenko <rhash.admin@gmail.com>
 *
 * Permission is hereby granted,  free of charge,  to any person  obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction,  including without limitation
 * the rights to  use, copy, modify,  merge, publish, distribute, sublicense,
 * and/or sell copies  of  the Software,  and to permit  persons  to whom the
 * Software is furnished to do so.
 *
 * This program  is  distributed  in  the  hope  that it will be useful,  but
 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE.  Use this program  at  your own risk!
 */

#include <assert.h>
#include <string.h>
#include "byte_order.h"
#include "sha3.h"

/* constants */
#define NumberOfRounds 24

/* SHA3 (Keccak) constants for 24 rounds */
static uint64_t keccak_round_constants[NumberOfRounds] = {
	I64(0x0000000000000001), I64(0x0000000000008082), I64(0x800000000000808A), I64(0x8000000080008000),
	I64(0x000000000000808B), I64(0x0000000080000001), I64(0x8000000080008081), I64(0x8000000000008009),
	I64(0x000000000000008A), I64(0x0000000000000088), I64(0x0000000080008009), I64(0x000000008000000A),
	I64(0x000000008000808B), I64(0x800000000000008B), I64(0x8000000000008089), I64(0x8000000000008003),
	I64(0x8000000000008002), I64(0x8000000000000080), I64(0x000000000000800A), I64(0x800000008000000A),
	I64(0x8000000080008081), I64(0x8000000000008080), I64(0x0000000080000001), I64(0x8000000080008008)
};

/* Initializing a sha3 context for given number of output bits */
static void rhash_keccak_init(sha3_ctx *ctx, unsigned bits)
{
	/* NB: The Keccak capacity parameter = bits * 2 */
	unsigned rate = 1600 - bits * 2;

	memset(ctx, 0, sizeof(sha3_ctx));
	ctx->block_size = rate / 8;
	assert(rate <= 1600 && (rate % 64) == 0);
}

/**
 * Initialize context before calculating hash.
 *
 * @param ctx context to initialize
 */
void rhash_sha3_224_init(sha3_ctx *ctx)
{
	rhash_keccak_init(ctx, 224);
}

/**
 * Initialize context before calculating hash.
 *
 * @param ctx context to initialize
 */
void rhash_sha3_256_init(sha3_ctx *ctx)
{
	rhash_keccak_init(ctx, 256);
}

/**
 * Initialize context before calculating hash.
 *
 * @param ctx context to initialize
 */
void rhash_sha3_384_init(sha3_ctx *ctx)
{
	rhash_keccak_init(ctx, 384);
}

/**
 * Initialize context before calculating hash.
 *
 * @param ctx context to initialize
 */
void rhash_sha3_512_init(sha3_ctx *ctx)
{
	rhash_keccak_init(ctx, 512);
}

/* Keccak theta() transformation */
static void keccak_theta(uint64_t *A)
{
	unsigned int x;
	uint64_t C[5], D[5];

	for (x = 0; x < 5; x++) {
		C[x] = A[x] ^ A[x + 5] ^ A[x + 10] ^ A[x + 15] ^ A[x + 20];
	}
	D[0] = ROTL64(C[1], 1) ^ C[4];
	D[1] = ROTL64(C[2], 1) ^ C[0];
	D[2] = ROTL64(C[3], 1) ^ C[1];
	D[3] = ROTL64(C[4], 1) ^ C[2];
	D[4] = ROTL64(C[0], 1) ^ C[3];

	for (x = 0; x < 5; x++) {
		A[x]      ^= D[x];
		A[x + 5]  ^= D[x];
		A[x + 10] ^= D[x];
		A[x + 15] ^= D[x];
		A[x + 20] ^= D[x];
	}
}

/* Keccak pi() transformation */
static void keccak_pi(uint64_t *A)
{
	uint64_t A1;
	A1 = A[1];
	A[ 1] = A[ 6];
	A[ 6] = A[ 9];
	A[ 9] = A[22];
	A[22] = A[14];
	A[14] = A[20];
	A[20] = A[ 2];
	A[ 2] = A[12];
	A[12] = A[13];
	A[13] = A[19];
	A[19] = A[23];
	A[23] = A[15];
	A[15] = A[ 4];
	A[ 4] = A[24];
	A[24] = A[21];
	A[21] = A[ 8];
	A[ 8] = A[16];
	A[16] = A[ 5];
	A[ 5] = A[ 3];
	A[ 3] = A[18];
	A[18] = A[17];
	A[17] = A[11];
	A[11] = A[ 7];
	A[ 7] = A[10];
	A[10] = A1;
	/* note: A[ 0] is left as is */
}

/* Keccak chi() transformation */
static void keccak_chi(uint64_t *A)
{
	int i;
	for (i = 0; i < 25; i += 5) {
		uint64_t A0 = A[0 + i], A1 = A[1 + i];
		A[0 + i] ^= ~A1 & A[2 + i];
		A[1 + i] ^= ~A[2 + i] & A[3 + i];
		A[2 + i] ^= ~A[3 + i] & A[4 + i];
		A[3 + i] ^= ~A[4 + i] & A0;
		A[4 + i] ^= ~A0 & A1;
	}
}

static void rhash_sha3_permutation(uint64_t *state)
{
	int round;
	for (round = 0; round < NumberOfRounds; round++)
	{
		keccak_theta(state);

		/* apply Keccak rho() transformation */
		state[ 1] = ROTL64(state[ 1],  1);
		state[ 2] = ROTL64(state[ 2], 62);
		state[ 3] = ROTL64(state[ 3], 28);
		state[ 4] = ROTL64(state[ 4], 27);
		state[ 5] = ROTL64(state[ 5], 36);
		state[ 6] = ROTL64(state[ 6], 44);
		state[ 7] = ROTL64(state[ 7],  6);
		state[ 8] = ROTL64(state[ 8], 55);
		state[ 9] = ROTL64(state[ 9], 20);
		state[10] = ROTL64(state[10],  3);
		state[11] = ROTL64(state[11], 10);
		state[12] = ROTL64(state[12], 43);
		state[13] = ROTL64(state[13], 25);
		state[14] = ROTL64(state[14], 39);
		state[15] = ROTL64(state[15], 41);
		state[16] = ROTL64(state[16], 45);
		state[17] = ROTL64(state[17], 15);
		state[18] = ROTL64(state[18], 21);
		state[19] = ROTL64(state[19],  8);
		state[20] = ROTL64(state[20], 18);
		state[21] = ROTL64(state[21],  2);
		state[22] = ROTL64(state[22], 61);
		state[23] = ROTL64(state[23], 56);
		state[24] = ROTL64(state[24], 14);

		keccak_pi(state);
		keccak_chi(state);

		/* apply iota(state, round) */
		*state ^= keccak_round_constants[round];
	}
}

/**
 * The core transformation. Process the specified block of data.
 *
 * @param hash the algorithm state
 * @param block the message block to process
 * @param block_size the size of the processed block in bytes
 */
static void rhash_sha3_process_block(uint64_t hash[25], const uint64_t *block, size_t block_size)
{
	/* expanded loop */
	hash[ 0] ^= le2me_64(block[ 0]);
	hash[ 1] ^= le2me_64(block[ 1]);
	hash[ 2] ^= le2me_64(block[ 2]);
	hash[ 3] ^= le2me_64(block[ 3]);
	hash[ 4] ^= le2me_64(block[ 4]);
	hash[ 5] ^= le2me_64(block[ 5]);
	hash[ 6] ^= le2me_64(block[ 6]);
	hash[ 7] ^= le2me_64(block[ 7]);
	hash[ 8] ^= le2me_64(block[ 8]);
	/* if not sha3-512 */
	if (block_size > 72) {
		hash[ 9] ^= le2me_64(block[ 9]);
		hash[10] ^= le2me_64(block[10]);
		hash[11] ^= le2me_64(block[11]);
		hash[12] ^= le2me_64(block[12]);
		/* if not sha3-384 */
		if (block_size > 104) {
			hash[13] ^= le2me_64(block[13]);
			hash[14] ^= le2me_64(block[14]);
			hash[15] ^= le2me_64(block[15]);
			hash[16] ^= le2me_64(block[16]);
			/* if not sha3-256 */
			if (block_size > 136) {
				hash[17] ^= le2me_64(block[17]);
#ifdef FULL_SHA3_FAMILY_SUPPORT
				/* if not sha3-224 */
				if (block_size > 144) {
					hash[18] ^= le2me_64(block[18]);
					hash[19] ^= le2me_64(block[19]);
					hash[20] ^= le2me_64(block[20]);
					hash[21] ^= le2me_64(block[21]);
					hash[22] ^= le2me_64(block[22]);
					hash[23] ^= le2me_64(block[23]);
					hash[24] ^= le2me_64(block[24]);
				}
#endif
			}
		}
	}
	/* make a permutation of the hash */
	rhash_sha3_permutation(hash);
}

#define SHA3_FINALIZED 0x80000000

/**
 * Calculate message hash.
 * Can be called repeatedly with chunks of the message to be hashed.
 *
 * @param ctx the algorithm context containing current hashing state
 * @param msg message chunk
 * @param size length of the message chunk
 */
void rhash_sha3_update(sha3_ctx *ctx, const unsigned char *msg, size_t size)
{
	size_t index = (size_t)ctx->rest;
	size_t block_size = (size_t)ctx->block_size;

	if (ctx->rest & SHA3_FINALIZED) return; /* too late for additional input */
	ctx->rest = (unsigned)((ctx->rest + size) % block_size);

	/* fill partial block */
	if (index) {
		size_t left = block_size - index;
		memcpy((char*)ctx->message + index, msg, (size < left ? size : left));
		if (size < left) return;

		/* process partial block */
		rhash_sha3_process_block(ctx->hash, ctx->message, block_size);
		msg  += left;
		size -= left;
	}
	while (size >= block_size) {
		uint64_t* aligned_message_block;
		if (IS_ALIGNED_64(msg)) {
			/* the most common case is processing of an already aligned message
			without copying it */
			aligned_message_block = (uint64_t*)msg;
		} else {
			memcpy(ctx->message, msg, block_size);
			aligned_message_block = ctx->message;
		}

		rhash_sha3_process_block(ctx->hash, aligned_message_block, block_size);
		msg  += block_size;
		size -= block_size;
	}
	if (size) {
		memcpy(ctx->message, msg, size); /* save leftovers */
	}
}

/**
 * Store calculated hash into the given array.
 *
 * @param ctx the algorithm context containing current hashing state
 * @param result calculated hash in binary form
 */
void rhash_sha3_final(sha3_ctx *ctx, unsigned char* result)
{
	size_t digest_length = 100 - ctx->block_size / 2;
	const size_t block_size = ctx->block_size;

	if (!(ctx->rest & SHA3_FINALIZED))
	{
		/* clear the rest of the data queue */
		memset((char*)ctx->message + ctx->rest, 0, block_size - ctx->rest);
		((char*)ctx->message)[ctx->rest] |= 0x06;
		((char*)ctx->message)[block_size - 1] |= 0x80;

		/* process final block */
		rhash_sha3_process_block(ctx->hash, ctx->message, block_size);
		ctx->rest = SHA3_FINALIZED; /* mark context as finalized */
	}

	assert(block_size > digest_length);
	if (result) me64_to_le_str(result, ctx->hash, digest_length);
}

#ifdef USE_KECCAK
/**
* Store calculated hash into the given array.
*
* @param ctx the algorithm context containing current hashing state
* @param result calculated hash in binary form
*/
void rhash_keccak_final(sha3_ctx *ctx, unsigned char* result)
{
	size_t digest_length = 100 - ctx->block_size / 2;
	const size_t block_size = ctx->block_size;

	if (!(ctx->rest & SHA3_FINALIZED))
	{
		/* clear the rest of the data queue */
		memset((char*)ctx->message + ctx->rest, 0, block_size - ctx->rest);
		((char*)ctx->message)[ctx->rest] |= 0x01;
		((char*)ctx->message)[block_size - 1] |= 0x80;

		/* process final block */
		rhash_sha3_process_block(ctx->hash, ctx->message, block_size);
		ctx->rest = SHA3_FINALIZED; /* mark context as finalized */
	}

	assert(block_size > digest_length);
	if (result) me64_to_le_str(result, ctx->hash, digest_length);
}
#endif /* USE_KECCAK */


================================================
FILE: benchmarks/sha3.h
================================================
/* sha3.h */
#ifndef RHASH_SHA3_H
#define RHASH_SHA3_H
#include "ustd.h"

#ifdef __cplusplus
extern "C" {
#endif

#define sha3_224_hash_size  28
#define sha3_256_hash_size  32
#define sha3_384_hash_size  48
#define sha3_512_hash_size  64
#define sha3_max_permutation_size 25
#define sha3_max_rate_in_qwords 24

/**
 * SHA3 Algorithm context.
 */
typedef struct sha3_ctx
{
	/* 1600 bits algorithm hashing state */
	uint64_t hash[sha3_max_permutation_size];
	/* 1536-bit buffer for leftovers */
	uint64_t message[sha3_max_rate_in_qwords];
	/* count of bytes in the message[] buffer */
	unsigned rest;
	/* size of a message block processed at once */
	unsigned block_size;
} sha3_ctx;

/* methods for calculating the hash function */

void rhash_sha3_224_init(sha3_ctx *ctx);
void rhash_sha3_256_init(sha3_ctx *ctx);
void rhash_sha3_384_init(sha3_ctx *ctx);
void rhash_sha3_512_init(sha3_ctx *ctx);
void rhash_sha3_update(sha3_ctx *ctx, const unsigned char* msg, size_t size);
void rhash_sha3_final(sha3_ctx *ctx, unsigned char* result);

#ifdef USE_KECCAK
#define rhash_keccak_224_init rhash_sha3_224_init
#define rhash_keccak_256_init rhash_sha3_256_init
#define rhash_keccak_384_init rhash_sha3_384_init
#define rhash_keccak_512_init rhash_sha3_512_init
#define rhash_keccak_update rhash_sha3_update
void rhash_keccak_final(sha3_ctx *ctx, unsigned char* result);
#endif

#ifdef __cplusplus
} /* extern "C" */
#endif /* __cplusplus */

#endif /* RHASH_SHA3_H */


================================================
FILE: benchmarks/sha512.c
================================================
/* sha512.c - an implementation of SHA-384/512 hash functions
 * based on FIPS 180-3 (Federal Information Processing Standart).
 *
 * Copyright: 2010-2012 Aleksey Kravchenko <rhash.admin@gmail.com>
 *
 * Permission is hereby granted,  free of charge,  to any person  obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction,  including without limitation
 * the rights to  use, copy, modify,  merge, publish, distribute, sublicense,
 * and/or sell copies  of  the Software,  and to permit  persons  to whom the
 * Software is furnished to do so.
 *
 * This program  is  distributed  in  the  hope  that it will be useful,  but
 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE.  Use this program  at  your own risk!
 */

#include <string.h>
#include "sha512.h"
#include "byte_order.h"

/* SHA-384 and SHA-512 constants for 80 rounds. These qwords represent
 * the first 64 bits of the fractional parts of the cube
 * roots of the first 80 prime numbers. */
static const uint64_t rhash_k512[80] = {
	I64(0x428a2f98d728ae22), I64(0x7137449123ef65cd), I64(0xb5c0fbcfec4d3b2f),
	I64(0xe9b5dba58189dbbc), I64(0x3956c25bf348b538), I64(0x59f111f1b605d019),
	I64(0x923f82a4af194f9b), I64(0xab1c5ed5da6d8118), I64(0xd807aa98a3030242),
	I64(0x12835b0145706fbe), I64(0x243185be4ee4b28c), I64(0x550c7dc3d5ffb4e2),
	I64(0x72be5d74f27b896f), I64(0x80deb1fe3b1696b1), I64(0x9bdc06a725c71235),
	I64(0xc19bf174cf692694), I64(0xe49b69c19ef14ad2), I64(0xefbe4786384f25e3),
	I64(0x0fc19dc68b8cd5b5), I64(0x240ca1cc77ac9c65), I64(0x2de92c6f592b0275),
	I64(0x4a7484aa6ea6e483), I64(0x5cb0a9dcbd41fbd4), I64(0x76f988da831153b5),
	I64(0x983e5152ee66dfab), I64(0xa831c66d2db43210), I64(0xb00327c898fb213f),
	I64(0xbf597fc7beef0ee4), I64(0xc6e00bf33da88fc2), I64(0xd5a79147930aa725),
	I64(0x06ca6351e003826f), I64(0x142929670a0e6e70), I64(0x27b70a8546d22ffc),
	I64(0x2e1b21385c26c926), I64(0x4d2c6dfc5ac42aed), I64(0x53380d139d95b3df),
	I64(0x650a73548baf63de), I64(0x766a0abb3c77b2a8), I64(0x81c2c92e47edaee6),
	I64(0x92722c851482353b), I64(0xa2bfe8a14cf10364), I64(0xa81a664bbc423001),
	I64(0xc24b8b70d0f89791), I64(0xc76c51a30654be30), I64(0xd192e819d6ef5218),
	I64(0xd69906245565a910), I64(0xf40e35855771202a), I64(0x106aa07032bbd1b8),
	I64(0x19a4c116b8d2d0c8), I64(0x1e376c085141ab53), I64(0x2748774cdf8eeb99),
	I64(0x34b0bcb5e19b48a8), I64(0x391c0cb3c5c95a63), I64(0x4ed8aa4ae3418acb),
	I64(0x5b9cca4f7763e373), I64(0x682e6ff3d6b2b8a3), I64(0x748f82ee5defb2fc),
	I64(0x78a5636f43172f60), I64(0x84c87814a1f0ab72), I64(0x8cc702081a6439ec),
	I64(0x90befffa23631e28), I64(0xa4506cebde82bde9), I64(0xbef9a3f7b2c67915),
	I64(0xc67178f2e372532b), I64(0xca273eceea26619c), I64(0xd186b8c721c0c207),
	I64(0xeada7dd6cde0eb1e), I64(0xf57d4f7fee6ed178), I64(0x06f067aa72176fba),
	I64(0x0a637dc5a2c898a6), I64(0x113f9804bef90dae), I64(0x1b710b35131c471b),
	I64(0x28db77f523047d84), I64(0x32caab7b40c72493), I64(0x3c9ebe0a15c9bebc),
	I64(0x431d67c49c100d4c), I64(0x4cc5d4becb3e42b6), I64(0x597f299cfc657e2a),
	I64(0x5fcb6fab3ad6faec), I64(0x6c44198c4a475817)
};

/* The SHA512/384 functions defined by FIPS 180-3, 4.1.3 */
/* Optimized version of Ch(x,y,z)=((x & y) | (~x & z)) */
#define Ch(x,y,z)  ((z) ^ ((x) & ((y) ^ (z))))
/* Optimized version of Maj(x,y,z)=((x & y) ^ (x & z) ^ (y & z)) */
#define Maj(x,y,z) (((x) & (y)) ^ ((z) & ((x) ^ (y))))

#define Sigma0(x) (ROTR64((x), 28) ^ ROTR64((x), 34) ^ ROTR64((x), 39))
#define Sigma1(x) (ROTR64((x), 14) ^ ROTR64((x), 18) ^ ROTR64((x), 41))
#define sigma0(x) (ROTR64((x),  1) ^ ROTR64((x),  8) ^ ((x) >> 7))
#define sigma1(x) (ROTR64((x), 19) ^ ROTR64((x), 61) ^ ((x) >> 6))

/* Recalculate element n-th of circular buffer W using formula
 *   W[n] = sigma1(W[n - 2]) + W[n - 7] + sigma0(W[n - 15]) + W[n - 16]; */
#define RECALCULATE_W(W,n) (W[n] += \
	(sigma1(W[(n - 2) & 15]) + W[(n - 7) & 15] + sigma0(W[(n - 15) & 15])))

#define ROUND(a,b,c,d,e,f,g,h,k,data) { \
	uint64_t T1 = h + Sigma1(e) + Ch(e,f,g) + k + (data); \
	d += T1, h = T1 + Sigma0(a) + Maj(a,b,c); }
#define ROUND_1_16(a,b,c,d,e,f,g,h,n) \
	ROUND(a,b,c,d,e,f,g,h, rhash_k512[n], W[n] = be2me_64(block[n]))
#define ROUND_17_80(a,b,c,d,e,f,g,h,n) \
	ROUND(a,b,c,d,e,f,g,h, k[n], RECALCULATE_W(W, n))

/**
 * Initialize context before calculating hash.
 *
 * @param ctx context to initialize
 */
void rhash_sha512_init(sha512_ctx *ctx)
{
	/* Initial values. These words were obtained by taking the first 32
	 * bits of the fractional parts of the square roots of the first
	 * eight prime numbers. */
	static const uint64_t SHA512_H0[8] = {
		I64(0x6a09e667f3bcc908), I64(0xbb67ae8584caa73b), I64(0x3c6ef372fe94f82b),
		I64(0xa54ff53a5f1d36f1), I64(0x510e527fade682d1), I64(0x9b05688c2b3e6c1f),
		I64(0x1f83d9abfb41bd6b), I64(0x5be0cd19137e2179)
	};

	ctx->length = 0;
	ctx->digest_length = sha512_hash_size;

	/* initialize algorithm state */
	memcpy(ctx->hash, SHA512_H0, sizeof(ctx->hash));
}

/**
 * Initialize context before calculaing hash.
 *
 * @param ctx context to initialize
 */
void rhash_sha384_init(struct sha512_ctx *ctx)
{
	/* Initial values from FIPS 180-3. These words were obtained by taking
	 * the first sixty-four bits of the fractional parts of the square
	 * roots of ninth through sixteenth prime numbers. */
	static const uint64_t SHA384_H0[8] = {
		I64(0xcbbb9d5dc1059ed8), I64(0x629a292a367cd507), I64(0x9159015a3070dd17),
		I64(0x152fecd8f70e5939), I64(0x67332667ffc00b31), I64(0x8eb44a8768581511),
		I64(0xdb0c2e0d64f98fa7), I64(0x47b5481dbefa4fa4)
	};

	ctx->length = 0;
	ctx->digest_length = sha384_hash_size;

	memcpy(ctx->hash, SHA384_H0, sizeof(ctx->hash));
}

/**
 * The core transformation. Process a 512-bit block.
 *
 * @param hash algorithm state
 * @param block the message block to process
 */
static void rhash_sha512_process_block(uint64_t hash[8], uint64_t block[16])
{
	uint64_t A, B, C, D, E, F, G, H;
	uint64_t W[16];
	const uint64_t *k;
	int i;

	A = hash[0], B = hash[1], C = hash[2], D = hash[3];
	E = hash[4], F = hash[5], G = hash[6], H = hash[7];

	/* Compute SHA using alternate Method: FIPS 180-3 6.1.3 */
	ROUND_1_16(A, B, C, D, E, F, G, H, 0);
	ROUND_1_16(H, A, B, C, D, E, F, G, 1);
	ROUND_1_16(G, H, A, B, C, D, E, F, 2);
	ROUND_1_16(F, G, H, A, B, C, D, E, 3);
	ROUND_1_16(E, F, G, H, A, B, C, D, 4);
	ROUND_1_16(D, E, F, G, H, A, B, C, 5);
	ROUND_1_16(C, D, E, F, G, H, A, B, 6);
	ROUND_1_16(B, C, D, E, F, G, H, A, 7);
	ROUND_1_16(A, B, C, D, E, F, G, H, 8);
	ROUND_1_16(H, A, B, C, D, E, F, G, 9);
	ROUND_1_16(G, H, A, B, C, D, E, F, 10);
	ROUND_1_16(F, G, H, A, B, C, D, E, 11);
	ROUND_1_16(E, F, G, H, A, B, C, D, 12);
	ROUND_1_16(D, E, F, G, H, A, B, C, 13);
	ROUND_1_16(C, D, E, F, G, H, A, B, 14);
	ROUND_1_16(B, C, D, E, F, G, H, A, 15);

	for (i = 16, k = &rhash_k512[16]; i < 80; i += 16, k += 16) {
		ROUND_17_80(A, B, C, D, E, F, G, H,  0);
		ROUND_17_80(H, A, B, C, D, E, F, G,  1);
		ROUND_17_80(G, H, A, B, C, D, E, F,  2);
		ROUND_17_80(F, G, H, A, B, C, D, E,  3);
		ROUND_17_80(E, F, G, H, A, B, C, D,  4);
		ROUND_17_80(D, E, F, G, H, A, B, C,  5);
		ROUND_17_80(C, D, E, F, G, H, A, B,  6);
		ROUND_17_80(B, C, D, E, F, G, H, A,  7);
		ROUND_17_80(A, B, C, D, E, F, G, H,  8);
		ROUND_17_80(H, A, B, C, D, E, F, G,  9);
		ROUND_17_80(G, H, A, B, C, D, E, F, 10);
		ROUND_17_80(F, G, H, A, B, C, D, E, 11);
		ROUND_17_80(E, F, G, H, A, B, C, D, 12);
		ROUND_17_80(D, E, F, G, H, A, B, C, 13);
		ROUND_17_80(C, D, E, F, G, H, A, B, 14);
		ROUND_17_80(B, C, D, E, F, G, H, A, 15);
	}

	hash[0] += A, hash[1] += B, hash[2] += C, hash[3] += D;
	hash[4] += E, hash[5] += F, hash[6] += G, hash[7] += H;
}

/**
 * Calculate message hash.
 * Can be called repeatedly with chunks of the message to be hashed.
 *
 * @param ctx the algorithm context containing current hashing state
 * @param msg message chunk
 * @param size length of the message chunk
 */
void rhash_sha512_update(sha512_ctx *ctx, const unsigned char *msg, size_t size)
{
	size_t index = (size_t)ctx->length & 127;
	ctx->length += size;

	/* fill partial block */
	if (index) {
		size_t left = sha512_block_size - index;
		memcpy((char*)ctx->message + index, msg, (size < left ? size : left));
		if (size < left) return;

		/* process partial block */
		rhash_sha512_process_block(ctx->hash, ctx->message);
		msg  += left;
		size -= left;
	}
	while (size >= sha512_block_size) {
		uint64_t* aligned_message_block;
		if (IS_ALIGNED_64(msg)) {
			/* the most common case is processing of an already aligned message
			without copying it */
			aligned_message_block = (uint64_t*)msg;
		} else {
			memcpy(ctx->message, msg, sha512_block_size);
			aligned_message_block = ctx->message;
		}

		rhash_sha512_process_block(ctx->hash, aligned_message_block);
		msg  += sha512_block_size;
		size -= sha512_block_size;
	}
	if (size) {
		memcpy(ctx->message, msg, size); /* save leftovers */
	}
}

/**
 * Store calculated hash into the given array.
 *
 * @param ctx the algorithm context containing current hashing state
 * @param result calculated hash in binary form
 */
void rhash_sha512_final(sha512_ctx *ctx, unsigned char* result)
{
	size_t index = ((unsigned)ctx->length & 127) >> 3;
	unsigned shift = ((unsigned)ctx->length & 7) * 8;

	/* pad message and process the last block */

	/* append the byte 0x80 to the message */
	ctx->message[index]   &= le2me_64( ~(I64(0xFFFFFFFFFFFFFFFF) << shift) );
	ctx->message[index++] ^= le2me_64( I64(0x80) << shift );

	/* if no room left in the message to store 128-bit message length */
	if (index >= 15) {
		if (index == 15) ctx->message[index] = 0;
		rhash_sha512_process_block(ctx->hash, ctx->message);
		index = 0;
	}
	while (index < 15) {
		ctx->message[index++] = 0;
	}
	ctx->message[15] = be2me_64(ctx->length << 3);
	rhash_sha512_process_block(ctx->hash, ctx->message);

	if (result) be64_copy(result, 0, ctx->hash, ctx->digest_length);
}


================================================
FILE: benchmarks/sha512.h
================================================
/* sha.h sha512 and sha384 hash functions */
#ifndef SHA512_H
#define SHA512_H

#if _MSC_VER >= 1300

# define int64_t __int64
# define int32_t __int32
# define int16_t __int16
# define int8_t  __int8
# define uint64_t unsigned __int64
# define uint32_t unsigned __int32
# define uint16_t unsigned __int16
# define uint8_t  unsigned __int8

/* disable warnings: The POSIX name for this item is deprecated. Use the ISO C++ conformant name. */
#pragma warning(disable : 4996)

#else /* _MSC_VER >= 1300 */

# include <stdint.h>
# include <unistd.h>

#endif /* _MSC_VER >= 1300 */

#ifdef __cplusplus
extern "C" {
#endif

#define sha512_block_size 128
#define sha512_hash_size  64
#define sha384_hash_size  48

/* algorithm context */
typedef struct sha512_ctx
{
	uint64_t message[16];   /* 1024-bit buffer for leftovers */
	uint64_t length;        /* number of processed bytes */
	uint64_t hash[8];       /* 512-bit algorithm internal hashing state */
	unsigned digest_length; /* length of the algorithm digest in bytes */
} sha512_ctx;

void rhash_sha384_init(sha512_ctx *ctx);
void rhash_sha512_init(sha512_ctx *ctx);
void rhash_sha512_update(sha512_ctx *ctx, const unsigned char* data, size_t length);
void rhash_sha512_final(sha512_ctx *ctx, unsigned char* result);

#ifdef __cplusplus
} /* extern "C" */
#endif /* __cplusplus */

#endif /* SHA512_H */


================================================
FILE: benchmarks/sip24-prng.h
================================================
/* Copyright (c) 2016 Vladimir Makarov <vmakarov@gcc.gnu.org>

   Permission is hereby granted, free of charge, to any person
   obtaining a copy of this software and associated documentation
   files (the "Software"), to deal in the Software without
   restriction, including without limitation the rights to use, copy,
   modify, merge, publish, distribute, sublicense, and/or sell copies
   of the Software, and to permit persons to whom the Software is
   furnished to do so, subject to the following conditions:

   The above copyright notice and this permission notice shall be
   included in all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
   SOFTWARE.
*/

/* Pseudo Random Number Generator (PRNG) based on SipHash24 designed
   by J.P. Aumasson and D.J. Bernstein.  The SipHash24 reference code
   (please see https://github.com/veorq/SipHash) adapted for our
   purposes.

   To use a generator call `init_sip24_prng` or
   `init_sip24_prng_with_seed` first, then call `get_sip24_prn` as much
   as you want to get a new PRN.  At the end of the PRNG use, call
   `finish_sip24_prng`.  You can change the default seed by calling
   `set_sip24_prng_seed`.

   The PRNG passes NIST Statistical Test Suite for Random and
   Pseudorandom Number Generators for Cryptographic Applications
   (version 2.2.1) with 1000 bitstreams each containing 1M bits.

   The generation of a new number takes about 11 CPU cycles on x86_64
   (Intel 4.2GHz i7-4790K), or speed of the generation is about 380M
   numbers per sec.  So it is pretty fast.  */

#ifndef __SIP24_PRNG__
#define __SIP24_PRNG__

#ifdef _MSC_VER
typedef unsigned __int32 uint32_t;
typedef unsigned __int64 uint64_t;
#else
#include <stdint.h>
#endif

#include <stdlib.h>

static inline uint64_t
_sip24_prng_rotl (uint64_t v, int c) {
  return (v << c) | (v >> (64 - c));
}

/* A Sip round */
static inline void
_sip24_prng_round (uint64_t *v0, uint64_t *v1, uint64_t *v2, uint64_t *v3) {
    *v0 += *v1; *v1 = _sip24_prng_rotl (*v1, 13);
    *v1 ^= *v0; *v0 = _sip24_prng_rotl (*v0, 32);
    *v2 += *v3; *v3 = _sip24_prng_rotl (*v3, 16);
    *v3 ^= *v2;
    *v0 += *v3; *v3 = _sip24_prng_rotl (*v3, 21);
    *v3 ^= *v0;
    *v2 += *v1; *v1 = _sip24_prng_rotl (*v1, 17);
    *v1 ^= *v2; *v2 = _sip24_prng_rotl (*v2, 32);
}

/* The number of intermediate and final Sip rounds.  */
#define cROUNDS 2
#define dROUNDS 4

/* Internal state of the PRNG.  */
static struct {
  int ind; /* position in the output */
  uint64_t v; /* current message from which PRNs were generated */
  /* The current PRNG state and parts of the recenty generated
     numbers.  */
  uint64_t init_state[4], state[4];
} _sip24_prng_state;

static inline void
_sip24_prng_gen (void)
{
  int i;

  for (i = 0; i < 4; i++)
    _sip24_prng_state.state[i] = _sip24_prng_state.init_state[i];
  _sip24_prng_state.state[3] ^= _sip24_prng_state.v;
  for (i = 0; i < cROUNDS; ++i)
    _sip24_prng_round (&_sip24_prng_state.state[0], &_sip24_prng_state.state[1],
		       &_sip24_prng_state.state[2], &_sip24_prng_state.state[3]);
  _sip24_prng_state.state[0] ^= _sip24_prng_state.v;
  for (i = 0; i < cROUNDS; ++i)
    _sip24_prng_round (&_sip24_prng_state.state[0], &_sip24_prng_state.state[1],
		       &_sip24_prng_state.state[2], &_sip24_prng_state.state[3]);
  _sip24_prng_state.state[2] ^= 0xff;
  for (i = 0; i < dROUNDS; ++i)
    _sip24_prng_round (&_sip24_prng_state.state[0], &_sip24_prng_state.state[1],
		       &_sip24_prng_state.state[2], &_sip24_prng_state.state[3]);
}

/* Some random numbers from Siphash24.  */
static const uint64_t _sip24_prng_nums[4] = {
  0x736f6d6570736575ULL, 0x646f72616e646f6dULL,
  0x6c7967656e657261ULL, 0x7465646279746573ULL,
};

/* Set the PRNG seed by K.  */
static inline void
set_sip24_prng_seed (uint64_t k[4]) {
  int i;
  
  for (i = 0; i < 4; i++)
    _sip24_prng_state.init_state[i] = k[i] ^ _sip24_prng_nums[i];
}

/* Initiate the PRNG with seed given by K.  */
static inline void
init_sip24_prng_with_seed (uint64_t k[4]) {
  set_sip24_prng_seed (k);
  _sip24_prng_state.v = 0;
  _sip24_prng_state.ind = 4;
}

/* Initiate the PRNG with some random seed.  */
static inline void
init_sip24_prng (void) {
  int i;
  uint64_t k[4];

  for (i = 0; i < 4; i++)
    k[i] = rand ();
  init_sip24_prng_with_seed (k);
}

/* Return the next pseudo-random number.  */
static inline uint64_t
get_sip24_prn (void)
{
  uint64_t res;
  
  for (;;) {
    if (_sip24_prng_state.ind < 4) {
      res = _sip24_prng_state.state[_sip24_prng_state.ind];
      _sip24_prng_state.ind++;
      return res;
    }
    _sip24_prng_state.ind = 0;
    _sip24_prng_gen ();
    _sip24_prng_state.v++; 
  }
}

/* Empty function for our PRNGs interface.  */
static inline void
finish_sip24_prng (void) {
}

#endif


================================================
FILE: benchmarks/siphash24.c
================================================
/*
   SipHash reference C implementation

   Copyright (c) 2012-2014 Jean-Philippe Aumasson
   <jeanphilippe.aumasson@gmail.com>
   Copyright (c) 2012-2014 Daniel J. Bernstein <djb@cr.yp.to>

   To the extent possible under law, the author(s) have dedicated all copyright
   and related and neighboring rights to this software to the public domain
   worldwide. This software is distributed without any warranty.

   You should have received a copy of the CC0 Public Domain Dedication along
   with
   this software. If not, see
   <http://creativecommons.org/publicdomain/zero/1.0/>.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* default: SipHash-2-4 */
#define cROUNDS 2
#define dROUNDS 4

#define ROTL(x, b) (uint64_t)(((x) << (b)) | ((x) >> (64 - (b))))

#define U32TO8_LE(p, v)                                                        \
  (p)[0] = (uint8_t)((v));                                                     \
  (p)[1] = (uint8_t)((v) >> 8);                                                \
  (p)[2] = (uint8_t)((v) >> 16);                                               \
  (p)[3] = (uint8_t)((v) >> 24);

#define U64TO8_LE(p, v)                                                        \
  U32TO8_LE((p), (uint32_t)((v)));                                             \
  U32TO8_LE((p) + 4, (uint32_t)((v) >> 32));

#define U8TO64_LE(p)                                                           \
  (((uint64_t)((p)[0])) | ((uint64_t)((p)[1]) << 8) |                          \
   ((uint64_t)((p)[2]) << 16) | ((uint64_t)((p)[3]) << 24) |                   \
   ((uint64_t)((p)[4]) << 32) | ((uint64_t)((p)[5]) << 40) |                   \
   ((uint64_t)((p)[6]) << 48) | ((uint64_t)((p)[7]) << 56))

#define SIPROUND                                                               \
  do {                                                                         \
    v0 += v1;                                                                  \
    v1 = ROTL(v1, 13);                                                         \
    v1 ^= v0;                                                                  \
    v0 = ROTL(v0, 32);                                                         \
    v2 += v3;                                                                  \
    v3 = ROTL(v3, 16);                                                         \
    v3 ^= v2;                                                                  \
    v0 += v3;                                                                  \
    v3 = ROTL(v3, 21);                                                         \
    v3 ^= v0;                                                                  \
    v2 += v1;                                                                  \
    v1 = ROTL(v1, 17);                                                         \
    v1 ^= v2;                                                                  \
    v2 = ROTL(v2, 32);                                                         \
  } while (0)

#ifdef DEBUG
#define TRACE                                                                  \
  do {                                                                         \
    printf("(%3d) v0 %08x %08x\n", (int)inlen, (uint32_t)(v0 >> 32),           \
           (uint32_t)v0);                                                      \
    printf("(%3d) v1 %08x %08x\n", (int)inlen, (uint32_t)(v1 >> 32),           \
           (uint32_t)v1);                                                      \
    printf("(%3d) v2 %08x %08x\n", (int)inlen, (uint32_t)(v2 >> 32),           \
           (uint32_t)v2);                                                      \
    printf("(%3d) v3 %08x %08x\n", (int)inlen, (uint32_t)(v3 >> 32),           \
           (uint32_t)v3);                                                      \
  } while (0)
#else
#define TRACE
#endif

int siphash(uint8_t *out, const uint8_t *in, uint64_t inlen, const uint8_t *k) {
  /* "somepseudorandomlygeneratedbytes" */
  uint64_t v0 = 0x736f6d6570736575ULL;
  uint64_t v1 = 0x646f72616e646f6dULL;
  uint64_t v2 = 0x6c7967656e657261ULL;
  uint64_t v3 = 0x7465646279746573ULL;
  uint64_t b;
  uint64_t k0 = U8TO64_LE(k);
  uint64_t k1 = U8TO64_LE(k + 8);
  uint64_t m;
  int i;
  const uint8_t *end = in + inlen - (inlen % sizeof(uint64_t));
  const int left = inlen & 7;
  b = ((uint64_t)inlen) << 56;
  v3 ^= k1;
  v2 ^= k0;
  v1 ^= k1;
  v0 ^= k0;

#ifdef DOUBLE
  v1 ^= 0xee;
#endif

  for (; in != end; in += 8) {
    m = U8TO64_LE(in);
    v3 ^= m;

    TRACE;
    for (i = 0; i < cROUNDS; ++i)
      SIPROUND;

    v0 ^= m;
  }

  switch (left) {
  case 7:
    b |= ((uint64_t)in[6]) << 48;
  case 6:
    b |= ((uint64_t)in[5]) << 40;
  case 5:
    b |= ((uint64_t)in[4]) << 32;
  case 4:
    b |= ((uint64_t)in[3]) << 24;
  case 3:
    b |= ((uint64_t)in[2]) << 16;
  case 2:
    b |= ((uint64_t)in[1]) << 8;
  case 1:
    b |= ((uint64_t)in[0]);
    break;
  case 0:
    break;
  }

  v3 ^= b;

  TRACE;
  for (i = 0; i < cROUNDS; ++i)
    SIPROUND;

  v0 ^= b;

#ifndef DOUBLE
  v2 ^= 0xff;
#else
  v2 ^= 0xee;
#endif

  TRACE;
  for (i = 0; i < dROUNDS; ++i)
    SIPROUND;

  b = v0 ^ v1 ^ v2 ^ v3;
  U64TO8_LE(out, b);

#ifdef DOUBLE
  v1 ^= 0xdd;

  TRACE;
  for (i = 0; i < dROUNDS; ++i)
    SIPROUND;

  b = v0 ^ v1 ^ v2 ^ v3;
  U64TO8_LE(out + 8, b);
#endif

  return 0;
}


================================================
FILE: benchmarks/splitmix64.c
================================================
/*  Written in 2015 by Sebastiano Vigna (vigna@acm.org)

To the extent possible under law, the author has dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.

See <http://creativecommons.org/publicdomain/zero/1.0/>. */

#include <stdint.h>

/* This is a fixed-increment version of Java 8's SplittableRandom generator
   See http://dx.doi.org/10.1145/2714064.2660195 and 
   http://docs.oracle.com/javase/8/docs/api/java/util/SplittableRandom.html

   It is a very fast generator passing BigCrush, and it can be useful if
   for some reason you absolutely want 64 bits of state; otherwise, we
   rather suggest to use a xoroshiro128+ (for moderately parallel
   computations) or xorshift1024* (for massively parallel computations)
   generator. */

static uint64_t x; /* The state can be seeded with any value. */

uint64_t next() {
	uint64_t z = (x += 0x9e3779b97f4a7c15);
	z = (z ^ (z >> 30)) * 0xbf58476d1ce4e5b9;
	z = (z ^ (z >> 27)) * 0x94d049bb133111eb;
	return z ^ (z >> 31);
}


================================================
FILE: benchmarks/t1ha/src/t1ha0.c
================================================
/*
 *  Copyright (c) 2016-2020 Positive Technologies, https://www.ptsecurity.com,
 *  Fast Positive Hash.
 *
 *  Portions Copyright (c) 2010-2020 Leonid Yuriev <leo@yuriev.ru>,
 *  The 1Hippeus project (t1h).
 *
 *  This software is provided 'as-is', without any express or implied
 *  warranty. In no event will the authors be held liable for any damages
 *  arising from the use of this software.
 *
 *  Permission is granted to anyone to use this software for any purpose,
 *  including commercial applications, and to alter it and redistribute it
 *  freely, subject to the following restrictions:
 *
 *  1. The origin of this software must not be misrepresented; you must not
 *     claim that you wrote the original software. If you use this software
 *     in a product, an acknowledgement in the product documentation would be
 *     appreciated but is not required.
 *  2. Altered source versions must be plainly marked as such, and must not be
 *     misrepresented as being the original software.
 *  3. This notice may not be removed or altered from any source distribution.
 */

/*
 * t1ha = { Fast Positive Hash, aka "Позитивный Хэш" }
 * by [Positive Technologies](https://www.ptsecurity.ru)
 *
 * Briefly, it is a 64-bit Hash Function:
 *  1. Created for 64-bit little-endian platforms, in predominantly for x86_64,
 *     but portable and without penalties it can run on any 64-bit CPU.
 *  2. In most cases up to 15% faster than City64, xxHash, mum-hash, metro-hash
 *     and all others portable hash-functions (which do not use specific
 *     hardware tricks).
 *  3. Not suitable for cryptography.
 *
 * The Future will (be) Positive. Всё будет хорошо.
 *
 * ACKNOWLEDGEMENT:
 * The t1ha was originally developed by Leonid Yuriev (Леонид Юрьев)
 * for The 1Hippeus project - zerocopy messaging in the spirit of Sparta!
 */

#ifndef T1HA0_DISABLED
#include "t1ha_bits.h"
#include "t1ha_selfcheck.h"

static __maybe_unused __always_inline uint32_t tail32_le_aligned(const void *v,
                                                                 size_t tail) {
  const uint8_t *const p = (const uint8_t *)v;
#if T1HA_USE_FAST_ONESHOT_READ && !defined(__SANITIZE_ADDRESS__)
  /* We can perform a 'oneshot' read, which is little bit faster. */
  const unsigned shift = ((4 - tail) & 3) << 3;
  return fetch32_le_aligned(p) & ((~UINT32_C(0)) >> shift);
#else
  uint32_t r = 0;
  switch (tail & 3) {
  default:
    unreachable();
/* fall through */
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  /* For most CPUs this code is better when not needed
   * copying for alignment or byte reordering. */
  case 0:
    return fetch32_le_aligned(p);
  case 3:
    r = (uint32_t)p[2] << 16;
  /* fall through */
  case 2:
    return r + fetch16_le_aligned(p);
  case 1:
    return p[0];
#else
  case 0:
    r += p[3];
    r <<= 8;
  /* fall through */
  case 3:
    r += p[2];
    r <<= 8;
  /* fall through */
  case 2:
    r += p[1];
    r <<= 8;
  /* fall through */
  case 1:
    return r + p[0];
#endif
  }
#endif /* T1HA_USE_FAST_ONESHOT_READ */
}

static __maybe_unused __always_inline uint32_t
tail32_le_unaligned(const void *v, size_t tail) {
  const uint8_t *p = (const uint8_t *)v;
#ifdef can_read_underside
  /* On some systems (e.g. x86) we can perform a 'oneshot' read, which
   * is little bit faster. Thanks Marcin Żukowski <marcin.zukowski@gmail.com>
   * for the reminder. */
  const unsigned offset = (4 - tail) & 3;
  const unsigned shift = offset << 3;
  if (likely(can_read_underside(p, 4))) {
    p -= offset;
    return fetch32_le_unaligned(p) >> shift;
  }
  return fetch32_le_unaligned(p) & ((~UINT32_C(0)) >> shift);
#else
  uint32_t r = 0;
  switch (tail & 3) {
  default:
    unreachable();
/* fall through */
#if T1HA_SYS_UNALIGNED_ACCESS == T1HA_UNALIGNED_ACCESS__EFFICIENT &&           \
    __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  /* For most CPUs this code is better when not needed
   * copying for alignment or byte reordering. */
  case 0:
    return fetch32_le_unaligned(p);
  case 3:
    r = (uint32_t)p[2] << 16;
  /* fall through */
  case 2:
    return r + fetch16_le_unaligned(p);
  case 1:
    return p[0];
#else
  /* For most CPUs this code is better than a
   * copying for alignment and/or byte reordering. */
  case 0:
    r += p[3];
    r <<= 8;
  /* fall through */
  case 3:
    r += p[2];
    r <<= 8;
  /* fall through */
  case 2:
    r += p[1];
    r <<= 8;
  /* fall through */
  case 1:
    return r + p[0];
#endif
  }
#endif /* can_read_underside */
}

static __maybe_unused __always_inline uint32_t tail32_be_aligned(const void *v,
                                                                 size_t tail) {
  const uint8_t *const p = (const uint8_t *)v;
#if T1HA_USE_FAST_ONESHOT_READ && !defined(__SANITIZE_ADDRESS__)
  /* We can perform a 'oneshot' read, which is little bit faster. */
  const unsigned shift = ((4 - tail) & 3) << 3;
  return fetch32_be_aligned(p) >> shift;
#else
  switch (tail & 3) {
  default:
    unreachable();
/* fall through */
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  /* For most CPUs this code is better when not needed
   * copying for alignment or byte reordering. */
  case 1:
    return p[0];
  case 2:
    return fetch16_be_aligned(p);
  case 3:
    return fetch16_be_aligned(p) << 8 | p[2];
  case 0:
    return fetch32_be_aligned(p);
#else
  case 1:
    return p[0];
  case 2:
    return p[1] | (uint32_t)p[0] << 8;
  case 3:
    return p[2] | (uint32_t)p[1] << 8 | (uint32_t)p[0] << 16;
  case 0:
    return p[3] | (uint32_t)p[2] << 8 | (uint32_t)p[1] << 16 |
           (uint32_t)p[0] << 24;
#endif
  }
#endif /* T1HA_USE_FAST_ONESHOT_READ */
}

static __maybe_unused __always_inline uint32_t
tail32_be_unaligned(const void *v, size_t tail) {
  const uint8_t *p = (const uint8_t *)v;
#ifdef can_read_underside
  /* On some systems we can perform a 'oneshot' read, which is little bit
   * faster. Thanks Marcin Żukowski <marcin.zukowski@gmail.com> for the
   * reminder. */
  const unsigned offset = (4 - tail) & 3;
  const unsigned shift = offset << 3;
  if (likely(can_read_underside(p, 4))) {
    p -= offset;
    return fetch32_be_unaligned(p) & ((~UINT32_C(0)) >> shift);
  }
  return fetch32_be_unaligned(p) >> shift;
#else
  switch (tail & 3) {
  default:
    unreachable();
/* fall through */
#if T1HA_SYS_UNALIGNED_ACCESS == T1HA_UNALIGNED_ACCESS__EFFICIENT &&           \
    __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  /* For most CPUs this code is better when not needed
   * copying for alignment or byte reordering. */
  case 1:
    return p[0];
  case 2:
    return fetch16_be_unaligned(p);
  case 3:
    return fetch16_be_unaligned(p) << 8 | p[2];
  case 0:
    return fetch32_be_unaligned(p);
#else
  /* For most CPUs this code is better than a
   * copying for alignment and/or byte reordering. */
  case 1:
    return p[0];
  case 2:
    return p[1] | (uint32_t)p[0] << 8;
  case 3:
    return p[2] | (uint32_t)p[1] << 8 | (uint32_t)p[0] << 16;
  case 0:
    return p[3] | (uint32_t)p[2] << 8 | (uint32_t)p[1] << 16 |
           (uint32_t)p[0] << 24;
#endif
  }
#endif /* can_read_underside */
}

/***************************************************************************/

#ifndef rot32
static __maybe_unused __always_inline uint32_t rot32(uint32_t v, unsigned s) {
  return (v >> s) | (v << (32 - s));
}
#endif /* rot32 */

static __always_inline void mixup32(uint32_t *a, uint32_t *b, uint32_t v,
                                    uint32_t prime) {
  uint64_t l = mul_32x32_64(*b + v, prime);
  *a ^= (uint32_t)l;
  *b += (uint32_t)(l >> 32);
}

static __always_inline uint64_t final32(uint32_t a, uint32_t b) {
  uint64_t l = (b ^ rot32(a, 13)) | (uint64_t)a << 32;
  l *= prime_0;
  l ^= l >> 41;
  l *= prime_4;
  l ^= l >> 47;
  l *= prime_6;
  return l;
}

/* 32-bit 'magic' primes */
static const uint32_t prime32_0 = UINT32_C(0x92D78269);
static const uint32_t prime32_1 = UINT32_C(0xCA9B4735);
static const uint32_t prime32_2 = UINT32_C(0xA4ABA1C3);
static const uint32_t prime32_3 = UINT32_C(0xF6499843);
static const uint32_t prime32_4 = UINT32_C(0x86F0FD61);
static const uint32_t prime32_5 = UINT32_C(0xCA2DA6FB);
static const uint32_t prime32_6 = UINT32_C(0xC4BB3575);

/* TODO: C++ template in the next version */
#define T1HA0_BODY(ENDIANNES, ALIGNESS)                                        \
  const uint32_t *v = (const uint32_t *)data;                                  \
  if (unlikely(len > 16)) {                                                    \
    uint32_t c = ~a;                                                           \
    uint32_t d = rot32(b, 5);                                                  \
    const uint32_t *detent =                                                   \
        (const uint32_t *)((const uint8_t *)data + len - 15);                  \
    do {                                                                       \
      const uint32_t w0 = fetch32_##ENDIANNES##_##ALIGNESS(v + 0);             \
      const uint32_t w1 = fetch32_##ENDIANNES##_##ALIGNESS(v + 1);             \
      const uint32_t w2 = fetch32_##ENDIANNES##_##ALIGNESS(v + 2);             \
      const uint32_t w3 = fetch32_##ENDIANNES##_##ALIGNESS(v + 3);             \
      v += 4;                                                                  \
      prefetch(v);                                                             \
                                                                               \
      const uint32_t d13 = w1 + rot32(w3 + d, 17);                             \
      const uint32_t c02 = w0 ^ rot32(w2 + c, 11);                             \
      d ^= rot32(a + w0, 3);                                                   \
      c ^= rot32(b + w1, 7);                                                   \
      b = prime32_1 * (c02 + w3);                                              \
      a = prime32_0 * (d13 ^ w2);                                              \
    } while (likely(v < detent));                                              \
                                                                               \
    c += a;                                                                    \
    d += b;                                                                    \
    a ^= prime32_6 * (rot32(c, 16) + d);                                       \
    b ^= prime32_5 * (c + rot32(d, 16));                                       \
                                                                               \
    len &= 15;                                                                 \
  }                                                                            \
                                                                               \
  switch (len) {                                                               \
  default:                                                                     \
    mixup32(&a, &b, fetch32_##ENDIANNES##_##ALIGNESS(v++), prime32_4);         \
  /* fall through */                                                           \
  case 12:                                                                     \
  case 11:                                                                     \
  case 10:                                                                     \
  case 9:                                                                      \
    mixup32(&b, &a, fetch32_##ENDIANNES##_##ALIGNESS(v++), prime32_3);         \
  /* fall through */                                                           \
  case 8:                                                                      \
  case 7:                                                                      \
  case 6:                                                                      \
  case 5:                                                                      \
    mixup32(&a, &b, fetch32_##ENDIANNES##_##ALIGNESS(v++), prime32_2);         \
  /* fall through */                                                           \
  case 4:                                                                      \
  case 3:                                                                      \
  case 2:                                                                      \
  case 1:                                                                      \
    mixup32(&b, &a, tail32_##ENDIANNES##_##ALIGNESS(v, len), prime32_1);       \
  /* fall through */                                                           \
  case 0:                                                                      \
    return final32(a, b);                                                      \
  }

uint64_t t1ha0_32le(const void *data, size_t len, uint64_t seed) {
  uint32_t a = rot32((uint32_t)len, 17) + (uint32_t)seed;
  uint32_t b = (uint32_t)len ^ (uint32_t)(seed >> 32);

#if T1HA_SYS_UNALIGNED_ACCESS == T1HA_UNALIGNED_ACCESS__EFFICIENT
  T1HA0_BODY(le, unaligned);
#else
  const bool misaligned = (((uintptr_t)data) & (ALIGNMENT_32 - 1)) != 0;
  if (misaligned) {
    T1HA0_BODY(le, unaligned);
  } else {
    T1HA0_BODY(le, aligned);
  }
#endif
}

uint64_t t1ha0_32be(const void *data, size_t len, uint64_t seed) {
  uint32_t a = rot32((uint32_t)len, 17) + (uint32_t)seed;
  uint32_t b = (uint32_t)len ^ (uint32_t)(seed >> 32);

#if T1HA_SYS_UNALIGNED_ACCESS == T1HA_UNALIGNED_ACCESS__EFFICIENT
  T1HA0_BODY(be, unaligned);
#else
  const bool misaligned = (((uintptr_t)data) & (ALIGNMENT_32 - 1)) != 0;
  if (misaligned) {
    T1HA0_BODY(be, unaligned);
  } else {
    T1HA0_BODY(be, aligned);
  }
#endif
}

/***************************************************************************/

#if T1HA0_AESNI_AVAILABLE && defined(__ia32__)
__cold uint64_t t1ha_ia32cpu_features(void) {
  uint32_t features = 0;
  uint32_t extended = 0;
#ifdef __GNUC__
  uint32_t eax, ebx, ecx, edx;
  const unsigned cpuid_max = __get_cpuid_max(0, NULL);
  if (cpuid_max >= 1) {
    __cpuid_count(1, 0, eax, ebx, features, edx);
    if (cpuid_max >= 7)
      __cpuid_count(7, 0, eax, extended, ecx, edx);
  }
#elif defined(_MSC_VER)
  int info[4];
  __cpuid(info, 0);
  const unsigned cpuid_max = info[0];
  if (cpuid_max >= 1) {
    __cpuidex(info, 1, 0);
    features = info[2];
    if (cpuid_max >= 7) {
      __cpuidex(info, 7, 0);
      extended = info[1];
    }
  }
#endif
  return features | (uint64_t)extended << 32;
}
#endif /* T1HA0_AESNI_AVAILABLE && __ia32__ */

#if T1HA0_RUNTIME_SELECT

__cold t1ha0_function_t t1ha0_resolve(void) {

#if T1HA0_AESNI_AVAILABLE && defined(__ia32__)
  uint64_t features = t1ha_ia32cpu_features();
  if (t1ha_ia32_AESNI_avail(features)) {
    if (t1ha_ia32_AVX_avail(features))
      return t1ha_ia32_AVX2_avail(features) ? t1ha0_ia32aes_avx2
                                            : t1ha0_ia32aes_avx;
    return t1ha0_ia32aes_noavx;
  }
#endif /* T1HA0_AESNI_AVAILABLE && __ia32__ */

#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#if (UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul) &&                \
    (!defined(T1HA1_DISABLED) || !defined(T1HA2_DISABLED))
#ifndef T1HA1_DISABLED
  return t1ha1_be;
#else
  return t1ha2_atonce;
#endif /* T1HA1_DISABLED */
#else
  return t1ha0_32be;
#endif
#else /* __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__ */
#if (UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul) &&                \
    (!defined(T1HA1_DISABLED) || !defined(T1HA2_DISABLED))
#ifndef T1HA1_DISABLED
  return t1ha1_le;
#else
  return t1ha2_atonce;
#endif /* T1HA1_DISABLED */
#else
  return t1ha0_32le;
#endif
#endif /* __BYTE_ORDER__ */
}

#if T1HA_USE_INDIRECT_FUNCTIONS
/* Use IFUNC (GNU ELF indirect functions) to choice implementation at runtime.
 * For more info please see
 * https://en.wikipedia.org/wiki/Executable_and_Linkable_Format
 * and https://sourceware.org/glibc/wiki/GNU_IFUNC */
#if __has_attribute(__ifunc__)
uint64_t t1ha0(const void *data, size_t len, uint64_t seed)
    __attribute__((__ifunc__("t1ha0_resolve")));
#else
__asm("\t.globl\tt1ha0\n\t.type\tt1ha0, "
      "%gnu_indirect_function\n\t.set\tt1ha0,t1ha0_resolve");
#endif /* __has_attribute(__ifunc__) */

#elif __GNUC_PREREQ(4, 0) || __has_attribute(__constructor__)

uint64_t (*t1ha0_funcptr)(const void *, size_t, uint64_t);

static __cold void __attribute__((__constructor__)) t1ha0_init(void) {
  t1ha0_funcptr = t1ha0_resolve();
}

#else /* T1HA_USE_INDIRECT_FUNCTIONS */

static __cold uint64_t t1ha0_proxy(const void *data, size_t len,
                                   uint64_t seed) {
  t1ha0_funcptr = t1ha0_resolve();
  return t1ha0_funcptr(data, len, seed);
}

uint64_t (*t1ha0_funcptr)(const void *, size_t, uint64_t) = t1ha0_proxy;

#endif /* !T1HA_USE_INDIRECT_FUNCTIONS */
#endif /* T1HA0_RUNTIME_SELECT */

#endif /* T1HA0_DISABLED */


================================================
FILE: benchmarks/t1ha/src/t1ha0_ia32aes_a.h
================================================
/*
 *  Copyright (c) 2016-2020 Positive Technologies, https://www.ptsecurity.com,
 *  Fast Positive Hash.
 *
 *  Portions Copyright (c) 2010-2020 Leonid Yuriev <leo@yuriev.ru>,
 *  The 1Hippeus project (t1h).
 *
 *  This software is provided 'as-is', without any express or implied
 *  warranty. In no event will the authors be held liable for any damages
 *  arising from the use of this software.
 *
 *  Permission is granted to anyone to use this software for any purpose,
 *  including commercial applications, and to alter it and redistribute it
 *  freely, subject to the following restrictions:
 *
 *  1. The origin of this software must not be misrepresented; you must not
 *     claim that you wrote the original software. If you use this software
 *     in a product, an acknowledgement in the product documentation would be
 *     appreciated but is not required.
 *  2. Altered source versions must be plainly marked as such, and must not be
 *     misrepresented as being the original software.
 *  3. This notice may not be removed or altered from any source distribution.
 */

/*
 * t1ha = { Fast Positive Hash, aka "Позитивный Хэш" }
 * by [Positive Technologies](https://www.ptsecurity.ru)
 *
 * Briefly, it is a 64-bit Hash Function:
 *  1. Created for 64-bit little-endian platforms, in predominantly for x86_64,
 *     but portable and without penalties it can run on any 64-bit CPU.
 *  2. In most cases up to 15% faster than City64, xxHash, mum-hash, metro-hash
 *     and all others portable hash-functions (which do not use specific
 *     hardware tricks).
 *  3. Not suitable for cryptography.
 *
 * The Future will (be) Positive. Всё будет хорошо.
 *
 * ACKNOWLEDGEMENT:
 * The t1ha was originally developed by Leonid Yuriev (Леонид Юрьев)
 * for The 1Hippeus project - zerocopy messaging in the spirit of Sparta!
 */

#include "t1ha_bits.h"
#include "t1ha_selfcheck.h"

#if T1HA0_AESNI_AVAILABLE

uint64_t T1HA_IA32AES_NAME(const void *data, size_t len, uint64_t seed) {
  uint64_t a = seed;
  uint64_t b = len;

  if (unlikely(len > 32)) {
    __m128i x = _mm_set_epi64x(a, b);
    __m128i y = _mm_aesenc_si128(x, _mm_set_epi64x(prime_5, prime_6));

    const __m128i *__restrict v = (const __m128i *)data;
    const __m128i *__restrict const detent =
        (const __m128i *)((const uint8_t *)data + len - 127);

    while (v < detent) {
      __m128i v0 = _mm_loadu_si128(v + 0);
      __m128i v1 = _mm_loadu_si128(v + 1);
      __m128i v2 = _mm_loadu_si128(v + 2);
      __m128i v3 = _mm_loadu_si128(v + 3);
      __m128i v4 = _mm_loadu_si128(v + 4);
      __m128i v5 = _mm_loadu_si128(v + 5);
      __m128i v6 = _mm_loadu_si128(v + 6);
      __m128i v7 = _mm_loadu_si128(v + 7);

      __m128i v0y = _mm_aesenc_si128(v0, y);
      __m128i v2x6 = _mm_aesenc_si128(v2, _mm_xor_si128(x, v6));
      __m128i v45_67 =
          _mm_xor_si128(_mm_aesenc_si128(v4, v5), _mm_add_epi64(v6, v7));

      __m128i v0y7_1 = _mm_aesdec_si128(_mm_sub_epi64(v7, v0y), v1);
      __m128i v2x6_3 = _mm_aesenc_si128(v2x6, v3);

      x = _mm_aesenc_si128(v45_67, _mm_add_epi64(x, y));
      y = _mm_aesenc_si128(v2x6_3, _mm_xor_si128(v0y7_1, v5));
      v += 8;
    }

    if (len & 64) {
      __m128i v0y = _mm_add_epi64(y, _mm_loadu_si128(v++));
      __m128i v1x = _mm_sub_epi64(x, _mm_loadu_si128(v++));
      x = _mm_aesdec_si128(x, v0y);
      y = _mm_aesdec_si128(y, v1x);

      __m128i v2y = _mm_add_epi64(y, _mm_loadu_si128(v++));
      __m128i v3x = _mm_sub_epi64(x, _mm_loadu_si128(v++));
      x = _mm_aesdec_si128(x, v2y);
      y = _mm_aesdec_si128(y, v3x);
    }

    if (len & 32) {
      __m128i v0y = _mm_add_epi64(y, _mm_loadu_si128(v++));
      __m128i v1x = _mm_sub_epi64(x, _mm_loadu_si128(v++));
      x = _mm_aesdec_si128(x, v0y);
      y = _mm_aesdec_si128(y, v1x);
    }

    if (len & 16) {
      y = _mm_add_epi64(x, y);
      x = _mm_aesdec_si128(x, _mm_loadu_si128(v++));
    }

    x = _mm_add_epi64(_mm_aesdec_si128(x, _mm_aesenc_si128(y, x)), y);
#if defined(__x86_64__) || defined(_M_X64)
#if defined(__SSE4_1__) || defined(__AVX__)
    a = _mm_extract_epi64(x, 0);
    b = _mm_extract_epi64(x, 1);
#else
    a = _mm_cvtsi128_si64(x);
    b = _mm_cvtsi128_si64(_mm_unpackhi_epi64(x, x));
#endif
#else
#if defined(__SSE4_1__) || defined(__AVX__)
    a = (uint32_t)_mm_extract_epi32(x, 0) | (uint64_t)_mm_extract_epi32(x, 1)
                                                << 32;
    b = (uint32_t)_mm_extract_epi32(x, 2) | (uint64_t)_mm_extract_epi32(x, 3)
                                                << 32;
#else
    a = (uint32_t)_mm_cvtsi128_si32(x);
    a |= (uint64_t)_mm_cvtsi128_si32(_mm_shuffle_epi32(x, 1)) << 32;
    x = _mm_unpackhi_epi64(x, x);
    b = (uint32_t)_mm_cvtsi128_si32(x);
    b |= (uint64_t)_mm_cvtsi128_si32(_mm_shuffle_epi32(x, 1)) << 32;
#endif
#endif
#ifdef __AVX__
    _mm256_zeroupper();
#elif !(defined(_X86_64_) || defined(__x86_64__) || defined(_M_X64) ||         \
        defined(__e2k__))
    _mm_empty();
#endif
    data = v;
    len &= 15;
  }

  const uint64_t *v = (const uint64_t *)data;
  switch (len) {
  default:
    mixup64(&a, &b, fetch64_le_unaligned(v++), prime_4);
  /* fall through */
  case 24:
  case 23:
  case 22:
  case 21:
  case 20:
  case 19:
  case 18:
  case 17:
    mixup64(&b, &a, fetch64_le_unaligned(v++), prime_3);
  /* fall through */
  case 16:
  case 15:
  case 14:
  case 13:
  case 12:
  case 11:
  case 10:
  case 9:
    mixup64(&a, &b, fetch64_le_unaligned(v++), prime_2);
  /* fall through */
  case 8:
  case 7:
  case 6:
  case 5:
  case 4:
  case 3:
  case 2:
  case 1:
    mixup64(&b, &a, tail64_le_unaligned(v, len), prime_1);
  /* fall through */
  case 0:
    return final64(a, b);
  }
}

#endif /* T1HA0_AESNI_AVAILABLE */
#undef T1HA_IA32AES_NAME


================================================
FILE: benchmarks/t1ha/src/t1ha0_ia32aes_avx.c
================================================
#ifndef T1HA0_DISABLED
#define T1HA_IA32AES_NAME t1ha0_ia32aes_avx
#include "t1ha0_ia32aes_a.h"
#endif /* T1HA0_DISABLED */


================================================
FILE: benchmarks/t1ha/src/t1ha0_ia32aes_avx2.c
================================================
#ifndef T1HA0_DISABLED
#define T1HA_IA32AES_NAME t1ha0_ia32aes_avx2
#include "t1ha0_ia32aes_b.h"
#endif /* T1HA0_DISABLED */


================================================
FILE: benchmarks/t1ha/src/t1ha0_ia32aes_b.h
================================================
/*
 *  Copyright (c) 2016-2020 Positive Technologies, https://www.ptsecurity.com,
 *  Fast Positive Hash.
 *
 *  Portions Copyright (c) 2010-2020 Leonid Yuriev <leo@yuriev.ru>,
 *  The 1Hippeus project (t1h).
 *
 *  This software is provided 'as-is', without any express or implied
 *  warranty. In no event will the authors be held liable for any damages
 *  arising from the use of this software.
 *
 *  Permission is granted to anyone to use this software for any purpose,
 *  including commercial applications, and to alter it and redistribute it
 *  freely, subject to the following restrictions:
 *
 *  1. The origin of this software must not be misrepresented; you must not
 *     claim that you wrote the original software. If you use this software
 *     in a product, an acknowledgement in the product documentation would be
 *     appreciated but is not required.
 *  2. Altered source versions must be plainly marked as such, and must not be
 *     misrepresented as being the original software.
 *  3. This notice may not be removed or altered from any source distribution.
 */

/*
 * t1ha = { Fast Positive Hash, aka "Позитивный Хэш" }
 * by [Positive Technologies](https://www.ptsecurity.ru)
 *
 * Briefly, it is a 64-bit Hash Function:
 *  1. Created for 64-bit little-endian platforms, in predominantly for x86_64,
 *     but portable and without penalties it can run on any 64-bit CPU.
 *  2. In most cases up to 15% faster than City64, xxHash, mum-hash, metro-hash
 *     and all others portable hash-functions (which do not use specific
 *     hardware tricks).
 *  3. Not suitable for cryptography.
 *
 * The Future will (be) Positive. Всё будет хорошо.
 *
 * ACKNOWLEDGEMENT:
 * The t1ha was originally developed by Leonid Yuriev (Леонид Юрьев)
 * for The 1Hippeus project - zerocopy messaging in the spirit of Sparta!
 */

#include "t1ha_bits.h"
#include "t1ha_selfcheck.h"

#if T1HA0_AESNI_AVAILABLE

uint64_t T1HA_IA32AES_NAME(const void *data, size_t len, uint64_t seed) {
  uint64_t a = seed;
  uint64_t b = len;

  if (unlikely(len > 32)) {
    __m128i x = _mm_set_epi64x(a, b);
    __m128i y = _mm_aesenc_si128(x, _mm_set_epi64x(prime_0, prime_1));

    const __m128i *v = (const __m128i *)data;
    const __m128i *const detent =
        (const __m128i *)((const uint8_t *)data + (len & ~15ul));
    data = detent;

    if (len & 16) {
      x = _mm_add_epi64(x, _mm_loadu_si128(v++));
      y = _mm_aesenc_si128(x, y);
    }
    len &= 15;

    if (v + 7 < detent) {
      __m128i salt = y;
      do {
        __m128i t = _mm_aesenc_si128(_mm_loadu_si128(v++), salt);
        t = _mm_aesdec_si128(t, _mm_loadu_si128(v++));
        t = _mm_aesdec_si128(t, _mm_loadu_si128(v++));
        t = _mm_aesdec_si128(t, _mm_loadu_si128(v++));

        t = _mm_aesdec_si128(t, _mm_loadu_si128(v++));
        t = _mm_aesdec_si128(t, _mm_loadu_si128(v++));
        t = _mm_aesdec_si128(t, _mm_loadu_si128(v++));
        t = _mm_aesdec_si128(t, _mm_loadu_si128(v++));

        salt = _mm_add_epi64(salt, _mm_set_epi64x(prime_5, prime_6));
        t = _mm_aesenc_si128(x, t);
        x = _mm_add_epi64(y, x);
        y = t;
      } while (v + 7 < detent);
    }

    while (v < detent) {
      __m128i v0y = _mm_add_epi64(y, _mm_loadu_si128(v++));
      __m128i v1x = _mm_sub_epi64(x, _mm_loadu_si128(v++));
      x = _mm_aesdec_si128(x, v0y);
      y = _mm_aesdec_si128(y, v1x);
    }

    x = _mm_add_epi64(_mm_aesdec_si128(x, _mm_aesenc_si128(y, x)), y);
#if defined(__x86_64__) || defined(_M_X64)
#if defined(__SSE4_1__) || defined(__AVX__)
    a = _mm_extract_epi64(x, 0);
    b = _mm_extract_epi64(x, 1);
#else
    a = _mm_cvtsi128_si64(x);
    b = _mm_cvtsi128_si64(_mm_unpackhi_epi64(x, x));
#endif
#else
#if defined(__SSE4_1__) || defined(__AVX__)
    a = (uint32_t)_mm_extract_epi32(x, 0) | (uint64_t)_mm_extract_epi32(x, 1)
                                                << 32;
    b = (uint32_t)_mm_extract_epi32(x, 2) | (uint64_t)_mm_extract_epi32(x, 3)
                                                << 32;
#else
    a = (uint32_t)_mm_cvtsi128_si32(x);
    a |= (uint64_t)_mm_cvtsi128_si32(_mm_shuffle_epi32(x, 1)) << 32;
    x = _mm_unpackhi_epi64(x, x);
    b = (uint32_t)_mm_cvtsi128_si32(x);
    b |= (uint64_t)_mm_cvtsi128_si32(_mm_shuffle_epi32(x, 1)) << 32;
#endif
#endif
#ifdef __AVX__
    _mm256_zeroupper();
#elif !(defined(_X86_64_) || defined(__x86_64__) || defined(_M_X64) ||         \
        defined(__e2k__))
    _mm_empty();
#endif
  }

  const uint64_t *v = (const uint64_t *)data;
  switch (len) {
  default:
    mixup64(&a, &b, fetch64_le_unaligned(v++), prime_4);
  /* fall through */
  case 24:
  case 23:
  case 22:
  case 21:
  case 20:
  case 19:
  case 18:
  case 17:
    mixup64(&b, &a, fetch64_le_unaligned(v++), prime_3);
  /* fall through */
  case 16:
  case 15:
  case 14:
  case 13:
  case 12:
  case 11:
  case 10:
  case 9:
    mixup64(&a, &b, fetch64_le_unaligned(v++), prime_2);
  /* fall through */
  case 8:
  case 7:
  case 6:
  case 5:
  case 4:
  case 3:
  case 2:
  case 1:
    mixup64(&b, &a, tail64_le_unaligned(v, len), prime_1);
  /* fall through */
  case 0:
    return final64(a, b);
  }
}

#endif /* T1HA0_AESNI_AVAILABLE */
#undef T1HA_IA32AES_NAME


================================================
FILE: benchmarks/t1ha/src/t1ha0_ia32aes_noavx.c
================================================
#ifndef T1HA0_DISABLED
#define T1HA_IA32AES_NAME t1ha0_ia32aes_noavx
#include "t1ha0_ia32aes_a.h"
#endif /* T1HA0_DISABLED */


================================================
FILE: benchmarks/t1ha/src/t1ha0_selfcheck.c
================================================
/*
 *  Copyright (c) 2016-2020 Positive Technologies, https://www.ptsecurity.com,
 *  Fast Positive Hash.
 *
 *  Portions Copyright (c) 2010-2020 Leonid Yuriev <leo@yuriev.ru>,
 *  The 1Hippeus project (t1h).
 *
 *  This software is provided 'as-is', without any express or implied
 *  warranty. In no event will the authors be held liable for any damages
 *  arising from the use of this software.
 *
 *  Permission is granted to anyone to use this software for any purpose,
 *  including commercial applications, and to alter it and redistribute it
 *  freely, subject to the following restrictions:
 *
 *  1. The origin of this software must not be misrepresented; you must not
 *     claim that you wrote the original software. If you use this software
 *     in a product, an acknowledgement in the product documentation would be
 *     appreciated but is not required.
 *  2. Altered source versions must be plainly marked as such, and must not be
 *     misrepresented as being the original software.
 *  3. This notice may not be removed or altered from any source distribution.
 */

/*
 * t1ha = { Fast Positive Hash, aka "Позитивный Хэш" }
 * by [Positive Technologies](https://www.ptsecurity.ru)
 *
 * Briefly, it is a 64-bit Hash Function:
 *  1. Created for 64-bit little-endian platforms, in predominantly for x86_64,
 *     but portable and without penalties it can run on any 64-bit CPU.
 *  2. In most cases up to 15% faster than City64, xxHash, mum-hash, metro-hash
 *     and all others portable hash-functions (which do not use specific
 *     hardware tricks).
 *  3. Not suitable for cryptography.
 *
 * The Future will (be) Positive. Всё будет хорошо.
 *
 * ACKNOWLEDGEMENT:
 * The t1ha was originally developed by Leonid Yuriev (Леонид Юрьев)
 * for The 1Hippeus project - zerocopy messaging in the spirit of Sparta!
 */

#ifndef T1HA0_DISABLED
#include "t1ha_bits.h"
#include "t1ha_selfcheck.h"

/* *INDENT-OFF* */
/* clang-format off */

const uint64_t t1ha_refval_32le[81] = { 0,
  0xC92229C10FAEA50E, 0x3DF1354B0DFDC443, 0x968F016D60417BB3, 0x85AAFB50C6DA770F,
  0x66CCE3BB6842C7D6, 0xDDAA39C11537C226, 0x35958D281F0C9C8C, 0x8C5D64B091DE608E,
  0x4094DF680D39786B, 0x1014F4AA2A2EDF4D, 0x39D21891615AA310, 0x7EF51F67C398C7C4,
  0x06163990DDBF319D, 0xE229CAA00C8D6F3F, 0xD2240B4B0D54E0F5, 0xEA2E7E905DDEAF94,
  0x8D4F8A887183A5CE, 0x44337F9A63C5820C, 0x94938D1E86A9B797, 0x96E9CABA5CA210CC,
  0x6EFBB9CC9E8F7708, 0x3D12EA0282FB8BBC, 0x5DA781EE205A2C48, 0xFA4A51A12677FE12,
  0x81D5F04E20660B28, 0x57258D043BCD3841, 0x5C9BEB62059C1ED2, 0x57A02162F9034B33,
  0xBA2A13E457CE19B8, 0xE593263BF9451F3A, 0x0BC1175539606BC5, 0xA3E2929E9C5F289F,
  0x86BDBD06835E35F7, 0xA180950AB48BAADC, 0x7812C994D9924028, 0x308366011415F46B,
  0x77FE9A9991C5F959, 0x925C340B70B0B1E3, 0xCD9C5BA4C41E2E10, 0x7CC4E7758B94CD93,
  0x898B235962EA4625, 0xD7E3E5BF22893286, 0x396F4CDD33056C64, 0x740AB2E32F17CD9F,
  0x60D12FF9CD15B321, 0xBEE3A6C9903A81D8, 0xB47040913B33C35E, 0x19EE8C2ACC013CFF,
  0x5DEC94C5783B55C4, 0x78DC122D562C5F1D, 0x6520F008DA1C181E, 0x77CAF155A36EBF7C,
  0x0A09E02BDB883CA6, 0xFD5D9ADA7E3FB895, 0xC6F5FDD9EEAB83B5, 0x84589BB29F52A92A,
  0x9B2517F13F8E9814, 0x6F752AF6A52E31EC, 0x8E717799E324CE8A, 0x84D90AEF39262D58,
  0x79C27B13FC28944D, 0xE6D6DF6438E0044A, 0x51B603E400D79CA4, 0x6A902B28C588B390,
  0x8D7F8DE9E6CB1D83, 0xCF1A4DC11CA7F044, 0xEF02E43C366786F1, 0x89915BCDBCFBE30F,
  0x5928B306F1A9CC7F, 0xA8B59092996851C5, 0x22050A20427E8B25, 0x6E6D64018941E7EE,
  0x9798C898B81AE846, 0x80EF218CDC30124A, 0xFCE45E60D55B0284, 0x4010E735D3147C35,
  0xEB647D999FD8DC7E, 0xD3544DCAB14FE907, 0xB588B27D8438700C, 0xA49EBFC43E057A4C
};

const uint64_t t1ha_refval_32be[81] = { 0,
  0xC92229C10FAEA50E, 0x0FE212630DD87E0F, 0x968F016D60417BB3, 0xE6B12B2C889913AB,
  0xAA3787887A9DA368, 0x06EE7202D53CEF39, 0x6149AFB2C296664B, 0x86C893210F9A5805,
  0x8379E5DA988AA04C, 0x24763AA7CE411A60, 0x9CF9C64B395A4CF8, 0xFFC192C338DDE904,
  0x094575BAB319E5F5, 0xBBBACFE7728C6511, 0x36B8C3CEBE4EF409, 0xAA0BA8A3397BA4D0,
  0xF9F85CF7124EE653, 0x3ADF4F7DF2A887AE, 0xAA2A0F5964AA9A7A, 0xF18B563F42D36EB8,
  0x034366CEF8334F5C, 0xAE2E85180E330E5F, 0xA5CE9FBFDF5C65B8, 0x5E509F25A9CA9B0B,
  0xE30D1358C2013BD2, 0xBB3A04D5EB8111FE, 0xB04234E82A15A28D, 0x87426A56D0EA0E2F,
  0x095086668E07F9F8, 0xF4CD3A43B6A6AEA5, 0x73F9B9B674D472A6, 0x558344229A1E4DCF,
  0x0AD4C95B2279181A, 0x5E3D19D80821CA6B, 0x652492D25BEBA258, 0xEFA84B02EAB849B1,
  0x81AD2D253059AC2C, 0x1400CCB0DFB2F457, 0x5688DC72A839860E, 0x67CC130E0FD1B0A7,
  0x0A851E3A94E21E69, 0x2EA0000B6A073907, 0xAE9776FF9BF1D02E, 0xC0A96B66B160631C,
  0xA93341DE4ED7C8F0, 0x6FBADD8F5B85E141, 0xB7D295F1C21E0CBA, 0x6D6114591B8E434F,
  0xF5B6939B63D97BE7, 0x3C80D5053F0E5DB4, 0xAC520ACC6B73F62D, 0xD1051F5841CF3966,
  0x62245AEA644AE760, 0x0CD56BE15497C62D, 0x5BB93435C4988FB6, 0x5FADB88EB18DB512,
  0xC897CAE2242475CC, 0xF1A094EF846DC9BB, 0x2B1D8B24924F79B6, 0xC6DF0C0E8456EB53,
  0xE6A40128303A9B9C, 0x64D37AF5EFFA7BD9, 0x90FEB70A5AE2A598, 0xEC3BA5F126D9FF4B,
  0x3121C8EC3AC51B29, 0x3B41C4D422166EC1, 0xB4878DDCBF48ED76, 0x5CB850D77CB762E4,
  0x9A27A43CC1DD171F, 0x2FDFFC6F99CB424A, 0xF54A57E09FDEA7BB, 0x5F78E5EE2CAB7039,
  0xB8BA95883DB31CBA, 0x131C61EB84AF86C3, 0x84B1F64E9C613DA7, 0xE94C1888C0C37C02,
  0xEA08F8BFB2039CDE, 0xCCC6D04D243EC753, 0x8977D105298B0629, 0x7AAA976494A5905E
};

#if T1HA0_AESNI_AVAILABLE
const uint64_t t1ha_refval_ia32aes_a[81] = { 0,
  0x772C7311BE32FF42, 0xB231AC660E5B23B5, 0x71F6DF5DA3B4F532, 0x555859635365F660,
  0xE98808F1CD39C626, 0x2EB18FAF2163BB09, 0x7B9DD892C8019C87, 0xE2B1431C4DA4D15A,
  0x1984E718A5477F70, 0x08DD17B266484F79, 0x4C83A05D766AD550, 0x92DCEBB131D1907D,
  0xD67BC6FC881B8549, 0xF6A9886555FBF66B, 0x6E31616D7F33E25E, 0x36E31B7426E3049D,
  0x4F8E4FAF46A13F5F, 0x03EB0CB3253F819F, 0x636A7769905770D2, 0x3ADF3781D16D1148,
  0x92D19CB1818BC9C2, 0x283E68F4D459C533, 0xFA83A8A88DECAA04, 0x8C6F00368EAC538C,
  0x7B66B0CF3797B322, 0x5131E122FDABA3FF, 0x6E59FF515C08C7A9, 0xBA2C5269B2C377B0,
  0xA9D24FD368FE8A2B, 0x22DB13D32E33E891, 0x7B97DFC804B876E5, 0xC598BDFCD0E834F9,
  0xB256163D3687F5A7, 0x66D7A73C6AEF50B3, 0xBB34C6A4396695D2, 0x7F46E1981C3256AD,
  0x4B25A9B217A6C5B4, 0x7A0A6BCDD2321DA9, 0x0A1F55E690A7B44E, 0x8F451A91D7F05244,
  0x624D5D3C9B9800A7, 0x09DDC2B6409DDC25, 0x3E155765865622B6, 0x96519FAC9511B381,
  0x512E58482FE4FBF0, 0x1AB260EA7D54AE1C, 0x67976F12CC28BBBD, 0x0607B5B2E6250156,
  0x7E700BEA717AD36E, 0x06A058D9D61CABB3, 0x57DA5324A824972F, 0x1193BA74DBEBF7E7,
  0xC18DC3140E7002D4, 0x9F7CCC11DFA0EF17, 0xC487D6C20666A13A, 0xB67190E4B50EF0C8,
  0xA53DAA608DF0B9A5, 0x7E13101DE87F9ED3, 0x7F8955AE2F05088B, 0x2DF7E5A097AD383F,
  0xF027683A21EA14B5, 0x9BB8AEC3E3360942, 0x92BE39B54967E7FE, 0x978C6D332E7AFD27,
  0xED512FE96A4FAE81, 0x9E1099B8140D7BA3, 0xDFD5A5BE1E6FE9A6, 0x1D82600E23B66DD4,
  0x3FA3C3B7EE7B52CE, 0xEE84F7D2A655EF4C, 0x2A4361EC769E3BEB, 0x22E4B38916636702,
  0x0063096F5D39A115, 0x6C51B24DAAFA5434, 0xBAFB1DB1B411E344, 0xFF529F161AE0C4B0,
  0x1290EAE3AC0A686F, 0xA7B0D4585447D1BE, 0xAED3D18CB6CCAD53, 0xFC73D46F8B41BEC6
};

const uint64_t t1ha_refval_ia32aes_b[81] = { 0,
  0x772C7311BE32FF42, 0x4398F62A8CB6F72A, 0x71F6DF5DA3B4F532, 0x555859635365F660,
  0xE98808F1CD39C626, 0x2EB18FAF2163BB09, 0x7B9DD892C8019C87, 0xE2B1431C4DA4D15A,
  0x1984E718A5477F70, 0x08DD17B266484F79, 0x4C83A05D766AD550, 0x92DCEBB131D1907D,
  0xD67BC6FC881B8549, 0xF6A9886555FBF66B, 0x6E31616D7F33E25E, 0x36E31B7426E3049D,
  0x4F8E4FAF46A13F5F, 0x03EB0CB3253F819F, 0x636A7769905770D2, 0x3ADF3781D16D1148,
  0x92D19CB1818BC9C2, 0x283E68F4D459C533, 0xFA83A8A88DECAA04, 0x8C6F00368EAC538C,
  0x7B66B0CF3797B322, 0x5131E122FDABA3FF, 0x6E59FF515C08C7A9, 0xBA2C5269B2C377B0,
  0xA9D24FD368FE8A2B, 0x22DB13D32E33E891, 0x7B97DFC804B876E5, 0xC598BDFCD0E834F9,
  0xB256163D3687F5A7, 0x66D7A73C6AEF50B3, 0xE810F88E85CEA11A, 0x4814F8F3B83E4394,
  0x9CABA22D10A2F690, 0x0D10032511F58111, 0xE9A36EF5EEA3CD58, 0xC79242DE194D9D7C,
  0xC3871AA0435EE5C8, 0x52890BED43CCF4CD, 0x07A1D0861ACCD373, 0x227B816FF0FEE9ED,
  0x59FFBF73AACFC0C4, 0x09AB564F2BEDAD0C, 0xC05F744F2EE38318, 0x7B50B621D547C661,
  0x0C1F71CB4E68E5D1, 0x0E33A47881D4DBAA, 0xF5C3BF198E9A7C2E, 0x16328FD8C0F68A91,
  0xA3E399C9AB3E9A59, 0x163AE71CBCBB18B8, 0x18F17E4A8C79F7AB, 0x9250E2EA37014B45,
  0x7BBBB111D60B03E4, 0x3DAA4A3071A0BD88, 0xA28828D790A2D6DC, 0xBC70FC88F64BE3F1,
  0xA3E48008BA4333C7, 0x739E435ACAFC79F7, 0x42BBB360BE007CC6, 0x4FFB6FD2AF74EC92,
  0x2A799A2994673146, 0xBE0A045B69D48E9F, 0x549432F54FC6A278, 0x371D3C60369FC702,
  0xDB4557D415B08CA7, 0xE8692F0A83850B37, 0x022E46AEB36E9AAB, 0x117AC9B814E4652D,
  0xA361041267AE9048, 0x277CB51C961C3DDA, 0xAFFC96F377CB8A8D, 0x83CC79FA01DD1BA7,
  0xA494842ACF4B802C, 0xFC6D9CDDE2C34A3F, 0x4ED6863CE455F7A7, 0x630914D0DB7AAE98
};
#endif /* T1HA0_AESNI_AVAILABLE */

/* *INDENT-ON* */
/* clang-format on */

__cold int t1ha_selfcheck__t1ha0_32le(void) {
  return t1ha_selfcheck(t1ha0_32le, t1ha_refval_32le);
}

__cold int t1ha_selfcheck__t1ha0_32be(void) {
  return t1ha_selfcheck(t1ha0_32be, t1ha_refval_32be);
}

#if T1HA0_AESNI_AVAILABLE
__cold int t1ha_selfcheck__t1ha0_ia32aes_noavx(void) {
  return t1ha_selfcheck(t1ha0_ia32aes_noavx, t1ha_refval_ia32aes_a);
}

__cold int t1ha_selfcheck__t1ha0_ia32aes_avx(void) {
  return t1ha_selfcheck(t1ha0_ia32aes_avx, t1ha_refval_ia32aes_a);
}

#ifndef __e2k__
__cold int t1ha_selfcheck__t1ha0_ia32aes_avx2(void) {
  return t1ha_selfcheck(t1ha0_ia32aes_avx2, t1ha_refval_ia32aes_b);
}
#endif /* ! __e2k__ */
#endif /* if T1HA0_AESNI_AVAILABLE */

__cold int t1ha_selfcheck__t1ha0(void) {
  int rc = t1ha_selfcheck__t1ha0_32le() | t1ha_selfcheck__t1ha0_32be();

#if (UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul) &&                \
    (!defined(T1HA1_DISABLED) || !defined(T1HA2_DISABLED))
#if defined(T1HA1_DISABLED)
  rc |= t1ha_selfcheck__t1ha2();
#else
  rc |= t1ha_selfcheck__t1ha1();
#endif /* T1HA1_DISABLED */
#endif /* 32/64 */

#if T1HA0_AESNI_AVAILABLE
#ifdef __e2k__
  rc |= t1ha_selfcheck__t1ha0_ia32aes_noavx();
  rc |= t1ha_selfcheck__t1ha0_ia32aes_avx();
#else
  uint64_t features = t1ha_ia32cpu_features();
  if (t1ha_ia32_AESNI_avail(features)) {
    rc |= t1ha_selfcheck__t1ha0_ia32aes_noavx();
    if (t1ha_ia32_AVX_avail(features)) {
      rc |= t1ha_selfcheck__t1ha0_ia32aes_avx();
      if (t1ha_ia32_AVX2_avail(features))
        rc |= t1ha_selfcheck__t1ha0_ia32aes_avx2();
    }
  }
#endif /* __e2k__ */
#endif /* T1HA0_AESNI_AVAILABLE */

  return rc;
}

#endif /* T1HA0_DISABLED */


================================================
FILE: benchmarks/t1ha/src/t1ha1.c
================================================
/*
 *  Copyright (c) 2016-2020 Positive Technologies, https://www.ptsecurity.com,
 *  Fast Positive Hash.
 *
 *  Portions Copyright (c) 2010-2020 Leonid Yuriev <leo@yuriev.ru>,
 *  The 1Hippeus project (t1h).
 *
 *  This software is provided 'as-is', without any express or implied
 *  warranty. In no event will the authors be held liable for any damages
 *  arising from the use of this software.
 *
 *  Permission is granted to anyone to use this software for any purpose,
 *  including commercial applications, and to alter it and redistribute it
 *  freely, subject to the following restrictions:
 *
 *  1. The origin of this software must not be misrepresented; you must not
 *     claim that you wrote the original software. If you use this software
 *     in a product, an acknowledgement in the product documentation would be
 *     appreciated but is not required.
 *  2. Altered source versions must be plainly marked as such, and must not be
 *     misrepresented as being the original software.
 *  3. This notice may not be removed or altered from any source distribution.
 */

/*
 * t1ha = { Fast Positive Hash, aka "Позитивный Хэш" }
 * by [Positive Technologies](https://www.ptsecurity.ru)
 *
 * Briefly, it is a 64-bit Hash Function:
 *  1. Created for 64-bit little-endian platforms, in predominantly for x86_64,
 *     but portable and without penalties it can run on any 64-bit CPU.
 *  2. In most cases up to 15% faster than City64, xxHash, mum-hash, metro-hash
 *     and all others portable hash-functions (which do not use specific
 *     hardware tricks).
 *  3. Not suitable for cryptography.
 *
 * The Future will (be) Positive. Всё будет хорошо.
 *
 * ACKNOWLEDGEMENT:
 * The t1ha was originally developed by Leonid Yuriev (Леонид Юрьев)
 * for The 1Hippeus project - zerocopy messaging in the spirit of Sparta!
 */

#ifndef T1HA1_DISABLED
#include "t1ha_bits.h"
#include "t1ha_selfcheck.h"

/* xor-mul-xor mixer */
static __inline uint64_t mix64(uint64_t v, uint64_t p) {
  v *= p;
  return v ^ rot64(v, 41);
}

static __inline uint64_t final_weak_avalanche(uint64_t a, uint64_t b) {
  /* LY: for performance reason on a some not high-end CPUs
   * I replaced the second mux64() operation by mix64().
   * Unfortunately this approach fails the "strict avalanche criteria",
   * see test results at https://github.com/demerphq/smhasher. */
  return mux64(rot64(a + b, 17), prime_4) + mix64(a ^ b, prime_0);
}

/* TODO: C++ template in the next version */
#define T1HA1_BODY(ENDIANNES, ALIGNESS)                                        \
  const uint64_t *v = (const uint64_t *)data;                                  \
  if (unlikely(len > 32)) {                                                    \
    uint64_t c = rot64(len, 17) + seed;                                        \
    uint64_t d = len ^ rot64(seed, 17);                                        \
    const uint64_t *detent =                                                   \
        (const uint64_t *)((const uint8_t *)data + len - 31);                  \
    do {                                                                       \
      const uint64_t w0 = fetch64_##ENDIANNES##_##ALIGNESS(v + 0);             \
      const uint64_t w1 = fetch64_##ENDIANNES##_##ALIGNESS(v + 1);             \
      const uint64_t w2 = fetch64_##ENDIANNES##_##ALIGNESS(v + 2);             \
      const uint64_t w3 = fetch64_##ENDIANNES##_##ALIGNESS(v + 3);             \
      v += 4;                                                                  \
      prefetch(v);                                                             \
                                                                               \
      const uint64_t d02 = w0 ^ rot64(w2 + d, 17);                             \
      const uint64_t c13 = w1 ^ rot64(w3 + c, 17);                             \
      d -= b ^ rot64(w1, 31);                                                  \
      c += a ^ rot64(w0, 41);                                                  \
      b ^= prime_0 * (c13 + w2);                                               \
      a ^= prime_1 * (d02 + w3);                                               \
    } while (likely(v < detent));                                              \
                                                                               \
    a ^= prime_6 * (rot64(c, 17) + d);                                         \
    b ^= prime_5 * (c + rot64(d, 17));                                         \
    len &= 31;                                                                 \
  }                                                                            \
                                                                               \
  switch (len) {                                                               \
  default:                                                                     \
    b += mux64(fetch64_##ENDIANNES##_##ALIGNESS(v++), prime_4);                \
  /* fall through */                                                           \
  case 24:                                                                     \
  case 23:                                                                     \
  case 22:                                                                     \
  case 21:                                                                     \
  case 20:                                                                     \
  case 19:                                                                     \
  case 18:                                                                     \
  case 17:                                                                     \
    a += mux64(fetch64_##ENDIANNES##_##ALIGNESS(v++), prime_3);                \
  /* fall through */                                                           \
  case 16:                                                                     \
  case 15:                                                                     \
  case 14:                                                                     \
  case 13:                                                                     \
  case 12:                                                                     \
  case 11:                                                                     \
  case 10:                                                                     \
  case 9:                                                                      \
    b += mux64(fetch64_##ENDIANNES##_##ALIGNESS(v++), prime_2);                \
  /* fall through */                                                           \
  case 8:                                                                      \
  case 7:                                                                      \
  case 6:                                                                      \
  case 5:                                                                      \
  case 4:                                                                      \
  case 3:                                                                      \
  case 2:                                                                      \
  case 1:                                                                      \
    a += mux64(tail64_##ENDIANNES##_##ALIGNESS(v, len), prime_1);              \
  /* fall through */                                                           \
  case 0:                                                                      \
    return final_weak_avalanche(a, b);                                         \
  }

uint64_t t1ha1_le(const void *data, size_t len, uint64_t seed) {
  uint64_t a = seed;
  uint64_t b = len;

#if T1HA_SYS_UNALIGNED_ACCESS == T1HA_UNALIGNED_ACCESS__EFFICIENT
  T1HA1_BODY(le, unaligned);
#else
  const bool misaligned = (((uintptr_t)data) & (ALIGNMENT_64 - 1)) != 0;
  if (misaligned) {
    T1HA1_BODY(le, unaligned);
  } else {
    T1HA1_BODY(le, aligned);
  }
#endif
}

uint64_t t1ha1_be(const void *data, size_t len, uint64_t seed) {
  uint64_t a = seed;
  uint64_t b = len;

#if T1HA_SYS_UNALIGNED_ACCESS == T1HA_UNALIGNED_ACCESS__EFFICIENT
  T1HA1_BODY(be, unaligned);
#else
  const bool misaligned = (((uintptr_t)data) & (ALIGNMENT_64 - 1)) != 0;
  if (misaligned) {
    T1HA1_BODY(be, unaligned);
  } else {
    T1HA1_BODY(be, aligned);
  }
#endif
}

#endif /* T1HA1_DISABLED */


================================================
FILE: benchmarks/t1ha/src/t1ha1_selfcheck.c
================================================
/*
 *  Copyright (c) 2016-2020 Positive Technologies, https://www.ptsecurity.com,
 *  Fast Positive Hash.
 *
 *  Portions Copyright (c) 2010-2020 Leonid Yuriev <leo@yuriev.ru>,
 *  The 1Hippeus project (t1h).
 *
 *  This software is provided 'as-is', without any express or implied
 *  warranty. In no event will the authors be held liable for any damages
 *  arising from the use of this software.
 *
 *  Permission is granted to anyone to use this software for any purpose,
 *  including commercial applications, and to alter it and redistribute it
 *  freely, subject to the following restrictions:
 *
 *  1. The origin of this software must not be misrepresented; you must not
 *     claim that you wrote the original software. If you use this software
 *     in a product, an acknowledgement in the product documentation would be
 *     appreciated but is not required.
 *  2. Altered source versions must be plainly marked as such, and must not be
 *     misrepresented as being the original software.
 *  3. This notice may not be removed or altered from any source distribution.
 */

/*
 * t1ha = { Fast Positive Hash, aka "Позитивный Хэш" }
 * by [Positive Technologies](https://www.ptsecurity.ru)
 *
 * Briefly, it is a 64-bit Hash Function:
 *  1. Created for 64-bit little-endian platforms, in predominantly for x86_64,
 *     but portable and without penalties it can run on any 64-bit CPU.
 *  2. In most cases up to 15% faster than City64, xxHash, mum-hash, metro-hash
 *     and all others portable hash-functions (which do not use specific
 *     hardware tricks).
 *  3. Not suitable for cryptography.
 *
 * The Future will (be) Positive. Всё будет хорошо.
 *
 * ACKNOWLEDGEMENT:
 * The t1ha was originally developed by Leonid Yuriev (Леонид Юрьев)
 * for The 1Hippeus project - zerocopy messaging in the spirit of Sparta!
 */

#ifndef T1HA1_DISABLED
#include "t1ha_bits.h"
#include "t1ha_selfcheck.h"

/* *INDENT-OFF* */
/* clang-format off */

const uint64_t t1ha_refval_64le[81] = { 0,
  0x6A580668D6048674, 0xA2FE904AFF0D0879, 0xE3AB9C06FAF4D023, 0x6AF1C60874C95442,
  0xB3557E561A6C5D82, 0x0AE73C696F3D37C0, 0x5EF25F7062324941, 0x9B784F3B4CE6AF33,
  0x6993BB206A74F070, 0xF1E95DF109076C4C, 0x4E1EB70C58E48540, 0x5FDD7649D8EC44E4,
  0x559122C706343421, 0x380133D58665E93D, 0x9CE74296C8C55AE4, 0x3556F9A5757AB6D0,
  0xF62751F7F25C469E, 0x851EEC67F6516D94, 0xED463EE3848A8695, 0xDC8791FEFF8ED3AC,
  0x2569C744E1A282CF, 0xF90EB7C1D70A80B9, 0x68DFA6A1B8050A4C, 0x94CCA5E8210D2134,
  0xF5CC0BEABC259F52, 0x40DBC1F51618FDA7, 0x0807945BF0FB52C6, 0xE5EF7E09DE70848D,
  0x63E1DF35FEBE994A, 0x2025E73769720D5A, 0xAD6120B2B8A152E1, 0x2A71D9F13959F2B7,
  0x8A20849A27C32548, 0x0BCBC9FE3B57884E, 0x0E028D255667AEAD, 0xBE66DAD3043AB694,
  0xB00E4C1238F9E2D4, 0x5C54BDE5AE280E82, 0x0E22B86754BC3BC4, 0x016707EBF858B84D,
  0x990015FBC9E095EE, 0x8B9AF0A3E71F042F, 0x6AA56E88BD380564, 0xAACE57113E681A0F,
  0x19F81514AFA9A22D, 0x80DABA3D62BEAC79, 0x715210412CABBF46, 0xD8FA0B9E9D6AA93F,
  0x6C2FC5A4109FD3A2, 0x5B3E60EEB51DDCD8, 0x0A7C717017756FE7, 0xA73773805CA31934,
  0x4DBD6BB7A31E85FD, 0x24F619D3D5BC2DB4, 0x3E4AF35A1678D636, 0x84A1A8DF8D609239,
  0x359C862CD3BE4FCD, 0xCF3A39F5C27DC125, 0xC0FF62F8FD5F4C77, 0x5E9F2493DDAA166C,
  0x17424152BE1CA266, 0xA78AFA5AB4BBE0CD, 0x7BFB2E2CEF118346, 0x647C3E0FF3E3D241,
  0x0352E4055C13242E, 0x6F42FC70EB660E38, 0x0BEBAD4FABF523BA, 0x9269F4214414D61D,
  0x1CA8760277E6006C, 0x7BAD25A859D87B5D, 0xAD645ADCF7414F1D, 0xB07F517E88D7AFB3,
  0xB321C06FB5FFAB5C, 0xD50F162A1EFDD844, 0x1DFD3D1924FBE319, 0xDFAEAB2F09EF7E78,
  0xA7603B5AF07A0B1E, 0x41CD044C0E5A4EE3, 0xF64D2F86E813BF33, 0xFF9FDB99305EB06A
};

const uint64_t t1ha_refval_64be[81] = { 0,
  0x6A580668D6048674, 0xDECC975A0E3B8177, 0xE3AB9C06FAF4D023, 0xE401FA8F1B6AF969,
  0x67DB1DAE56FB94E3, 0x1106266A09B7A073, 0x550339B1EF2C7BBB, 0x290A2BAF590045BB,
  0xA182C1258C09F54A, 0x137D53C34BE7143A, 0xF6D2B69C6F42BEDC, 0x39643EAF2CA2E4B4,
  0x22A81F139A2C9559, 0x5B3D6AEF0AF33807, 0x56E3F80A68643C08, 0x9E423BE502378780,
  0xCDB0986F9A5B2FD5, 0xD5B3C84E7933293F, 0xE5FB8C90399E9742, 0x5D393C1F77B2CF3D,
  0xC8C82F5B2FF09266, 0xACA0230CA6F7B593, 0xCB5805E2960D1655, 0x7E2AD5B704D77C95,
  0xC5E903CDB8B9EB5D, 0x4CC7D0D21CC03511, 0x8385DF382CFB3E93, 0xF17699D0564D348A,
  0xF77EE7F8274A4C8D, 0xB9D8CEE48903BABE, 0xFE0EBD2A82B9CFE9, 0xB49FB6397270F565,
  0x173735C8C342108E, 0xA37C7FBBEEC0A2EA, 0xC13F66F462BB0B6E, 0x0C04F3C2B551467E,
  0x76A9CB156810C96E, 0x2038850919B0B151, 0xCEA19F2B6EED647B, 0x6746656D2FA109A4,
  0xF05137F221007F37, 0x892FA9E13A3B4948, 0x4D57B70D37548A32, 0x1A7CFB3D566580E6,
  0x7CB30272A45E3FAC, 0x137CCFFD9D51423F, 0xB87D96F3B82DF266, 0x33349AEE7472ED37,
  0x5CC0D3C99555BC07, 0x4A8F4FA196D964EF, 0xE82A0D64F281FBFA, 0x38A1BAC2C36823E1,
  0x77D197C239FD737E, 0xFB07746B4E07DF26, 0xC8A2198E967672BD, 0x5F1A146D143FA05A,
  0x26B877A1201AB7AC, 0x74E5B145214723F8, 0xE9CE10E3C70254BC, 0x299393A0C05B79E8,
  0xFD2D2B9822A5E7E2, 0x85424FEA50C8E50A, 0xE6839E714B1FFFE5, 0x27971CCB46F9112A,
  0xC98695A2E0715AA9, 0x338E1CBB4F858226, 0xFC6B5C5CF7A8D806, 0x8973CAADDE8DA50C,
  0x9C6D47AE32EBAE72, 0x1EBF1F9F21D26D78, 0x80A9704B8E153859, 0x6AFD20A939F141FB,
  0xC35F6C2B3B553EEF, 0x59529E8B0DC94C1A, 0x1569DF036EBC4FA1, 0xDA32B88593C118F9,
  0xF01E4155FF5A5660, 0x765A2522DCE2B185, 0xCEE95554128073EF, 0x60F072A5CA51DE2F
};

/* *INDENT-ON* */
/* clang-format on */

__cold int t1ha_selfcheck__t1ha1_le(void) {
  return t1ha_selfcheck(t1ha1_le, t1ha_refval_64le);
}

__cold int t1ha_selfcheck__t1ha1_be(void) {
  return t1ha_selfcheck(t1ha1_be, t1ha_refval_64be);
}

__cold int t1ha_selfcheck__t1ha1(void) {
  return t1ha_selfcheck__t1ha1_le() | t1ha_selfcheck__t1ha1_be();
}

#endif /* T1HA1_DISABLED */


================================================
FILE: benchmarks/t1ha/src/t1ha2.c
================================================
/*
 *  Copyright (c) 2016-2020 Positive Technologies, https://www.ptsecurity.com,
 *  Fast Positive Hash.
 *
 *  Portions Copyright (c) 2010-2020 Leonid Yuriev <leo@yuriev.ru>,
 *  The 1Hippeus project (t1h).
 *
 *  This software is provided 'as-is', without any express or implied
 *  warranty. In no event will the authors be held liable for any damages
 *  arising from the use of this software.
 *
 *  Permission is granted to anyone to use this software for any purpose,
 *  including commercial applications, and to alter it and redistribute it
 *  freely, subject to the following restrictions:
 *
 *  1. The origin of this software must not be misrepresented; you must not
 *     claim that you wrote the original software. If you use this software
 *     in a product, an acknowledgement in the product documentation would be
 *     appreciated but is not required.
 *  2. Altered source versions must be plainly marked as such, and must not be
 *     misrepresented as being the original software.
 *  3. This notice may not be removed or altered from any source distribution.
 */

/*
 * t1ha = { Fast Positive Hash, aka "Позитивный Хэш" }
 * by [Positive Technologies](https://www.ptsecurity.ru)
 *
 * Briefly, it is a 64-bit Hash Function:
 *  1. Created for 64-bit little-endian platforms, in predominantly for x86_64,
 *     but portable and without penalties it can run on any 64-bit CPU.
 *  2. In most cases up to 15% faster than City64, xxHash, mum-hash, metro-hash
 *     and all others portable hash-functions (which do not use specific
 *     hardware tricks).
 *  3. Not suitable for cryptography.
 *
 * The Future will (be) Positive. Всё будет хорошо.
 *
 * ACKNOWLEDGEMENT:
 * The t1ha was originally developed by Leonid Yuriev (Леонид Юрьев)
 * for The 1Hippeus project - zerocopy messaging in the spirit of Sparta!
 */

#ifndef T1HA2_DISABLED
#include "t1ha_bits.h"
#include "t1ha_selfcheck.h"

static __always_inline void init_ab(t1ha_state256_t *s, uint64_t x,
                                    uint64_t y) {
  s->n.a = x;
  s->n.b = y;
}

static __always_inline void init_cd(t1ha_state256_t *s, uint64_t x,
                                    uint64_t y) {
  s->n.c = rot64(y, 23) + ~x;
  s->n.d = ~y + rot64(x, 19);
}

/* TODO: C++ template in the next version */
#define T1HA2_UPDATE(ENDIANNES, ALIGNESS, state, v)                            \
  do {                                                                         \
    t1ha_state256_t *const s = state;                                          \
    const uint64_t w0 = fetch64_##ENDIANNES##_##ALIGNESS(v + 0);               \
    const uint64_t w1 = fetch64_##ENDIANNES##_##ALIGNESS(v + 1);               \
    const uint64_t w2 = fetch64_##ENDIANNES##_##ALIGNESS(v + 2);               \
    const uint64_t w3 = fetch64_##ENDIANNES##_##ALIGNESS(v + 3);               \
                                                                               \
    const uint64_t d02 = w0 + rot64(w2 + s->n.d, 56);                          \
    const uint64_t c13 = w1 + rot64(w3 + s->n.c, 19);                          \
    s->n.d ^= s->n.b + rot64(w1, 38);                                          \
    s->n.c ^= s->n.a + rot64(w0, 57);                                          \
    s->n.b ^= prime_6 * (c13 + w2);                                            \
    s->n.a ^= prime_5 * (d02 + w3);                                            \
  } while (0)

static __always_inline void squash(t1ha_state256_t *s) {
  s->n.a ^= prime_6 * (s->n.c + rot64(s->n.d, 23));
  s->n.b ^= prime_5 * (rot64(s->n.c, 19) + s->n.d);
}

/* TODO: C++ template in the next version */
#define T1HA2_LOOP(ENDIANNES, ALIGNESS, state, data, len)                      \
  do {                                                                         \
    const void *detent = (const uint8_t *)data + len - 31;                     \
    do {                                                                       \
      const uint64_t *v = (const uint64_t *)data;                              \
      data = (const uint64_t *)data + 4;                                       \
      prefetch(data);                                                          \
      T1HA2_UPDATE(le, ALIGNESS, state, v);                                    \
    } while (likely(data < detent));                                           \
  } while (0)

/* TODO: C++ template in the next version */
#define T1HA2_TAIL_AB(ENDIANNES, ALIGNESS, state, data, len)                   \
  do {                                                                         \
    t1ha_state256_t *const s = state;                                          \
    const uint64_t *v = (const uint64_t *)data;                                \
    switch (len) {                                                             \
    default:                                                                   \
      mixup64(&s->n.a, &s->n.b, fetch64_##ENDIANNES##_##ALIGNESS(v++),         \
              prime_4);                                                        \
    /* fall through */                                                         \
    case 24:                                                                   \
    case 23:                                                                   \
    case 22:                                                                   \
    case 21:                                                                   \
    case 20:                                                                   \
    case 19:                                                                   \
    case 18:                                                                   \
    case 17:                                                                   \
      mixup64(&s->n.b, &s->n.a, fetch64_##ENDIANNES##_##ALIGNESS(v++),         \
              prime_3);                                                        \
    /* fall through */                                                         \
    case 16:                                                                   \
    case 15:                                                                   \
    case 14:                                                                   \
    case 13:                                                                   \
    case 12:                                                                   \
    case 11:                                                                   \
    case 10:                                                                   \
    case 9:                                                                    \
      mixup64(&s->n.a, &s->n.b, fetch64_##ENDIANNES##_##ALIGNESS(v++),         \
              prime_2);                                                        \
    /* fall through */                                                         \
    case 8:                                                                    \
    case 7:                                                                    \
    case 6:                                                                    \
    case 5:                                                                    \
    case 4:                                                                    \
    case 3:                                                                    \
    case 2:                                                                    \
    case 1:                                                                    \
      mixup64(&s->n.b, &s->n.a, tail64_##ENDIANNES##_##ALIGNESS(v, len),       \
              prime_1);                                                        \
    /* fall through */                                                         \
    case 0:                                                                    \
      return final64(s->n.a, s->n.b);                                          \
    }                                                                          \
  } while (0)

/* TODO: C++ template in the next version */
#define T1HA2_TAIL_ABCD(ENDIANNES, ALIGNESS, state, data, len)                 \
  do {                                                                         \
    t1ha_state256_t *const s = state;                                          \
    const uint64_t *v = (const uint64_t *)data;                                \
    switch (len) {                                                             \
    default:                                                                   \
      mixup64(&s->n.a, &s->n.d, fetch64_##ENDIANNES##_##ALIGNESS(v++),         \
              prime_4);                                                        \
    /* fall through */                                                         \
    case 24:                                                                   \
    case 23:                                                                   \
    case 22:                                                                   \
    case 21:                                                                   \
    case 20:                                                                   \
    case 19:                                                                   \
    case 18:                                                                   \
    case 17:                                                                   \
      mixup64(&s->n.b, &s->n.a, fetch64_##ENDIANNES##_##ALIGNESS(v++),         \
              prime_3);                                                        \
    /* fall through */                                                         \
    case 16:                                                                   \
    case 15:                                                                   \
    case 14:                                                                   \
    case 13:                                                                   \
    case 12:                                                                   \
    case 11:                                                                   \
    case 10:                                                                   \
    case 9:                                                                    \
      mixup64(&s->n.c, &s->n.b, fetch64_##ENDIANNES##_##ALIGNESS(v++),         \
              prime_2);                                                        \
    /* fall through */                                                         \
    case 8:                                                                    \
    case 7:                                                                    \
    case 6:                                                                    \
    case 5:                                                                    \
    case 4:                                                                    \
    case 3:                                                                    \
    case 2:                                                                    \
    case 1:                                                                    \
      mixup64(&s->n.d, &s->n.c, tail64_##ENDIANNES##_##ALIGNESS(v, len),       \
              prime_1);                                                        \
    /* fall through */                                                         \
    case 0:                                                                    \
      return final128(s->n.a, s->n.b, s->n.c, s->n.d, extra_result);           \
    }                                                                          \
  } while (0)

static __always_inline uint64_t final128(uint64_t a, uint64_t b, uint64_t c,
                                         uint64_t d, uint64_t *h) {
  mixup64(&a, &b, rot64(c, 41) ^ d, prime_0);
  mixup64(&b, &c, rot64(d, 23) ^ a, prime_6);
  mixup64(&c, &d, rot64(a, 19) ^ b, prime_5);
  mixup64(&d, &a, rot64(b, 31) ^ c, prime_4);
  *h = c + d;
  return a ^ b;
}

//------------------------------------------------------------------------------

uint64_t t1ha2_atonce(const void *data, size_t length, uint64_t seed) {
  t1ha_state256_t state;
  init_ab(&state, seed, length);

#if T1HA_SYS_UNALIGNED_ACCESS == T1HA_UNALIGNED_ACCESS__EFFICIENT
  if (unlikely(length > 32)) {
    init_cd(&state, seed, length);
#if defined(__LCC__) && __LCC__ > 123
/* Форсирует комбинирование пар арифметических операций в двухэтажные операции
 * в ближайшем после объявления директивы цикле, даже если эвристики оптимизации
 * говорят, что это нецелесообразно */
#pragma comb_oper
#endif /* E2K LCC > 1.23 */
    T1HA2_LOOP(le, unaligned, &state, data, length);
    squash(&state);
    length &= 31;
  }
  T1HA2_TAIL_AB(le, unaligned, &state, data, length);
#else
  const bool misaligned = (((uintptr_t)data) & (ALIGNMENT_64 - 1)) != 0;
  if (misaligned) {
    if (unlikely(length > 32)) {
      init_cd(&state, seed, length);
#if defined(__LCC__) && __LCC__ > 123
/* Форсирует комбинирование пар арифметических операций в двухэтажные операции
 * в ближайшем после объявления директивы цикле, даже если эвристики оптимизации
 * говорят, что это нецелесообразно */
#pragma comb_oper
#endif /* E2K LCC > 1.23 */
      T1HA2_LOOP(le, unaligned, &state, data, length);
      squash(&state);
      length &= 31;
    }
    T1HA2_TAIL_AB(le, unaligned, &state, data, length);
  } else {
    if (unlikely(length > 32)) {
      init_cd(&state, seed, length);
#if defined(__LCC__) && __LCC__ > 123
/* Форсирует комбинирование пар арифметических операций в двухэтажные операции
 * в ближайшем после объявления директивы цикле, даже если эвристики оптимизации
 * говорят, что это нецелесообразно */
#pragma comb_oper
#endif /* E2K LCC > 1.23 */
      T1HA2_LOOP(le, aligned, &state, data, length);
      squash(&state);
      length &= 31;
    }
    T1HA2_TAIL_AB(le, aligned, &state, data, length);
  }
#endif
}

uint64_t t1ha2_atonce128(uint64_t *__restrict extra_result,
                         const void *__restrict data, size_t length,
                         uint64_t seed) {
  t1ha_state256_t state;
  init_ab(&state, seed, length);
  init_cd(&state, seed, length);

#if T1HA_SYS_UNALIGNED_ACCESS == T1HA_UNALIGNED_ACCESS__EFFICIENT
  if (unlikely(length > 32)) {
#if defined(__LCC__) && __LCC__ > 123
/* Форсирует комбинирование пар арифметических операций в двухэтажные операции
 * в ближайшем после объявления директивы цикле, даже если эвристики оптимизации
 * говорят, что это нецелесообразно */
#pragma comb_oper
#endif /* E2K LCC > 1.23 */
    T1HA2_LOOP(le, unaligned, &state, data, length);
    length &= 31;
  }
  T1HA2_TAIL_ABCD(le, unaligned, &state, data, length);
#else
  const bool misaligned = (((uintptr_t)data) & (ALIGNMENT_64 - 1)) != 0;
  if (misaligned) {
    if (unlikely(length > 32)) {
#if defined(__LCC__) && __LCC__ > 123
/* Форсирует комбинирование пар арифметических операций в двухэтажные операции
 * в ближайшем после объявления директивы цикле, даже если эвристики оптимизации
 * говорят, что это нецелесообразно */
#pragma comb_oper
#endif /* E2K LCC > 1.23 */
      T1HA2_LOOP(le, unaligned, &state, data, length);
      length &= 31;
    }
    T1HA2_TAIL_ABCD(le, unaligned, &state, data, length);
  } else {
    if (unlikely(length > 32)) {
#if defined(__LCC__) && __LCC__ > 123
/* Форсирует комбинирование пар арифметических операций в двухэтажные операции
 * в ближайшем после объявления директивы цикле, даже если эвристики оптимизации
 * говорят, что это нецелесообразно */
#pragma comb_oper
#endif /* E2K LCC > 1.23 */
      T1HA2_LOOP(le, aligned, &state, data, length);
      length &= 31;
    }
    T1HA2_TAIL_ABCD(le, aligned, &state, data, length);
  }
#endif
}

//------------------------------------------------------------------------------

void t1ha2_init(t1ha_context_t *ctx, uint64_t seed_x, uint64_t seed_y) {
  init_ab(&ctx->state, seed_x, seed_y);
  init_cd(&ctx->state, seed_x, seed_y);
  ctx->partial = 0;
  ctx->total = 0;
}

void t1ha2_update(t1ha_context_t *__restrict ctx, const void *__restrict data,
                  size_t length) {
  ctx->total += length;

  if (ctx->partial) {
    const size_t left = 32 - ctx->partial;
    const size_t chunk = (length >= left) ? left : length;
    memcpy(ctx->buffer.bytes + ctx->partial, data, chunk);
    ctx->partial += chunk;
    if (ctx->partial < 32) {
      assert(left >= length);
      return;
    }
    ctx->partial = 0;
    data = (const uint8_t *)data + chunk;
    length -= chunk;
    T1HA2_UPDATE(le, aligned, &ctx->state, ctx->buffer.u64);
  }

  if (length >= 32) {
#if T1HA_SYS_UNALIGNED_ACCESS == T1HA_UNALIGNED_ACCESS__EFFICIENT
#if defined(__LCC__) && __LCC__ > 123
/* Форсирует комбинирование пар арифметических операций в двухэтажные операции
 * в ближайшем после объявления директивы цикле, даже если эвристики оптимизации
 * говорят, что это нецелесообразно */
#pragma comb_oper
#endif /* E2K LCC > 1.23 */
    T1HA2_LOOP(le, unaligned, &ctx->state, data, length);
#else
    const bool misaligned = (((uintptr_t)data) & (ALIGNMENT_64 - 1)) != 0;
    if (misaligned) {
#if defined(__LCC__) && __LCC__ > 123
/* Форсирует комбинирование пар арифметических операций в двухэтажные операции
 * в ближайшем после объявления директивы цикле, даже если эвристики оптимизации
 * говорят, что это нецелесообразно */
#pragma comb_oper
#endif /* E2K LCC > 1.23 */
      T1HA2_LOOP(le, unaligned, &ctx->state, data, length);
    } else {
#if defined(__LCC__) && __LCC__ > 123
/* Форсирует комбинирование пар арифметических операций в двухэтажные операции
 * в ближайшем после объявления директивы цикле, даже если эвристики оптимизации
 * говорят, что это нецелесообразно */
#pragma comb_oper
#endif /* E2K LCC > 1.23 */
      T1HA2_LOOP(le, aligned, &ctx->state, data, length);
    }
#endif
    length &= 31;
  }

  if (length)
    memcpy(ctx->buffer.bytes, data, ctx->partial = length);
}

uint64_t t1ha2_final(t1ha_context_t *__restrict ctx,
                     uint64_t *__restrict extra_result) {
  uint64_t bits = (ctx->total << 3) ^ (UINT64_C(1) << 63);
#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__
  bits = bswap64(bits);
#endif
  t1ha2_update(ctx, &bits, 8);

  if (likely(!extra_result)) {
    squash(&ctx->state);
    T1HA2_TAIL_AB(le, aligned, &ctx->state, ctx->buffer.u64, ctx->partial);
  }

  T1HA2_TAIL_ABCD(le, aligned, &ctx->state, ctx->buffer.u64, ctx->partial);
}

#endif /* T1HA2_DISABLED */


================================================
FILE: benchmarks/t1ha/src/t1ha2_selfcheck.c
================================================
/*
 *  Copyright (c) 2016-2020 Positive Technologies, https://www.ptsecurity.com,
 *  Fast Positive Hash.
 *
 *  Portions Copyright (c) 2010-2020 Leonid Yuriev <leo@yuriev.ru>,
 *  The 1Hippeus project (t1h).
 *
 *  This software is provided 'as-is', without any express or implied
 *  warranty. In no event will the authors be held liable for any damages
 *  arising from the use of this software.
 *
 *  Permission is granted to anyone to use this software for any purpose,
 *  including commercial applications, and to alter it and redistribute it
 *  freely, subject to the following restrictions:
 *
 *  1. The origin of this software must not be misrepresented; you must not
 *     claim that you wrote the original software. If you use this software
 *     in a product, an acknowledgement in the product documentation would be
 *     appreciated but is not required.
 *  2. Altered source versions must be plainly marked as such, and must not be
 *     misrepresented as being the original software.
 *  3. This notice may not be removed or altered from any source distribution.
 */

/*
 * t1ha = { Fast Positive Hash, aka "Позитивный Хэш" }
 * by [Positive Technologies](https://www.ptsecurity.ru)
 *
 * Briefly, it is a 64-bit Hash Function:
 *  1. Created for 64-bit little-endian platforms, in predominantly for x86_64,
 *     but portable and without penalties it can run on any 64-bit CPU.
 *  2. In most cases up to 15% faster than City64, xxHash, mum-hash, metro-hash
 *     and all others portable hash-functions (which do not use specific
 *     hardware tricks).
 *  3. Not suitable for cryptography.
 *
 * The Future will (be) Positive. Всё будет хорошо.
 *
 * ACKNOWLEDGEMENT:
 * The t1ha was originally developed by Leonid Yuriev (Леонид Юрьев)
 * for The 1Hippeus project - zerocopy messaging in the spirit of Sparta!
 */

#ifndef T1HA2_DISABLED
#include "t1ha_bits.h"
#include "t1ha_selfcheck.h"

/* *INDENT-OFF* */
/* clang-format off */

const uint64_t t1ha_refval_2atonce[81] = { 0,
  0x772C7311BE32FF42, 0x444753D23F207E03, 0x71F6DF5DA3B4F532, 0x555859635365F660,
  0xE98808F1CD39C626, 0x2EB18FAF2163BB09, 0x7B9DD892C8019C87, 0xE2B1431C4DA4D15A,
  0x1984E718A5477F70, 0x08DD17B266484F79, 0x4C83A05D766AD550, 0x92DCEBB131D1907D,
  0xD67BC6FC881B8549, 0xF6A9886555FBF66B, 0x6E31616D7F33E25E, 0x36E31B7426E3049D,
  0x4F8E4FAF46A13F5F, 0x03EB0CB3253F819F, 0x636A7769905770D2, 0x3ADF3781D16D1148,
  0x92D19CB1818BC9C2, 0x283E68F4D459C533, 0xFA83A8A88DECAA04, 0x8C6F00368EAC538C,
  0x7B66B0CF3797B322, 0x5131E122FDABA3FF, 0x6E59FF515C08C7A9, 0xBA2C5269B2C377B0,
  0xA9D24FD368FE8A2B, 0x22DB13D32E33E891, 0x7B97DFC804B876E5, 0xC598BDFCD0E834F9,
  0xB256163D3687F5A7, 0x66D7A73C6AEF50B3, 0x25A7201C85D9E2A3, 0x911573EDA15299AA,
  0x5C0062B669E18E4C, 0x17734ADE08D54E28, 0xFFF036E33883F43B, 0xFE0756E7777DF11E,
  0x37972472D023F129, 0x6CFCE201B55C7F57, 0xE019D1D89F02B3E1, 0xAE5CC580FA1BB7E6,
  0x295695FB7E59FC3A, 0x76B6C820A40DD35E, 0xB1680A1768462B17, 0x2FB6AF279137DADA,
  0x28FB6B4366C78535, 0xEC278E53924541B1, 0x164F8AAB8A2A28B5, 0xB6C330AEAC4578AD,
  0x7F6F371070085084, 0x94DEAD60C0F448D3, 0x99737AC232C559EF, 0x6F54A6F9CA8EDD57,
  0x979B01E926BFCE0C, 0xF7D20BC85439C5B4, 0x64EDB27CD8087C12, 0x11488DE5F79C0BE2,
  0x25541DDD1680B5A4, 0x8B633D33BE9D1973, 0x404A3113ACF7F6C6, 0xC59DBDEF8550CD56,
  0x039D23C68F4F992C, 0x5BBB48E4BDD6FD86, 0x41E312248780DF5A, 0xD34791CE75D4E94F,
  0xED523E5D04DCDCFF, 0x7A6BCE0B6182D879, 0x21FB37483CAC28D8, 0x19A1B66E8DA878AD,
  0x6F804C5295B09ABE, 0x2A4BE5014115BA81, 0xA678ECC5FC924BE0, 0x50F7A54A99A36F59,
  0x0FD7E63A39A66452, 0x5AB1B213DD29C4E4, 0xF3ED80D9DF6534C5, 0xC736B12EF90615FD
};

const uint64_t t1ha_refval_2atonce128[81] = { 0x4EC7F6A48E33B00A,
  0xB7B7FAA5BD7D8C1E, 0x3269533F66534A76, 0x6C3EC6B687923BFC, 0xC096F5E7EFA471A9,
  0x79D8AFB550CEA471, 0xCEE0507A20FD5119, 0xFB04CFFC14A9F4BF, 0xBD4406E923807AF2,
  0x375C02FF11010491, 0xA6EA4C2A59E173FF, 0xE0A606F0002CADDF, 0xE13BEAE6EBC07897,
  0xF069C2463E48EA10, 0x75BEE1A97089B5FA, 0x378F22F8DE0B8085, 0x9C726FC4D53D0D8B,
  0x71F6130A2D08F788, 0x7A9B20433FF6CF69, 0xFF49B7CD59BF6D61, 0xCCAAEE0D1CA9C6B3,
  0xC77889D86039D2AD, 0x7B378B5BEA9B0475, 0x6520BFA79D59AD66, 0x2441490CB8A37267,
  0xA715A66B7D5CF473, 0x9AE892C88334FD67, 0xD2FFE9AEC1D2169A, 0x790B993F18B18CBB,
  0xA0D02FBCF6A7B1AD, 0xA90833E6F151D0C1, 0x1AC7AFA37BD79BE0, 0xD5383628B2881A24,
  0xE5526F9D63F9F8F1, 0xC1F165A01A6D1F4D, 0x6CCEF8FF3FCFA3F2, 0x2030F18325E6DF48,
  0x289207230E3FB17A, 0x077B66F713A3C4B9, 0x9F39843CAF871754, 0x512FDA0F808ACCF3,
  0xF4D9801CD0CD1F14, 0x28A0C749ED323638, 0x94844CAFA671F01C, 0xD0E261876B8ACA51,
  0x8FC2A648A4792EA2, 0x8EF87282136AF5FE, 0x5FE6A54A9FBA6B40, 0xA3CC5B8FE6223D54,
  0xA8C3C0DD651BB01C, 0x625E9FDD534716F3, 0x1AB2604083C33AC5, 0xDE098853F8692F12,
  0x4B0813891BD87624, 0x4AB89C4553D182AD, 0x92C15AA2A3C27ADA, 0xFF2918D68191F5D9,
  0x06363174F641C325, 0x667112ADA74A2059, 0x4BD605D6B5E53D7D, 0xF2512C53663A14C8,
  0x21857BCB1852667C, 0xAFBEBD0369AEE228, 0x7049340E48FBFD6B, 0x50710E1924F46954,
  0x869A75E04A976A3F, 0x5A41ABBDD6373889, 0xA781778389B4B188, 0x21A3AFCED6C925B6,
  0x107226192EC10B42, 0x62A862E84EC2F9B1, 0x2B15E91659606DD7, 0x613934D1F9EC5A42,
  0x4DC3A96DC5361BAF, 0xC80BBA4CB5F12903, 0x3E3EDAE99A7D6987, 0x8F97B2D55941DCB0,
  0x4C9787364C3E4EC1, 0xEF0A2D07BEA90CA7, 0x5FABF32C70AEEAFB, 0x3356A5CFA8F23BF4
};

const uint64_t t1ha_refval_2stream[81] = { 0x3C8426E33CB41606,
  0xFD74BE70EE73E617, 0xF43DE3CDD8A20486, 0x882FBCB37E8EA3BB, 0x1AA2CDD34CAA3D4B,
  0xEE755B2BFAE07ED5, 0xD4E225250D92E213, 0xA09B49083205965B, 0xD47B21724EF9EC9E,
  0xAC888FC3858CEE11, 0x94F820D85736F244, 0x1707951CCA920932, 0x8E0E45603F7877F0,
  0x9FD2592C0E3A7212, 0x9A66370F3AE3D427, 0xD33382D2161DE2B7, 0x9A35BE079DA7115F,
  0x73457C7FF58B4EC3, 0xBE8610BD53D7CE98, 0x65506DFE5CCD5371, 0x286A321AF9D5D9FA,
  0xB81EF9A7EF3C536D, 0x2CFDB5E6825C6E86, 0xB2A58CBFDFDD303A, 0xD26094A42B950635,
  0xA34D666A5F02AD9A, 0x0151E013EBCC72E5, 0x9254A6EA7FCB6BB5, 0x10C9361B3869DC2B,
  0xD7EC55A060606276, 0xA2FF7F8BF8976FFD, 0xB5181BB6852DCC88, 0x0EE394BB6178BAFF,
  0x3A8B4B400D21B89C, 0xEC270461970960FD, 0x615967FAB053877E, 0xFA51BF1CFEB4714C,
  0x29FDA8383070F375, 0xC3B663061BC52EDA, 0x192BBAF1F1A57923, 0x6D193B52F93C53AF,
  0x7F6F5639FE87CA1E, 0x69F7F9140B32EDC8, 0xD0F2416FB24325B6, 0x62C0E37FEDD49FF3,
  0x57866A4B809D373D, 0x9848D24BD935E137, 0xDFC905B66734D50A, 0x9A938DD194A68529,
  0x8276C44DF0625228, 0xA4B35D00AD67C0AB, 0x3D9CB359842DB452, 0x4241BFA8C23B267F,
  0x650FA517BEF15952, 0x782DE2ABD8C7B1E1, 0x4EAE456166CA3E15, 0x40CDF3A02614E337,
  0xAD84092C46102172, 0x0C68479B03F9A167, 0x7E1BA046749E181C, 0x3F3AB41A697382C1,
  0xC5E5DD6586EBFDC4, 0xFF926CD4EB02555C, 0x035CFE67F89E709B, 0x89F06AB6464A1B9D,
  0x8EFF58F3F7DEA758, 0x8B54AC657902089F, 0xC6C4F1F9F8DA4D64, 0xBDB729048AAAC93A,
  0xEA76BA628F5E5CD6, 0x742159B728B8A979, 0x6D151CD3C720E53D, 0xE97FFF9368FCDC42,
  0xCA5B38314914FBDA, 0xDD92C91D8B858EAE, 0x66E5F07CF647CBF2, 0xD4CF9B42F4985AFB,
  0x72AE17AC7D92F6B7, 0xB8206B22AB0472E1, 0x385876B5CFD42479, 0x03294A249EBE6B26
};

const uint64_t t1ha_refval_2stream128[81] = { 0xCD2801D3B92237D6,
  0x10E4D47BD821546D, 0x9100704B9D65CD06, 0xD6951CB4016313EF, 0x24DB636F96F474DA,
  0x3F4AF7DF3C49E422, 0xBFF25B8AF143459B, 0xA157EC13538BE549, 0xD3F5F52C47DBD419,
  0x0EF3D7D735AF1575, 0x46B7B892823F7B1B, 0xEE22EA4655213289, 0x56AD76F02FE929BC,
  0x9CF6CD1AC886546E, 0xAF45CE47AEA0B933, 0x535F9DC09F3996B7, 0x1F0C3C01694AE128,
  0x18495069BE0766F7, 0x37E5FFB3D72A4CB1, 0x6D6C2E9299F30709, 0x4F39E693F50B41E3,
  0xB11FC4EF0658E116, 0x48BFAACB78E5079B, 0xE1B4C89C781B3AD0, 0x81D2F34888D333A1,
  0xF6D02270D2EA449C, 0xC884C3C2C3CE1503, 0x711AE16BA157A9B9, 0x1E6140C642558C9D,
  0x35AB3D238F5DC55B, 0x33F07B6AEF051177, 0xE57336776EEFA71C, 0x6D445F8318BA3752,
  0xD4F5F6631934C988, 0xD5E260085727C4A2, 0x5B54B41EC180B4FA, 0x7F5D75769C15A898,
  0xAE5A6DB850CA33C6, 0x038CCB8044663403, 0xDA16310133DC92B8, 0x6A2FFB7AB2B7CE2B,
  0xDC1832D9229BAE20, 0x8C62C479F5ABC9E4, 0x5EB7B617857C9CCB, 0xB79CF7D749A1E80D,
  0xDE7FAC3798324FD3, 0x8178911813685D06, 0x6A726CBD394D4410, 0x6CBE6B3280DA1113,
  0x6829BA4410CF1148, 0xFA7E417EB26C5BC6, 0x22ED87884D6E3A49, 0x15F1472D5115669D,
  0x2EA0B4C8BF69D318, 0xDFE87070AA545503, 0x6B4C14B5F7144AB9, 0xC1ED49C06126551A,
  0x351919FC425C3899, 0x7B569C0FA6F1BD3E, 0x713AC2350844CFFD, 0xE9367F9A638C2FF3,
  0x97F17D325AEA0786, 0xBCB907CC6CF75F91, 0x0CB7517DAF247719, 0xBE16093CC45BE8A9,
  0x786EEE97359AD6AB, 0xB7AFA4F326B97E78, 0x2694B67FE23E502E, 0x4CB492826E98E0B4,
  0x838D119F74A416C7, 0x70D6A91E4E5677FD, 0xF3E4027AD30000E6, 0x9BDF692795807F77,
  0x6A371F966E034A54, 0x8789CF41AE4D67EF, 0x02688755484D60AE, 0xD5834B3A4BF5CE42,
  0x9405FC61440DE25D, 0x35EB280A157979B6, 0x48D40D6A525297AC, 0x6A87DC185054BADA
};

/* *INDENT-ON* */
/* clang-format on */

__cold int t1ha_selfcheck__t1ha2_atonce(void) {
  return t1ha_selfcheck(t1ha2_atonce, t1ha_refval_2atonce);
}

__cold static uint64_t thunk_atonce128(const void *data, size_t len,
                                       uint64_t seed) {
  uint64_t unused;
  return t1ha2_atonce128(&unused, data, len, seed);
}

__cold int t1ha_selfcheck__t1ha2_atonce128(void) {
  return t1ha_selfcheck(thunk_atonce128, t1ha_refval_2atonce128);
}

__cold static uint64_t thunk_stream(const void *data, size_t len,
                                    uint64_t seed) {
  t1ha_context_t ctx;
  t1ha2_init(&ctx, seed, seed);
  t1ha2_update(&ctx, data, len);
  return t1ha2_final(&ctx, NULL);
}

__cold static uint64_t thunk_stream128(const void *data, size_t len,
                                       uint64_t seed) {
  t1ha_context_t ctx;
  t1ha2_init(&ctx, seed, seed);
  t1ha2_update(&ctx, data, len);
  uint64_t unused;
  return t1ha2_final(&ctx, &unused);
}

__cold int t1ha_selfcheck__t1ha2_stream(void) {
  return t1ha_selfcheck(thunk_stream, t1ha_refval_2stream) |
         t1ha_selfcheck(thunk_stream128, t1ha_refval_2stream128);
}

__cold int t1ha_selfcheck__t1ha2(void) {
  return t1ha_selfcheck__t1ha2_atonce() | t1ha_selfcheck__t1ha2_atonce128() |
         t1ha_selfcheck__t1ha2_stream();
}

#endif /* T1HA2_DISABLED */


================================================
FILE: benchmarks/t1ha/src/t1ha_bits.h
================================================
/*
 *  Copyright (c) 2016-2020 Positive Technologies, https://www.ptsecurity.com,
 *  Fast Positive Hash.
 *
 *  Portions Copyright (c) 2010-2020 Leonid Yuriev <leo@yuriev.ru>,
 *  The 1Hippeus project (t1h).
 *
 *  This software is provided 'as-is', without any express or implied
 *  warranty. In no event will the authors be held liable for any damages
 *  arising from the use of this software.
 *
 *  Permission is granted to anyone to use this software for any purpose,
 *  including commercial applications, and to alter it and redistribute it
 *  freely, subject to the following restrictions:
 *
 *  1. The origin of this software must not be misrepresented; you must not
 *     claim that you wrote the original software. If you use this software
 *     in a product, an acknowledgement in the product documentation would be
 *     appreciated but is not required.
 *  2. Altered source versions must be plainly marked as such, and must not be
 *     misrepresented as being the original software.
 *  3. This notice may not be removed or altered from any source distribution.
 */

/*
 * t1ha = { Fast Positive Hash, aka "Позитивный Хэш" }
 * by [Positive Technologies](https://www.ptsecurity.ru)
 *
 * Briefly, it is a 64-bit Hash Function:
 *  1. Created for 64-bit little-endian platforms, in predominantly for x86_64,
 *     but portable and without penalties it can run on any 64-bit CPU.
 *  2. In most cases up to 15% faster than City64, xxHash, mum-hash, metro-hash
 *     and all others portable hash-functions (which do not use specific
 *     hardware tricks).
 *  3. Not suitable for cryptography.
 *
 * The Future will (be) Positive. Всё будет хорошо.
 *
 * ACKNOWLEDGEMENT:
 * The t1ha was originally developed by Leonid Yuriev (Леонид Юрьев)
 * for The 1Hippeus project - zerocopy messaging in the spirit of Sparta!
 */

#pragma once

#if defined(_MSC_VER)
#pragma warning(disable : 4201) /* nameless struct/union */
#if _MSC_VER > 1800
#pragma warning(disable : 4464) /* relative include path contains '..' */
#endif                          /* 1800 */
#endif                          /* MSVC */
#include "../t1ha.h"

#ifndef T1HA_USE_FAST_ONESHOT_READ
/* Define it to 1 for little bit faster code.
 * Unfortunately this may triggering a false-positive alarms from Valgrind,
 * AddressSanitizer and other similar tool.
 * So, define it to 0 for calmness if doubt. */
#define T1HA_USE_FAST_ONESHOT_READ 1
#endif /* T1HA_USE_FAST_ONESHOT_READ */

/*****************************************************************************/

#include <assert.h>  /* for assert() */
#include <stdbool.h> /* for bool */
#include <string.h>  /* for memcpy() */

#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ &&                               \
    __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
#error Unsupported byte order.
#endif

#define T1HA_UNALIGNED_ACCESS__UNABLE 0
#define T1HA_UNALIGNED_ACCESS__SLOW 1
#define T1HA_UNALIGNED_ACCESS__EFFICIENT 2

#ifndef T1HA_SYS_UNALIGNED_ACCESS
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
#define T1HA_SYS_UNALIGNED_ACCESS T1HA_UNALIGNED_ACCESS__EFFICIENT
#elif defined(__ia32__)
#define T1HA_SYS_UNALIGNED_ACCESS T1HA_UNALIGNED_ACCESS__EFFICIENT
#elif defined(__e2k__)
#define T1HA_SYS_UNALIGNED_ACCESS T1HA_UNALIGNED_ACCESS__SLOW
#else
#define T1HA_SYS_UNALIGNED_ACCESS T1HA_UNALIGNED_ACCESS__UNABLE
#endif
#endif /* T1HA_SYS_UNALIGNED_ACCESS */

#define ALIGNMENT_16 2
#define ALIGNMENT_32 4
#if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul
#define ALIGNMENT_64 8
#else
#define ALIGNMENT_64 4
#endif

#ifndef PAGESIZE
#define PAGESIZE 4096
#endif /* PAGESIZE */

/***************************************************************************/

#ifndef __has_builtin
#define __has_builtin(x) (0)
#endif

#ifndef __has_warning
#define __has_warning(x) (0)
#endif

#ifndef __has_feature
#define __has_feature(x) (0)
#endif

#ifndef __has_extension
#define __has_extension(x) (0)
#endif

#if __has_feature(address_sanitizer)
#define __SANITIZE_ADDRESS__ 1
#endif

#ifndef __optimize
#if defined(__clang__) && !__has_attribute(__optimize__)
#define __optimize(ops)
#elif defined(__GNUC__) || __has_attribute(__optimize__)
#define __optimize(ops) __attribute__((__optimize__(ops)))
#else
#define __optimize(ops)
#endif
#endif /* __optimize */

#ifndef __cold
#if defined(__OPTIMIZE__)
#if defined(__e2k__)
#define __cold __optimize(1) __attribute__((__cold__))
#elif defined(__clang__) && !__has_attribute(__cold__) &&                      \
    __has_attribute(__section__)
/* just put infrequently used functions in separate section */
#define __cold __attribute__((__section__("text.unlikely"))) __optimize("Os")
#elif defined(__GNUC__) || __has_attribute(__cold__)
#define __cold __attribute__((__cold__)) __optimize("Os")
#else
#define __cold __optimize("Os")
#endif
#else
#define __cold
#endif
#endif /* __cold */

#if __GNUC_PREREQ(4, 4) || defined(__clang__)

#if defined(__ia32__) || defined(__e2k__)
#include <x86intrin.h>
#endif

#if defined(__ia32__) && !defined(__cpuid_count)
#include <cpuid.h>
#endif

#if defined(__e2k__)
#include <e2kbuiltin.h>
#endif

#ifndef likely
#define likely(cond) __builtin_expect(!!(cond), 1)
#endif

#ifndef unlikely
#define unlikely(cond) __builtin_expect(!!(cond), 0)
#endif

#if __GNUC_PREREQ(4, 5) || __has_builtin(__builtin_unreachable)
#define unreachable() __builtin_unreachable()
#endif

#define bswap64(v) __builtin_bswap64(v)
#define bswap32(v) __builtin_bswap32(v)
#if __GNUC_PREREQ(4, 8) || __has_builtin(__builtin_bswap16)
#define bswap16(v) __builtin_bswap16(v)
#endif

#if !defined(__maybe_unused) &&                                                \
    (__GNUC_PREREQ(4, 3) || __has_attribute(__unused__))
#define __maybe_unused __attribute__((__unused__))
#endif

#if !defined(__always_inline) &&                                               \
    (__GNUC_PREREQ(3, 2) || __has_attribute(__always_inline__))
#define __always_inline __inline __attribute__((__always_inline__))
#endif

#if defined(__e2k__)

#if __iset__ >= 3
#define mul_64x64_high(a, b) __builtin_e2k_umulhd(a, b)
#endif /* __iset__ >= 3 */

#if __iset__ >= 5
static __maybe_unused __always_inline unsigned
e2k_add64carry_first(uint64_t base, uint64_t addend, uint64_t *sum) {
  *sum = base + addend;
  return (unsigned)__builtin_e2k_addcd_c(base, addend, 0);
}
#define add64carry_first(base, addend, sum)                                    \
  e2k_add64carry_first(base, addend, sum)

static __maybe_unused __always_inline unsigned
e2k_add64carry_next(unsigned carry, uint64_t base, uint64_t addend,
                    uint64_t *sum) {
  *sum = __builtin_e2k_addcd(base, addend, carry);
  return (unsigned)__builtin_e2k_addcd_c(base, addend, carry);
}
#define add64carry_next(carry, base, addend, sum)                              \
  e2k_add64carry_next(carry, base, addend, sum)

static __maybe_unused __always_inline void e2k_add64carry_last(unsigned carry,
                                                               uint64_t base,
                                                               uint64_t addend,
                                                               uint64_t *sum) {
  *sum = __builtin_e2k_addcd(base, addend, carry);
}
#define add64carry_last(carry, base, addend, sum)                              \
  e2k_add64carry_last(carry, base, addend, sum)
#endif /* __iset__ >= 5 */

#define fetch64_be_aligned(ptr) ((uint64_t)__builtin_e2k_ld_64s_be(ptr))
#define fetch32_be_aligned(ptr) ((uint32_t)__builtin_e2k_ld_32u_be(ptr))

#endif /* __e2k__ Elbrus */

#elif defined(_MSC_VER)

#if _MSC_FULL_VER < 190024234 && defined(_M_IX86)
#pragma message(                                                               \
    "For AES-NI at least \"Microsoft C/C++ Compiler\" version 19.00.24234 (Visual Studio 2015 Update 3) is required.")
#endif
#if _MSC_FULL_VER < 191526730
#pragma message(                                                               \
    "It is recommended to use \"Microsoft C/C++ Compiler\" version 19.15.26730 (Visual Studio 2017 15.8) or newer.")
#endif
#if _MSC_FULL_VER < 180040629
#error At least "Microsoft C/C++ Compiler" version 18.00.40629 (Visual Studio 2013 Update 5) is required.
#endif

#pragma warning(push, 1)

#include <intrin.h>
#include <stdlib.h>
#define likely(cond) (cond)
#define unlikely(cond) (cond)
#define unreachable() __assume(0)
#define bswap64(v) _byteswap_uint64(v)
#define bswap32(v) _byteswap_ulong(v)
#define bswap16(v) _byteswap_ushort(v)
#define rot64(v, s) _rotr64(v, s)
#define rot32(v, s) _rotr(v, s)
#define __always_inline __forceinline

#if defined(_M_X64) || defined(_M_IA64)
#pragma intrinsic(_umul128)
#define mul_64x64_128(a, b, ph) _umul128(a, b, ph)
#pragma intrinsic(_addcarry_u64)
#define add64carry_first(base, addend, sum) _addcarry_u64(0, base, addend, sum)
#define add64carry_next(carry, base, addend, sum)                              \
  _addcarry_u64(carry, base, addend, sum)
#define add64carry_last(carry, base, addend, sum)                              \
  (void)_addcarry_u64(carry, base, addend, sum)
#endif

#if defined(_M_ARM64) || defined(_M_X64) || defined(_M_IA64)
#pragma intrinsic(__umulh)
#define mul_64x64_high(a, b) __umulh(a, b)
#endif

#if defined(_M_IX86)
#pragma intrinsic(__emulu)
#define mul_32x32_64(a, b) __emulu(a, b)

#if _MSC_VER >= 1915 /* LY: workaround for SSA-optimizer bug */
#pragma intrinsic(_addcarry_u32)
#define add32carry_first(base, addend, sum) _addcarry_u32(0, base, addend, sum)
#define add32carry_next(carry, base, addend, sum)                              \
  _addcarry_u32(carry, base, addend, sum)
#define add32carry_last(carry, base, addend, sum)                              \
  (void)_addcarry_u32(carry, base, addend, sum)

static __forceinline char
msvc32_add64carry_first(uint64_t base, uint64_t addend, uint64_t *sum) {
  uint32_t *const sum32 = (uint32_t *)sum;
  const uint32_t base_32l = (uint32_t)base;
  const uint32_t base_32h = (uint32_t)(base >> 32);
  const uint32_t addend_32l = (uint32_t)addend;
  const uint32_t addend_32h = (uint32_t)(addend >> 32);
  return add32carry_next(add32carry_first(base_32l, addend_32l, sum32),
                         base_32h, addend_32h, sum32 + 1);
}
#define add64carry_first(base, addend, sum)                                    \
  msvc32_add64carry_first(base, addend, sum)

static __forceinline char msvc32_add64carry_next(char carry, uint64_t base,
                                                 uint64_t addend,
                                                 uint64_t *sum) {
  uint32_t *const sum32 = (uint32_t *)sum;
  const uint32_t base_32l = (uint32_t)base;
  const uint32_t base_32h = (uint32_t)(base >> 32);
  const uint32_t addend_32l = (uint32_t)addend;
  const uint32_t addend_32h = (uint32_t)(addend >> 32);
  return add32carry_next(add32carry_next(carry, base_32l, addend_32l, sum32),
                         base_32h, addend_32h, sum32 + 1);
}
#define add64carry_next(carry, base, addend, sum)                              \
  msvc32_add64carry_next(carry, base, addend, sum)

static __forceinline void msvc32_add64carry_last(char carry, uint64_t base,
                                                 uint64_t addend,
                                                 uint64_t *sum) {
  uint32_t *const sum32 = (uint32_t *)sum;
  const uint32_t base_32l = (uint32_t)base;
  const uint32_t base_32h = (uint32_t)(base >> 32);
  const uint32_t addend_32l = (uint32_t)addend;
  const uint32_t addend_32h = (uint32_t)(addend >> 32);
  add32carry_last(add32carry_next(carry, base_32l, addend_32l, sum32), base_32h,
                  addend_32h, sum32 + 1);
}
#define add64carry_last(carry, base, addend, sum)                              \
  msvc32_add64carry_last(carry, base, addend, sum)
#endif /* _MSC_FULL_VER >= 190024231 */

#elif defined(_M_ARM)
#define mul_32x32_64(a, b) _arm_umull(a, b)
#endif

#pragma warning(pop)
#pragma warning(disable : 4514) /* 'xyz': unreferenced inline function         \
                                   has been removed */
#pragma warning(disable : 4710) /* 'xyz': function not inlined */
#pragma warning(disable : 4711) /* function 'xyz' selected for                 \
                                   automatic inline expansion */
#pragma warning(disable : 4127) /* conditional expression is constant */
#pragma warning(disable : 4702) /* unreachable code */
#endif                          /* Compiler */

#ifndef likely
#define likely(cond) (cond)
#endif
#ifndef unlikely
#define unlikely(cond) (cond)
#endif
#ifndef __maybe_unused
#define __maybe_unused
#endif
#ifndef __always_inline
#define __always_inline __inline
#endif
#ifndef unreachable
#define unreachable()                                                          \
  do {                                                                         \
  } while (1)
#endif

#ifndef bswap64
#if defined(bswap_64)
#define bswap64 bswap_64
#elif defined(__bswap_64)
#define bswap64 __bswap_64
#else
static __always_inline uint64_t bswap64(uint64_t v) {
  return v << 56 | v >> 56 | ((v << 40) & UINT64_C(0x00ff000000000000)) |
         ((v << 24) & UINT64_C(0x0000ff0000000000)) |
         ((v << 8) & UINT64_C(0x000000ff00000000)) |
         ((v >> 8) & UINT64_C(0x00000000ff000000)) |
         ((v >> 24) & UINT64_C(0x0000000000ff0000)) |
         ((v >> 40) & UINT64_C(0x000000000000ff00));
}
#endif
#endif /* bswap64 */

#ifndef bswap32
#if defined(bswap_32)
#define bswap32 bswap_32
#elif defined(__bswap_32)
#define bswap32 __bswap_32
#else
static __always_inline uint32_t bswap32(uint32_t v) {
  return v << 24 | v >> 24 | ((v << 8) & UINT32_C(0x00ff0000)) |
         ((v >> 8) & UINT32_C(0x0000ff00));
}
#endif
#endif /* bswap32 */

#ifndef bswap16
#if defined(bswap_16)
#define bswap16 bswap_16
#elif defined(__bswap_16)
#define bswap16 __bswap_16
#else
static __always_inline uint16_t bswap16(uint16_t v) { return v << 8 | v >> 8; }
#endif
#endif /* bswap16 */

#if defined(__ia32__) ||                                                       \
    T1HA_SYS_UNALIGNED_ACCESS == T1HA_UNALIGNED_ACCESS__EFFICIENT
/* The __builtin_assume_aligned() leads gcc/clang to load values into the
 * registers, even when it is possible to directly use an operand from memory.
 * This can lead to a shortage of registers and a significant slowdown.
 * Therefore avoid unnecessary use of  __builtin_assume_aligned() for x86. */
#define read_unaligned(ptr, bits) (*(const uint##bits##_t *__restrict)(ptr))
#define read_aligned(ptr, bits) (*(const uint##bits##_t *__restrict)(ptr))
#endif /* __ia32__ */

#ifndef read_unaligned
#if defined(__GNUC__) || __has_attribute(__packed__)
typedef struct {
  uint8_t unaligned_8;
  uint16_t unaligned_16;
  uint32_t unaligned_32;
  uint64_t unaligned_64;
} __attribute__((__packed__)) t1ha_unaligned_proxy;
#define read_unaligned(ptr, bits)                                              \
  (((const t1ha_unaligned_proxy *)((const uint8_t *)(ptr)-offsetof(            \
        t1ha_unaligned_proxy, unaligned_##bits)))                              \
       ->unaligned_##bits)
#elif defined(_MSC_VER)
#pragma warning(                                                               \
    disable : 4235) /* nonstandard extension used: '__unaligned'               \
                     * keyword not supported on this architecture */
#define read_unaligned(ptr, bits) (*(const __unaligned uint##bits##_t *)(ptr))
#else
#pragma pack(push, 1)
typedef struct {
  uint8_t unaligned_8;
  uint16_t unaligned_16;
  uint32_t unaligned_32;
  uint64_t unaligned_64;
} t1ha_unaligned_proxy;
#pragma pack(pop)
#define read_unaligned(ptr, bits)                                              \
  (((const t1ha_unaligned_proxy *)((const uint8_t *)(ptr)-offsetof(            \
        t1ha_unaligned_proxy, unaligned_##bits)))                              \
       ->unaligned_##bits)
#endif
#endif /* read_unaligned */

#ifndef read_aligned
#if __GNUC_PREREQ(4, 8) || __has_builtin(__builtin_assume_aligned)
#define read_aligned(ptr, bits)                                                \
  (*(const uint##bits##_t *)__builtin_assume_aligned(ptr, ALIGNMENT_##bits))
#elif (__GNUC_PREREQ(3, 3) || __has_attribute(__aligned__)) &&                 \
    !defined(__clang__)
#define read_aligned(ptr, bits)                                                \
  (*(const uint##bits##_t                                                      \
     __attribute__((__aligned__(ALIGNMENT_##bits))) *)(ptr))
#elif __has_attribute(__assume_aligned__)

static __always_inline const
    uint16_t *__attribute__((__assume_aligned__(ALIGNMENT_16)))
    cast_aligned_16(const void *ptr) {
  return (const uint16_t *)ptr;
}
static __always_inline const
    uint32_t *__attribute__((__assume_aligned__(ALIGNMENT_32)))
    cast_aligned_32(const void *ptr) {
  return (const uint32_t *)ptr;
}
static __always_inline const
    uint64_t *__attribute__((__assume_aligned__(ALIGNMENT_64)))
    cast_aligned_64(const void *ptr) {
  return (const uint64_t *)ptr;
}

#define read_aligned(ptr, bits) (*cast_aligned_##bits(ptr))

#elif defined(_MSC_VER)
#define read_aligned(ptr, bits)                                                \
  (*(const __declspec(align(ALIGNMENT_##bits)) uint##bits##_t *)(ptr))
#else
#define read_aligned(ptr, bits) (*(const uint##bits##_t *)(ptr))
#endif
#endif /* read_aligned */

#ifndef prefetch
#if (__GNUC_PREREQ(4, 0) || __has_builtin(__builtin_prefetch)) &&              \
    !defined(__ia32__)
#define prefetch(ptr) __builtin_prefetch(ptr)
#elif defined(_M_ARM64) || defined(_M_ARM)
#define prefetch(ptr) __prefetch(ptr)
#else
#define prefetch(ptr)                                                          \
  do {                                                                         \
    (void)(ptr);                                                               \
  } while (0)
#endif
#endif /* prefetch */

#if __has_warning("-Wconstant-logical-operand")
#if defined(__clang__)
#pragma clang diagnostic ignored "-Wconstant-logical-operand"
#elif defined(__GNUC__)
#pragma GCC diagnostic ignored "-Wconstant-logical-operand"
#else
#pragma warning disable "constant-logical-operand"
#endif
#endif /* -Wconstant-logical-operand */

#if __has_warning("-Wtautological-pointer-compare")
#if defined(__clang__)
#pragma clang diagnostic ignored "-Wtautological-pointer-compare"
#elif defined(__GNUC__)
#pragma GCC diagnostic ignored "-Wtautological-pointer-compare"
#else
#pragma warning disable "tautological-pointer-compare"
#endif
#endif /* -Wtautological-pointer-compare */

/***************************************************************************/

#if __GNUC_PREREQ(4, 0)
#pragma GCC visibility push(hidden)
#endif /* __GNUC_PREREQ(4,0) */

/*---------------------------------------------------------- Little Endian */

#ifndef fetch16_le_aligned
static __maybe_unused __always_inline uint16_t
fetch16_le_aligned(const void *v) {
  assert(((uintptr_t)v) % ALIGNMENT_16 == 0);
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  return read_aligned(v, 16);
#else
  return bswap16(read_aligned(v, 16));
#endif
}
#endif /* fetch16_le_aligned */

#ifndef fetch16_le_unaligned
static __maybe_unused __always_inline uint16_t
fetch16_le_unaligned(const void *v) {
#if T1HA_SYS_UNALIGNED_ACCESS == T1HA_UNALIGNED_ACCESS__UNABLE &&              \
    !defined(__ARM_FEATURE_UNALIGNED)
  const uint8_t *p = (const uint8_t *)v;
  return p[0] | (uint16_t)p[1] << 8;
#elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  return read_unaligned(v, 16);
#else
  return bswap16(read_unaligned(v, 16));
#endif
}
#endif /* fetch16_le_unaligned */

#ifndef fetch32_le_aligned
static __maybe_unused __always_inline uint32_t
fetch32_le_aligned(const void *v) {
  assert(((uintptr_t)v) % ALIGNMENT_32 == 0);
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  return read_aligned(v, 32);
#else
  return bswap32(read_aligned(v, 32));
#endif
}
#endif /* fetch32_le_aligned */

#ifndef fetch32_le_unaligned
static __maybe_unused __always_inline uint32_t
fetch32_le_unaligned(const void *v) {
#if T1HA_SYS_UNALIGNED_ACCESS == T1HA_UNALIGNED_ACCESS__UNABLE &&              \
    !defined(__ARM_FEATURE_UNALIGNED)
  return fetch16_le_unaligned(v) |
         (uint32_t)fetch16_le_unaligned((const uint8_t *)v + 2) << 16;
#elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  return read_unaligned(v, 32);
#else
  return bswap32(read_unaligned(v, 32));
#endif
}
#endif /* fetch32_le_unaligned */

#ifndef fetch64_le_aligned
static __maybe_unused __always_inline uint64_t
fetch64_le_aligned(const void *v) {
  assert(((uintptr_t)v) % ALIGNMENT_64 == 0);
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  return read_aligned(v, 64);
#else
  return bswap64(read_aligned(v, 64));
#endif
}
#endif /* fetch64_le_aligned */

#ifndef fetch64_le_unaligned
static __maybe_unused __always_inline uint64_t
fetch64_le_unaligned(const void *v) {
#if T1HA_SYS_UNALIGNED_ACCESS == T1HA_UNALIGNED_ACCESS__UNABLE
  return fetch32_le_unaligned(v) |
         (uint64_t)fetch32_le_unaligned((const uint8_t *)v + 4) << 32;
#elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  return read_unaligned(v, 64);
#else
  return bswap64(read_unaligned(v, 64));
#endif
}
#endif /* fetch64_le_unaligned */

static __maybe_unused __always_inline uint64_t tail64_le_aligned(const void *v,
                                                                 size_t tail) {
  const uint8_t *const p = (const uint8_t *)v;
#if T1HA_USE_FAST_ONESHOT_READ && !defined(__SANITIZE_ADDRESS__)
  /* We can perform a 'oneshot' read, which is little bit faster. */
  const unsigned shift = ((8 - tail) & 7) << 3;
  return fetch64_le_aligned(p) & ((~UINT64_C(0)) >> shift);
#else
  uint64_t r = 0;
  switch (tail & 7) {
  default:
    unreachable();
/* fall through */
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  /* For most CPUs this code is better when not needed byte reordering. */
  case 0:
    return fetch64_le_aligned(p);
  case 7:
    r = (uint64_t)p[6] << 8;
  /* fall through */
  case 6:
    r += p[5];
    r <<= 8;
  /* fall through */
  case 5:
    r += p[4];
    r <<= 32;
  /* fall through */
  case 4:
    return r + fetch32_le_aligned(p);
  case 3:
    r = (uint64_t)p[2] << 16;
  /* fall through */
  case 2:
    return r + fetch16_le_aligned(p);
  case 1:
    return p[0];
#else
  case 0:
    r = p[7] << 8;
  /* fall through */
  case 7:
    r += p[6];
    r <<= 8;
  /* fall through */
  case 6:
    r += p[5];
    r <<= 8;
  /* fall through */
  case 5:
    r += p[4];
    r <<= 8;
  /* fall through */
  case 4:
    r += p[3];
    r <<= 8;
  /* fall through */
  case 3:
    r += p[2];
    r <<= 8;
  /* fall through */
  case 2:
    r += p[1];
    r <<= 8;
  /* fall through */
  case 1:
    return r + p[0];
#endif
  }
#endif /* T1HA_USE_FAST_ONESHOT_READ */
}

#if T1HA_USE_FAST_ONESHOT_READ &&                                              \
    T1HA_SYS_UNALIGNED_ACCESS != T1HA_UNALIGNED_ACCESS__UNABLE &&              \
    defined(PAGESIZE) && PAGESIZE > 42 && !defined(__SANITIZE_ADDRESS__)
#define can_read_underside(ptr, size)                                          \
  (((PAGESIZE - (size)) & (uintptr_t)(ptr)) != 0)
#endif /* T1HA_USE_FAST_ONESHOT_READ */

static __maybe_unused __always_inline uint64_t
tail64_le_unaligned(const void *v, size_t tail) {
  const uint8_t *p = (const uint8_t *)v;
#if defined(can_read_underside) &&                                             \
    (UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul)
  /* On some systems (e.g. x86_64) we can perform a 'oneshot' read, which
   * is little bit faster. Thanks Marcin Żukowski <marcin.zukowski@gmail.com>
   * for the reminder. */
  const unsigned offset = (8 - tail) & 7;
  const unsigned shift = offset << 3;
  if (likely(can_read_underside(p, 8))) {
    p -= offset;
    return fetch64_le_unaligned(p) >> shift;
  }
  return fetch64_le_unaligned(p) & ((~UINT64_C(0)) >> shift);
#else
  uint64_t r = 0;
  switch (tail & 7) {
  default:
    unreachable();
/* fall through */
#if T1HA_SYS_UNALIGNED_ACCESS == T1HA_UNALIGNED_ACCESS__EFFICIENT &&           \
    __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  /* For most CPUs this code is better when not needed
   * copying for alignment or byte reordering. */
  case 0:
    return fetch64_le_unaligned(p);
  case 7:
    r = (uint64_t)p[6] << 8;
  /* fall through */
  case 6:
    r += p[5];
    r <<= 8;
  /* fall through */
  case 5:
    r += p[4];
    r <<= 32;
  /* fall through */
  case 4:
    return r + fetch32_le_unaligned(p);
  case 3:
    r = (uint64_t)p[2] << 16;
  /* fall through */
  case 2:
    return r + fetch16_le_unaligned(p);
  case 1:
    return p[0];
#else
  /* For most CPUs this code is better than a
   * copying for alignment and/or byte reordering. */
  case 0:
    r = p[7] << 8;
  /* fall through */
  case 7:
    r += p[6];
    r <<= 8;
  /* fall through */
  case 6:
    r += p[5];
    r <<= 8;
  /* fall through */
  case 5:
    r += p[4];
    r <<= 8;
  /* fall through */
  case 4:
    r += p[3];
    r <<= 8;
  /* fall through */
  case 3:
    r += p[2];
    r <<= 8;
  /* fall through */
  case 2:
    r += p[1];
    r <<= 8;
  /* fall through */
  case 1:
    return r + p[0];
#endif
  }
#endif /* can_read_underside */
}

/*------------------------------------------------------------- Big Endian */

#ifndef fetch16_be_aligned
static __maybe_unused __always_inline uint16_t
fetch16_be_aligned(const void *v) {
  assert(((uintptr_t)v) % ALIGNMENT_16 == 0);
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  return read_aligned(v, 16);
#else
  return bswap16(read_aligned(v, 16));
#endif
}
#endif /* fetch16_be_aligned */

#ifndef fetch16_be_unaligned
static __maybe_unused __always_inline uint16_t
fetch16_be_unaligned(const void *v) {
#if T1HA_SYS_UNALIGNED_ACCESS == T1HA_UNALIGNED_ACCESS__UNABLE
  const uint8_t *p = (const uint8_t *)v;
  return (uint16_t)p[0] << 8 | p[1];
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  return read_unaligned(v, 16);
#else
  return bswap16(read_unaligned(v, 16));
#endif
}
#endif /* fetch16_be_unaligned */

#ifndef fetch32_be_aligned
static __maybe_unused __always_inline uint32_t
fetch32_be_aligned(const void *v) {
  assert(((uintptr_t)v) % ALIGNMENT_32 == 0);
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  return read_aligned(v, 32);
#else
  return bswap32(read_aligned(v, 32));
#endif
}
#endif /* fetch32_be_aligned */

#ifndef fetch32_be_unaligned
static __maybe_unused __always_inline uint32_t
fetch32_be_unaligned(const void *v) {
#if T1HA_SYS_UNALIGNED_ACCESS == T1HA_UNALIGNED_ACCESS__UNABLE
  return (uint32_t)fetch16_be_unaligned(v) << 16 |
         fetch16_be_unaligned((const uint8_t *)v + 2);
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  return read_unaligned(v, 32);
#else
  return bswap32(read_unaligned(v, 32));
#endif
}
#endif /* fetch32_be_unaligned */

#ifndef fetch64_be_aligned
static __maybe_unused __always_inline uint64_t
fetch64_be_aligned(const void *v) {
  assert(((uintptr_t)v) % ALIGNMENT_64 == 0);
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  return read_aligned(v, 64);
#else
  return bswap64(read_aligned(v, 64));
#endif
}
#endif /* fetch64_be_aligned */

#ifndef fetch64_be_unaligned
static __maybe_unused __always_inline uint64_t
fetch64_be_unaligned(const void *v) {
#if T1HA_SYS_UNALIGNED_ACCESS == T1HA_UNALIGNED_ACCESS__UNABLE
  return (uint64_t)fetch32_be_unaligned(v) << 32 |
         fetch32_be_unaligned((const uint8_t *)v + 4);
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  return read_unaligned(v, 64);
#else
  return bswap64(read_unaligned(v, 64));
#endif
}
#endif /* fetch64_be_unaligned */

static __maybe_unused __always_inline uint64_t tail64_be_aligned(const void *v,
                                                                 size_t tail) {
  const uint8_t *const p = (const uint8_t *)v;
#if T1HA_USE_FAST_ONESHOT_READ && !defined(__SANITIZE_ADDRESS__)
  /* We can perform a 'oneshot' read, which is little bit faster. */
  const unsigned shift = ((8 - tail) & 7) << 3;
  return fetch64_be_aligned(p) >> shift;
#else
  switch (tail & 7) {
  default:
    unreachable();
/* fall through */
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  /* For most CPUs this code is better when not byte reordering. */
  case 1:
    return p[0];
  case 2:
    return fetch16_be_aligned(p);
  case 3:
    return (uint32_t)fetch16_be_aligned(p) << 8 | p[2];
  case 4:
    return fetch32_be_aligned(p);
  case 5:
    return (uint64_t)fetch32_be_aligned(p) << 8 | p[4];
  case 6:
    return (uint64_t)fetch32_be_aligned(p) << 16 | fetch16_be_aligned(p + 4);
  case 7:
    return (uint64_t)fetch32_be_aligned(p) << 24 |
           (uint32_t)fetch16_be_aligned(p + 4) << 8 | p[6];
  case 0:
    return fetch64_be_aligned(p);
#else
  case 1:
    return p[0];
  case 2:
    return p[1] | (uint32_t)p[0] << 8;
  case 3:
    return p[2] | (uint32_t)p[1] << 8 | (uint32_t)p[0] << 16;
  case 4:
    return p[3] | (uint32_t)p[2] << 8 | (uint32_t)p[1] << 16 |
           (uint32_t)p[0] << 24;
  case 5:
    return p[4] | (uint32_t)p[3] << 8 | (uint32_t)p[2] << 16 |
           (uint32_t)p[1] << 24 | (uint64_t)p[0] << 32;
  case 6:
    return p[5] | (uint32_t)p[4] << 8 | (uint32_t)p[3] << 16 |
           (uint32_t)p[2] << 24 | (uint64_t)p[1] << 32 | (uint64_t)p[0] << 40;
  case 7:
    return p[6] | (uint32_t)p[5] << 8 | (uint32_t)p[4] << 16 |
           (uint32_t)p[3] << 24 | (uint64_t)p[2] << 32 | (uint64_t)p[1] << 40 |
           (uint64_t)p[0] << 48;
  case 0:
    return p[7] | (uint32_t)p[6] << 8 | (uint32_t)p[5] << 16 |
           (uint32_t)p[4] << 24 | (uint64_t)p[3] << 32 | (uint64_t)p[2] << 40 |
           (uint64_t)p[1] << 48 | (uint64_t)p[0] << 56;
#endif
  }
#endif /* T1HA_USE_FAST_ONESHOT_READ */
}

static __maybe_unused __always_inline uint64_t
tail64_be_unaligned(const void *v, size_t tail) {
  const uint8_t *p = (const uint8_t *)v;
#if defined(can_read_underside) &&                                             \
    (UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul)
  /* On some systems (e.g. x86_64) we can perform a 'oneshot' read, which
   * is little bit faster. Thanks Marcin Żukowski <marcin.zukowski@gmail.com>
   * for the reminder. */
  const unsigned offset = (8 - tail) & 7;
  const unsigned shift = offset << 3;
  if (likely(can_read_underside(p, 8))) {
    p -= offset;
    return fetch64_be_unaligned(p) & ((~UINT64_C(0)) >> shift);
  }
  return fetch64_be_unaligned(p) >> shift;
#else
  switch (tail & 7) {
  default:
    unreachable();
/* fall through */
#if T1HA_SYS_UNALIGNED_ACCESS == T1HA_UNALIGNED_ACCESS__EFFICIENT &&           \
    __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  /* For most CPUs this code is better when not needed
   * copying for alignment or byte reordering. */
  case 1:
    return p[0];
  case 2:
    return fetch16_be_unaligned(p);
  case 3:
    return (uint32_t)fetch16_be_unaligned(p) << 8 | p[2];
  case 4:
    return fetch32_be(p);
  case 5:
    return (uint64_t)fetch32_be_unaligned(p) << 8 | p[4];
  case 6:
    return (uint64_t)fetch32_be_unaligned(p) << 16 |
           fetch16_be_unaligned(p + 4);
  case 7:
    return (uint64_t)fetch32_be_unaligned(p) << 24 |
           (uint32_t)fetch16_be_unaligned(p + 4) << 8 | p[6];
  case 0:
    return fetch64_be_unaligned(p);
#else
  /* For most CPUs this code is better than a
   * copying for alignment and/or byte reordering. */
  case 1:
    return p[0];
  case 2:
    return p[1] | (uint32_t)p[0] << 8;
  case 3:
    return p[2] | (uint32_t)p[1] << 8 | (uint32_t)p[0] << 16;
  case 4:
    return p[3] | (uint32_t)p[2] << 8 | (uint32_t)p[1] << 16 |
           (uint32_t)p[0] << 24;
  case 5:
    return p[4] | (uint32_t)p[3] << 8 | (uint32_t)p[2] << 16 |
           (uint32_t)p[1] << 24 | (uint64_t)p[0] << 32;
  case 6:
    return p[5] | (uint32_t)p[4] << 8 | (uint32_t)p[3] << 16 |
           (uint32_t)p[2] << 24 | (uint64_t)p[1] << 32 | (uint64_t)p[0] << 40;
  case 7:
    return p[6] | (uint32_t)p[5] << 8 | (uint32_t)p[4] << 16 |
           (uint32_t)p[3] << 24 | (uint64_t)p[2] << 32 | (uint64_t)p[1] << 40 |
           (uint64_t)p[0] << 48;
  case 0:
    return p[7] | (uint32_t)p[6] << 8 | (uint32_t)p[5] << 16 |
           (uint32_t)p[4] << 24 | (uint64_t)p[3] << 32 | (uint64_t)p[2] << 40 |
           (uint64_t)p[1] << 48 | (uint64_t)p[0] << 56;
#endif
  }
#endif /* can_read_underside */
}

/***************************************************************************/

#ifndef rot64
static __maybe_unused __always_inline uint64_t rot64(uint64_t v, unsigned s) {
  return (v >> s) | (v << (64 - s));
}
#endif /* rot64 */

#ifndef mul_32x32_64
static __maybe_unused __always_inline uint64_t mul_32x32_64(uint32_t a,
                                                            uint32_t b) {
  return a * (uint64_t)b;
}
#endif /* mul_32x32_64 */

#ifndef add64carry_first
static __maybe_unused __always_inline unsigned
add64carry_first(uint64_t base, uint64_t addend, uint64_t *sum) {
#if __has_builtin(__builtin_addcll)
  unsigned long long carryout;
  *sum = __builtin_addcll(base, addend, 0, &carryout);
  return (unsigned)carryout;
#else
  *sum = base + addend;
  return *sum < addend;
#endif /* __has_builtin(__builtin_addcll) */
}
#endif /* add64carry_fist */

#ifndef add64carry_next
static __maybe_unused __always_inline unsigned
add64carry_next(unsigned carry, uint64_t base, uint64_t addend, uint64_t *sum) {
#if __has_builtin(__builtin_addcll)
  unsigned long long carryout;
  *sum = __builtin_addcll(base, addend, carry, &carryout);
  return (unsigned)carryout;
#else
  *sum = base + addend + carry;
  return *sum < addend || (carry && *sum == addend);
#endif /* __has_builtin(__builtin_addcll) */
}
#endif /* add64carry_next */

#ifndef add64carry_last
static __maybe_unused __always_inline void
add64carry_last(unsigned carry, uint64_t base, uint64_t addend, uint64_t *sum) {
#if __has_builtin(__builtin_addcll)
  unsigned long long carryout;
  *sum = __builtin_addcll(base, addend, carry, &carryout);
  (void)carryout;
#else
  *sum = base + addend + carry;
#endif /* __has_builtin(__builtin_addcll) */
}
#endif /* add64carry_last */

#ifndef mul_64x64_128
static __maybe_unused __always_inline uint64_t mul_64x64_128(uint64_t a,
                                                             uint64_t b,
                                                             uint64_t *h) {
#if (defined(__SIZEOF_INT128__) ||                                             \
     (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)) &&            \
    (!defined(__LCC__) || __LCC__ != 124)
  __uint128_t r = (__uint128_t)a * (__uint128_t)b;
  /* modern GCC could nicely optimize this */
  *h = (uint64_t)(r >> 64);
  return (uint64_t)r;
#elif defined(mul_64x64_high)
  *h = mul_64x64_high(a, b);
  return a * b;
#else
  /* performs 64x64 to 128 bit multiplication */
  const uint64_t ll = mul_32x32_64((uint32_t)a, (uint32_t)b);
  const uint64_t lh = mul_32x32_64(a >> 32, (uint32_t)b);
  const uint64_t hl = mul_32x32_64((uint32_t)a, b >> 32);
  const uint64_t hh = mul_32x32_64(a >> 32, b >> 32);

  /* Few simplification are possible here for 32-bit architectures,
   * but thus we would lost compatibility with the original 64-bit
   * version.  Think is very bad idea, because then 32-bit t1ha will
   * still (relatively) very slowly and well yet not compatible. */
  uint64_t l;
  add64carry_last(add64carry_first(ll, lh << 32, &l), hh, lh >> 32, h);
  add64carry_last(add64carry_first(l, hl << 32, &l), *h, hl >> 32, h);
  return l;
#endif
}
#endif /* mul_64x64_128() */

#ifndef mul_64x64_high
static __maybe_unused __always_inline uint64_t mul_64x64_high(uint64_t a,
                                                              uint64_t b) {
  uint64_t h;
  mul_64x64_128(a, b, &h);
  return h;
}
#endif /* mul_64x64_high */

/***************************************************************************/

/* 'magic' primes */
static const uint64_t prime_0 = UINT64_C(0xEC99BF0D8372CAAB);
static const uint64_t prime_1 = UINT64_C(0x82434FE90EDCEF39);
static const uint64_t prime_2 = UINT64_C(0xD4F06DB99D67BE4B);
static const uint64_t prime_3 = UINT64_C(0xBD9CACC22C6E9571);
static const uint64_t prime_4 = UINT64_C(0x9C06FAF4D023E3AB);
static const uint64_t prime_5 = UINT64_C(0xC060724A8424F345);
static const uint64_t prime_6 = UINT64_C(0xCB5AF53AE3AAAC31);

/* xor high and low parts of full 128-bit product */
static __maybe_unused __always_inline uint64_t mux64(uint64_t v,
                                                     uint64_t prime) {
  uint64_t l, h;
  l = mul_64x64_128(v, prime, &h);
  return l ^ h;
}

static __maybe_unused __always_inline uint64_t final64(uint64_t a, uint64_t b) {
  uint64_t x = (a + rot64(b, 41)) * prime_0;
  uint64_t y = (rot64(a, 23) + b) * prime_6;
  return mux64(x ^ y, prime_5);
}

static __maybe_unused __always_inline void mixup64(uint64_t *__restrict a,
                                                   uint64_t *__restrict b,
                                                   uint64_t v, uint64_t prime) {
  uint64_t h;
  *a ^= mul_64x64_128(*b + v, prime, &h);
  *b += h;
}

/***************************************************************************/

typedef union t1ha_uint128 {
#if defined(__SIZEOF_INT128__) ||                                              \
    (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
  __uint128_t v;
#endif
  struct {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
    uint64_t l, h;
#else
    uint64_t h, l;
#endif
  };
} t1ha_uint128_t;

static __maybe_unused __always_inline t1ha_uint128_t
not128(const t1ha_uint128_t v) {
  t1ha_uint128_t r;
#if defined(__SIZEOF_INT128__) ||                                              \
    (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
  r.v = ~v.v;
#else
  r.l = ~v.l;
  r.h = ~v.h;
#endif
  return r;
}

static __maybe_unused __always_inline t1ha_uint128_t
left128(const t1ha_uint128_t v, unsigned s) {
  t1ha_uint128_t r;
  assert(s < 128);
#if defined(__SIZEOF_INT128__) ||                                              \
    (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
  r.v = v.v << s;
#else
  r.l = (s < 64) ? v.l << s : 0;
  r.h = (s < 64) ? (v.h << s) | (s ? v.l >> (64 - s) : 0) : v.l << (s - 64);
#endif
  return r;
}

static __maybe_unused __always_inline t1ha_uint128_t
right128(const t1ha_uint128_t v, unsigned s) {
  t1ha_uint128_t r;
  assert(s < 128);
#if defined(__SIZEOF_INT128__) ||                                              \
    (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
  r.v = v.v >> s;
#else
  r.l = (s < 64) ? (s ? v.h << (64 - s) : 0) | (v.l >> s) : v.h >> (s - 64);
  r.h = (s < 64) ? v.h >> s : 0;
#endif
  return r;
}

static __maybe_unused __always_inline t1ha_uint128_t or128(t1ha_uint128_t x,
                                                           t1ha_uint128_t y) {
  t1ha_uint128_t r;
#if defined(__SIZEOF_INT128__) ||                                              \
    (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
  r.v = x.v | y.v;
#else
  r.l = x.l | y.l;
  r.h = x.h | y.h;
#endif
  return r;
}

static __maybe_unused __always_inline t1ha_uint128_t xor128(t1ha_uint128_t x,
                                                            t1ha_uint128_t y) {
  t1ha_uint128_t r;
#if defined(__SIZEOF_INT128__) ||                                              \
    (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
  r.v = x.v ^ y.v;
#else
  r.l = x.l ^ y.l;
  r.h = x.h ^ y.h;
#endif
  return r;
}

static __maybe_unused __always_inline t1ha_uint128_t rot128(t1ha_uint128_t v,
                                                            unsigned s) {
  s &= 127;
#if defined(__SIZEOF_INT128__) ||                                              \
    (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
  v.v = (v.v << (128 - s)) | (v.v >> s);
  return v;
#else
  return s ? or128(left128(v, 128 - s), right128(v, s)) : v;
#endif
}

static __maybe_unused __always_inline t1ha_uint128_t add128(t1ha_uint128_t x,
                                                            t1ha_uint128_t y) {
  t1ha_uint128_t r;
#if defined(__SIZEOF_INT128__) ||                                              \
    (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
  r.v = x.v + y.v;
#else
  add64carry_last(add64carry_first(x.l, y.l, &r.l), x.h, y.h, &r.h);
#endif
  return r;
}

static __maybe_unused __always_inline t1ha_uint128_t mul128(t1ha_uint128_t x,
                                                            t1ha_uint128_t y) {
  t1ha_uint128_t r;
#if defined(__SIZEOF_INT128__) ||                                              \
    (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
  r.v = x.v * y.v;
#else
  r.l = mul_64x64_128(x.l, y.l, &r.h);
  r.h += x.l * y.h + y.l * x.h;
#endif
  return r;
}

/***************************************************************************/

#if T1HA0_AESNI_AVAILABLE && defined(__ia32__)
uint64_t t1ha_ia32cpu_features(void);

static __maybe_unused __always_inline bool
t1ha_ia32_AESNI_avail(uint64_t ia32cpu_features) {
  /* check for AES-NI */
  return (ia32cpu_features & UINT32_C(0x02000000)) != 0;
}

static __maybe_unused __always_inline bool
t1ha_ia32_AVX_avail(uint64_t ia32cpu_features) {
  /* check for any AVX */
  return (ia32cpu_features & UINT32_C(0x1A000000)) == UINT32_C(0x1A000000);
}

static __maybe_unused __always_inline bool
t1ha_ia32_AVX2_avail(uint64_t ia32cpu_features) {
  /* check for 'Advanced Vector Extensions 2' */
  return ((ia32cpu_features >> 32) & 32) != 0;
}

#endif /* T1HA0_AESNI_AVAILABLE && __ia32__ */


================================================
FILE: benchmarks/t1ha/src/t1ha_selfcheck.c
================================================
/*
 *  Copyright (c) 2016-2020 Positive Technologies, https://www.ptsecurity.com,
 *  Fast Positive Hash.
 *
 *  Portions Copyright (c) 2010-2020 Leonid Yuriev <leo@yuriev.ru>,
 *  The 1Hippeus project (t1h).
 *
 *  This software is provided 'as-is', without any express or implied
 *  warranty. In no event will the authors be held liable for any damages
 *  arising from the use of this software.
 *
 *  Permission is granted to anyone to use this software for any purpose,
 *  including commercial applications, and to alter it and redistribute it
 *  freely, subject to the following restrictions:
 *
 *  1. The origin of this software must not be misrepresented; you must not
 *     claim that you wrote the original software. If you use this software
 *     in a product, an acknowledgement in the product documentation would be
 *     appreciated but is not required.
 *  2. Altered source versions must be plainly marked as such, and must not be
 *     misrepresented as being the original software.
 *  3. This notice may not be removed or altered from any source distribution.
 */

/*
 * t1ha = { Fast Positive Hash, aka "Позитивный Хэш" }
 * by [Positive Technologies](https://www.ptsecurity.ru)
 *
 * Briefly, it is a 64-bit Hash Function:
 *  1. Created for 64-bit little-endian platforms, in predominantly for x86_64,
 *     but portable and without penalties it can run on any 64-bit CPU.
 *  2. In most cases up to 15% faster than City64, xxHash, mum-hash, metro-hash
 *     and all others portable hash-functions (which do not use specific
 *     hardware tricks).
 *  3. Not suitable for cryptography.
 *
 * The Future will (be) Positive. Всё будет хорошо.
 *
 * ACKNOWLEDGEMENT:
 * The t1ha was originally developed by Leonid Yuriev (Леонид Юрьев)
 * for The 1Hippeus project - zerocopy messaging in the spirit of Sparta!
 */

#include "t1ha_selfcheck.h"
#include "t1ha_bits.h"

const uint8_t t1ha_test_pattern[64] = {
    0,    1,    2,    3,    4,    5,    6,    7,    0xFF, 0x7F, 0x3F,
    0x1F, 0xF,  8,    16,   32,   64,   0x80, 0xFE, 0xFC, 0xF8, 0xF0,
    0xE0, 0xC0, 0xFD, 0xFB, 0xF7, 0xEF, 0xDF, 0xBF, 0x55, 0xAA, 11,
    17,   19,   23,   29,   37,   42,   43,   'a',  'b',  'c',  'd',
    'e',  'f',  'g',  'h',  'i',  'j',  'k',  'l',  'm',  'n',  'o',
    'p',  'q',  'r',  's',  't',  'u',  'v',  'w',  'x'};

static __inline bool probe(uint64_t (*hash)(const void *, size_t, uint64_t),
                           const uint64_t reference, const void *data,
                           unsigned len, uint64_t seed) {
  const uint64_t actual = hash(data, len, seed);
  assert(actual == reference);
  return actual != reference;
}

__cold int t1ha_selfcheck(uint64_t (*hash)(const void *, size_t, uint64_t),
                          const uint64_t *reference_values) {
  bool failed = false;

  const uint64_t zero = 0;
  failed |= probe(hash, /* empty-zero */ *reference_values++, NULL, 0, zero);
  failed |= probe(hash, /* empty-all1 */ *reference_values++, NULL, 0, ~zero);
  failed |= probe(hash, /* bin64-zero */ *reference_values++, t1ha_test_pattern,
                  64, zero);

  uint64_t seed = 1;
  for (int i = 1; i < 64; i++) {
    /* bin%i-1p%i */
    failed |= probe(hash, *reference_values++, t1ha_test_pattern, i, seed);
    seed <<= 1;
  }

  seed = ~zero;
  for (int i = 1; i <= 7; i++) {
    seed <<= 1;
    /* align%i_F%i */;
    failed |=
        probe(hash, *reference_values++, t1ha_test_pattern + i, 64 - i, seed);
  }

  uint8_t pattern_long[512];
  for (size_t i = 0; i < sizeof(pattern_long); ++i)
    pattern_long[i] = (uint8_t)i;
  for (int i = 0; i <= 7; i++) {
    /* long-%05i */
    failed |=
        probe(hash, *reference_values++, pattern_long + i, 128 + i * 17, seed);
  }

  return failed ? -1 : 0;
}


================================================
FILE: benchmarks/t1ha/src/t1ha_selfcheck.h
================================================
/*
 *  Copyright (c) 2016-2020 Positive Technologies, https://www.ptsecurity.com,
 *  Fast Positive Hash.
 *
 *  Portions Copyright (c) 2010-2020 Leonid Yuriev <leo@yuriev.ru>,
 *  The 1Hippeus project (t1h).
 *
 *  This software is provided 'as-is', without any express or implied
 *  warranty. In no event will the authors be held liable for any damages
 *  arising from the use of this software.
 *
 *  Permission is granted to anyone to use this software for any purpose,
 *  including commercial applications, and to alter it and redistribute it
 *  freely, subject to the following restrictions:
 *
 *  1. The origin of this software must not be misrepresented; you must not
 *     claim that you wrote the original software. If you use this software
 *     in a product, an acknowledgement in the product documentation would be
 *     appreciated but is not required.
 *  2. Altered source versions must be plainly marked as such, and must not be
 *     misrepresented as being the original software.
 *  3. This notice may not be removed or altered from any source distribution.
 */

/*
 * t1ha = { Fast Positive Hash, aka "Позитивный Хэш" }
 * by [Positive Technologies](https://www.ptsecurity.ru)
 *
 * Briefly, it is a 64-bit Hash Function:
 *  1. Created for 64-bit little-endian platforms, in predominantly for x86_64,
 *     but portable and without penalties it can run on any 64-bit CPU.
 *  2. In most cases up to 15% faster than City64, xxHash, mum-hash, metro-hash
 *     and all others portable hash-functions (which do not use specific
 *     hardware tricks).
 *  3. Not suitable for cryptography.
 *
 * The Future will (be) Positive. Всё будет хорошо.
 *
 * ACKNOWLEDGEMENT:
 * The t1ha was originally developed by Leonid Yuriev (Леонид Юрьев)
 * for The 1Hippeus project - zerocopy messaging in the spirit of Sparta!
 */

#pragma once
#if defined(_MSC_VER) && _MSC_VER > 1800
#pragma warning(disable : 4464) /* relative include path contains '..' */
#endif                          /* MSVC */
#include "../t1ha.h"

/***************************************************************************/
/* Self-checking */

extern const uint8_t t1ha_test_pattern[64];
int t1ha_selfcheck(uint64_t (*hash)(const void *, size_t, uint64_t),
                   const uint64_t *reference_values);

#ifndef T1HA2_DISABLED
extern const uint64_t t1ha_refval_2atonce[81];
extern const uint64_t t1ha_refval_2atonce128[81];
extern const uint64_t t1ha_refval_2stream[81];
extern const uint64_t t1ha_refval_2stream128[81];
#endif /* T1HA2_DISABLED */

#ifndef T1HA1_DISABLED
extern const uint64_t t1ha_refval_64le[81];
extern const uint64_t t1ha_refval_64be[81];
#endif /* T1HA1_DISABLED */

#ifndef T1HA0_DISABLED
extern const uint64_t t1ha_refval_32le[81];
extern const uint64_t t1ha_refval_32be[81];
#if T1HA0_AESNI_AVAILABLE
extern const uint64_t t1ha_refval_ia32aes_a[81];
extern const uint64_t t1ha_refval_ia32aes_b[81];
#endif /* T1HA0_AESNI_AVAILABLE */
#endif /* T1HA0_DISABLED */


================================================
FILE: benchmarks/t1ha/src/t1ha_selfcheck_all.c
================================================
/*
 *  Copyright (c) 2016-2020 Positive Technologies, https://www.ptsecurity.com,
 *  Fast Positive Hash.
 *
 *  Portions Copyright (c) 2010-2020 Leonid Yuriev <leo@yuriev.ru>,
 *  The 1Hippeus project (t1h).
 *
 *  This software is provided 'as-is', without any express or implied
 *  warranty. In no event will the authors be held liable for any damages
 *  arising from the use of this software.
 *
 *  Permission is granted to anyone to use this software for any purpose,
 *  including commercial applications, and to alter it and redistribute it
 *  freely, subject to the following restrictions:
 *
 *  1. The origin of this software must not be misrepresented; you must not
 *     claim that you wrote the original software. If you use this software
 *     in a product, an acknowledgement in the product documentation would be
 *     appreciated but is not required.
 *  2. Altered source versions must be plainly marked as such, and must not be
 *     misrepresented as being the original software.
 *  3. This notice may not be removed or altered from any source distribution.
 */

/*
 * t1ha = { Fast Positive Hash, aka "Позитивный Хэш" }
 * by [Positive Technologies](https://www.ptsecurity.ru)
 *
 * Briefly, it is a 64-bit Hash Function:
 *  1. Created for 64-bit little-endian platforms, in predominantly for x86_64,
 *     but portable and without penalties it can run on any 64-bit CPU.
 *  2. In most cases up to 15% faster than City64, xxHash, mum-hash, metro-hash
 *     and all others portable hash-functions (which do not use specific
 *     hardware tricks).
 *  3. Not suitable for cryptography.
 *
 * The Future will (be) Positive. Всё будет хорошо.
 *
 * ACKNOWLEDGEMENT:
 * The t1ha was originally developed by Leonid Yuriev (Леонид Юрьев)
 * for The 1Hippeus project - zerocopy messaging in the spirit of Sparta!
 */

#include "t1ha_bits.h"
#include "t1ha_selfcheck.h"

__cold int t1ha_selfcheck__all_enabled(void) {
  int rc = 0;

#ifndef T1HA2_DISABLED
  rc |= t1ha_selfcheck__t1ha2();
#endif /* T1HA2_DISABLED */

#ifndef T1HA1_DISABLED
  rc |= t1ha_selfcheck__t1ha1();
#endif /* T1HA1_DISABLED */

#ifndef T1HA0_DISABLED
  rc |= t1ha_selfcheck__t1ha0();
#endif /* T1HA0_DISABLED */

  return rc;
}


================================================
FILE: benchmarks/t1ha/t1ha.h
================================================
/*
 *  Copyright (c) 2016-2020 Positive Technologies, https://www.ptsecurity.com,
 *  Fast Positive Hash.
 *
 *  Portions Copyright (c) 2010-2020 Leonid Yuriev <leo@yuriev.ru>,
 *  The 1Hippeus project (t1h).
 *
 *  This software is provided 'as-is', without any express or implied
 *  warranty. In no event will the authors be held liable for any damages
 *  arising from the use of this software.
 *
 *  Permission is granted to anyone to use this software for any purpose,
 *  including commercial applications, and to alter it and redistribute it
 *  freely, subject to the following restrictions:
 *
 *  1. The origin of this software must not be misrepresented; you must not
 *     claim that you wrote the original software. If you use this software
 *     in a product, an acknowledgement in the product documentation would be
 *     appreciated but is not required.
 *  2. Altered source versions must be plainly marked as such, and must not be
 *     misrepresented as being the original software.
 *  3. This notice may not be removed or altered from any source distribution.
 */

/*
 * t1ha = { Fast Positive Hash, aka "Позитивный Хэш" }
 * by [Positive Technologies](https://www.ptsecurity.ru)
 *
 * Briefly, it is a 64-bit Hash Function:
 *  1. Created for 64-bit little-endian platforms, in predominantly for x86_64,
 *     but portable and without penalties it can run on any 64-bit CPU.
 *  2. In most cases up to 15% faster than City64, xxHash, mum-hash, metro-hash
 *     and all others portable hash-functions (which do not use specific
 *     hardware tricks).
 *  3. Not suitable for cryptography.
 *
 * The Future will (be) Positive. Всё будет хорошо.
 *
 * ACKNOWLEDGEMENT:
 * The t1ha was originally developed by Leonid Yuriev (Леонид Юрьев)
 * for The 1Hippeus project - zerocopy messaging in the spirit of Sparta!
 */

#pragma once

/*****************************************************************************
 *
 * PLEASE PAY ATTENTION TO THE FOLLOWING NOTES
 * about macros definitions which controls t1ha behaviour and/or performance.
 *
 *
 * 1) T1HA_SYS_UNALIGNED_ACCESS = Defines the system/platform/CPU/architecture
 *                                abilities for unaligned data access.
 *
 *    By default, when the T1HA_SYS_UNALIGNED_ACCESS not defined,
 *    it will defined on the basis hardcoded knowledge about of capabilities
 *    of most common CPU architectures. But you could override this
 *    default behavior when build t1ha library itself:
 *
 *      // To disable unaligned access at all.
 *      #define T1HA_SYS_UNALIGNED_ACCESS 0
 *
 *      // To enable unaligned access, but indicate that it significantly slow.
 *      #define T1HA_SYS_UNALIGNED_ACCESS 1
 *
 *      // To enable unaligned access, and indicate that it effecient.
 *      #define T1HA_SYS_UNALIGNED_ACCESS 2
 *
 *
 * 2) T1HA_USE_FAST_ONESHOT_READ = Controls the data reads at the end of buffer.
 *
 *    When defined to non-zero, t1ha will use 'one shot' method for reading
 *    up to 8 bytes at the end of data. In this case just the one 64-bit read
 *    will be performed even when the available less than 8 bytes.
 *
 *    This is little bit faster that switching by length of data tail.
 *    Unfortunately this will triggering a false-positive alarms from Valgrind,
 *    AddressSanitizer and other similar tool.
 *
 *    By default, t1ha defines it to 1, but you could override this
 *    default behavior when build t1ha library itself:
 *
 *      // For little bit faster and small code.
 *      #define T1HA_USE_FAST_ONESHOT_READ 1
 *
 *      // For calmness if doubt.
 *      #define T1HA_USE_FAST_ONESHOT_READ 0
 *
 *
 * 3) T1HA0_RUNTIME_SELECT = Controls choice fastest function in runtime.
 *
 *    t1ha library offers the t1ha0() function as the fastest for current CPU.
 *    But actual CPU's features/capabilities and may be significantly different,
 *    especially on x86 platform. Therefore, internally, t1ha0() may require
 *    dynamic dispatching for choice best implementation.
 *
 *    By default, t1ha enables such runtime choice and (may be) corresponding
 *    indirect calls if it reasonable, but you could override this default
 *    behavior when build t1ha library itself:
 *
 *      // To enable runtime choice of fastest implementation.
 *      #define T1HA0_RUNTIME_SELECT 1
 *
 *      // To disable runtime choice of fastest implementation.
 *      #define T1HA0_RUNTIME_SELECT 0
 *
 *    When T1HA0_RUNTIME_SELECT is nonzero the t1ha0_resolve() function could
 *    be used to get actual t1ha0() implementation address at runtime. This is
 *    useful for two cases:
 *      - calling by local pointer-to-function usually is little
 *        bit faster (less overhead) than via a PLT thru the DSO boundary.
 *      - GNU Indirect functions (see below) don't supported by environment
 *        and calling by t1ha0_funcptr is not available and/or expensive.
 *
 * 4) T1HA_USE_INDIRECT_FUNCTIONS = Controls usage of GNU Indirect functions.
 *
 *    In continue of T1HA0_RUNTIME_SELECT the T1HA_USE_INDIRECT_FUNCTIONS
 *    controls usage of ELF indirect functions feature. In general, when
 *    available, this reduces overhead of indirect function's calls though
 *    a DSO-bundary (https://sourceware.org/glibc/wiki/GNU_IFUNC).
 *
 *    By default, t1ha engage GNU Indirect functions when it available
 *    and useful, but you could override this default behavior when build
 *    t1ha library itself:
 *
 *      // To enable use of GNU ELF Indirect functions.
 *      #define T1HA_USE_INDIRECT_FUNCTIONS 1
 *
 *      // To disable use of GNU ELF Indirect functions. This may be useful
 *      // if the actual toolchain or the system's loader don't support ones.
 *      #define T1HA_USE_INDIRECT_FUNCTIONS 0
 *
 * 5) T1HA0_AESNI_AVAILABLE = Controls AES-NI detection and dispatching on x86.
 *
 *    In continue of T1HA0_RUNTIME_SELECT the T1HA0_AESNI_AVAILABLE controls
 *    detection and usage of AES-NI CPU's feature. On the other hand, this
 *    requires compiling parts of t1ha library with certain properly options,
 *    and could be difficult or inconvenient in some cases.
 *
 *    By default, t1ha engade AES-NI for t1ha0() on the x86 platform, but
 *    you could override this default behavior when build t1ha library itself:
 *
 *      // To disable detection and usage of AES-NI instructions for t1ha0().
 *      // This may be useful when you unable to build t1ha library properly
 *      // or known that AES-NI will be unavailable at the deploy.
 *      #define T1HA0_AESNI_AVAILABLE 0
 *
 *      // To force detection and usage of AES-NI instructions for t1ha0(),
 *      // but I don't known reasons to anybody would need this.
 *      #define T1HA0_AESNI_AVAILABLE 1
 *
 * 6) T1HA0_DISABLED, T1HA1_DISABLED, T1HA2_DISABLED = Controls availability of
 *    t1ha functions.
 *
 *    In some cases could be useful to import/use only few of t1ha functions
 *    or just the one. So, this definitions allows disable corresponding parts
 *    of t1ha library.
 *
 *      // To disable t1ha0(), t1ha0_32le(), t1ha0_32be() and all AES-NI.
 *      #define T1HA0_DISABLED
 *
 *      // To disable t1ha1_le() and t1ha1_be().
 *      #define T1HA1_DISABLED
 *
 *      // To disable t1ha2_atonce(), t1ha2_atonce128() and so on.
 *      #define T1HA2_DISABLED
 *
 *****************************************************************************/

#define T1HA_VERSION_MAJOR 2
#define T1HA_VERSION_MINOR 1
#define T1HA_VERSION_RELEASE 1

#ifndef __has_attribute
#define __has_attribute(x) (0)
#endif

#ifndef __has_include
#define __has_include(x) (0)
#endif

#ifndef __GNUC_PREREQ
#if defined(__GNUC__) && defined(__GNUC_MINOR__)
#define __GNUC_PREREQ(maj, min)                                                \
  ((__GNUC__ << 16) + __GNUC_MINOR__ >= ((maj) << 16) + (min))
#else
#define __GNUC_PREREQ(maj, min) 0
#endif
#endif /* __GNUC_PREREQ */

#ifndef __CLANG_PREREQ
#ifdef __clang__
#define __CLANG_PREREQ(maj, min)                                               \
  ((__clang_major__ << 16) + __clang_minor__ >= ((maj) << 16) + (min))
#else
#define __CLANG_PREREQ(maj, min) (0)
#endif
#endif /* __CLANG_PREREQ */

#ifndef __LCC_PREREQ
#ifdef __LCC__
#define __LCC_PREREQ(maj, min)                                                 \
  ((__LCC__ << 16) + __LCC_MINOR__ >= ((maj) << 16) + (min))
#else
#define __LCC_PREREQ(maj, min) (0)
#endif
#endif /* __LCC_PREREQ */

/*****************************************************************************/

#ifdef _MSC_VER
/* Avoid '16' bytes padding added after data member 't1ha_context::total'
 * and other warnings from std-headers if warning-level > 3. */
#pragma warning(push, 3)
#endif

#if defined(__cplusplus) && __cplusplus >= 201103L
#include <climits>
#include <cstddef>
#include <cstdint>
#else
#include <limits.h>
#include <stddef.h>
#include <stdint.h>
#endif

/*****************************************************************************/

#if defined(i386) || defined(__386) || defined(__i386) || defined(__i386__) || \
    defined(i486) || defined(__i486) || defined(__i486__) ||                   \
    defined(i586) | defined(__i586) || defined(__i586__) || defined(i686) ||   \
    defined(__i686) || defined(__i686__) || defined(_M_IX86) ||                \
    defined(_X86_) || defined(__THW_INTEL__) || defined(__I86__) ||            \
    defined(__INTEL__) || defined(__x86_64) || defined(__x86_64__) ||          \
    defined(__amd64__) || defined(__amd64) || defined(_M_X64) ||               \
    defined(_M_AMD64) || defined(__IA32__) || defined(__INTEL__)
#ifndef __ia32__
/* LY: define neutral __ia32__ for x86 and x86-64 archs */
#define __ia32__ 1
#endif /* __ia32__ */
#if !defined(__amd64__) && (defined(__x86_64) || defined(__x86_64__) ||        \
                            defined(__amd64) || defined(_M_X64))
/* LY: define trusty __amd64__ for all AMD64/x86-64 arch */
#define __amd64__ 1
#endif /* __amd64__ */
#endif /* all x86 */

#if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) ||           \
    !defined(__ORDER_BIG_ENDIAN__)

/* *INDENT-OFF* */
/* clang-format off */

#if defined(__GLIBC__) || defined(__GNU_LIBRARY__) || defined(__ANDROID__) ||  \
    defined(HAVE_ENDIAN_H) || __has_include(<endian.h>)
#include <endian.h>
#elif defined(__APPLE__) || defined(__MACH__) || defined(__OpenBSD__) ||       \
    defined(HAVE_MACHINE_ENDIAN_H) || __has_include(<machine/endian.h>)
#include <machine/endian.h>
#elif defined(HAVE_SYS_ISA_DEFS_H) || __has_include(<sys/isa_defs.h>)
#include <sys/isa_defs.h>
#elif (defined(HAVE_SYS_TYPES_H) && defined(HAVE_SYS_ENDIAN_H)) ||             \
    (__has_include(<sys/types.h>) && __has_include(<sys/endian.h>))
#include <sys/endian.h>
#include <sys/types.h>
#elif defined(__bsdi__) || defined(__DragonFly__) || defined(__FreeBSD__) ||   \
    defined(__NETBSD__) || defined(__NetBSD__) ||                              \
    defined(HAVE_SYS_PARAM_H) || __has_include(<sys/param.h>)
#include <sys/param.h>
#endif /* OS */

/* *INDENT-ON* */
/* clang-format on */

#if defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && defined(__BIG_ENDIAN)
#define __ORDER_LITTLE_ENDIAN__ __LITTLE_ENDIAN
#define __ORDER_BIG_ENDIAN__ __BIG_ENDIAN
#define __BYTE_ORDER__ __BYTE_ORDER
#elif defined(_BYTE_ORDER) && defined(_LITTLE_ENDIAN) && defined(_BIG_ENDIAN)
#define __ORDER_LITTLE_ENDIAN__ _LITTLE_ENDIAN
#define __ORDER_BIG_ENDIAN__ _BIG_ENDIAN
#define __BYTE_ORDER__ _BYTE_ORDER
#else
#define __ORDER_LITTLE_ENDIAN__ 1234
#define __ORDER_BIG_ENDIAN__ 4321

#if defined(__LITTLE_ENDIAN__) ||                                              \
    (defined(_LITTLE_ENDIAN) && !defined(_BIG_ENDIAN)) ||                      \
    defined(__ARMEL__) || defined(__THUMBEL__) || defined(__AARCH64EL__) ||    \
    defined(__MIPSEL__) || defined(_MIPSEL) || defined(__MIPSEL) ||            \
    defined(_M_ARM) || defined(_M_ARM64) || defined(__e2k__) ||                \
    defined(__elbrus_4c__) || defined(__elbrus_8c__) || defined(__bfin__) ||   \
    defined(__BFIN__) || defined(__ia64__) || defined(_IA64) ||                \
    defined(__IA64__) || defined(__ia64) || defined(_M_IA64) ||                \
    defined(__itanium__) || defined(__ia32__) || defined(__CYGWIN__) ||        \
    defined(_WIN64) || defined(_WIN32) || defined(__TOS_WIN__) ||              \
    defined(__WINDOWS__)
#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__

#elif defined(__BIG_ENDIAN__) ||                                               \
    (defined(_BIG_ENDIAN) && !defined(_LITTLE_ENDIAN)) ||                      \
    defined(__ARMEB__) || defined(__THUMBEB__) || defined(__AARCH64EB__) ||    \
    defined(__MIPSEB__) || defined(_MIPSEB) || defined(__MIPSEB) ||            \
    defined(__m68k__) || defined(M68000) || defined(__hppa__) ||               \
    defined(__hppa) || defined(__HPPA__) || defined(__sparc__) ||              \
    defined(__sparc) || defined(__370__) || defined(__THW_370__) ||            \
    defined(__s390__) || defined(__s390x__) || defined(__SYSC_ZARCH__)
#define __BYTE_ORDER__ __ORDER_BIG_ENDIAN__

#else
#error __BYTE_ORDER__ should be defined.
#endif /* Arch */

#endif
#endif /* __BYTE_ORDER__ || __ORDER_LITTLE_ENDIAN__ || __ORDER_BIG_ENDIAN__ */

/*****************************************************************************/

#ifndef __dll_export
#if defined(_WIN32) || defined(_WIN64) || defined(__CYGWIN__)
#if defined(__GNUC__) || __has_attribute(dllexport)
#define __dll_export __attribute__((dllexport))
#else
#define __dll_export __declspec(dllexport)
#endif
#elif defined(__GNUC__) || __has_attribute(__visibility__)
#define __dll_export __attribute__((__visibility__("default")))
#else
#define __dll_export
#endif
#endif /* __dll_export */

#ifndef __dll_import
#if defined(_WIN32) || defined(_WIN64) || defined(__CYGWIN__)
#if defined(__GNUC__) || __has_attribute(dllimport)
#define __dll_import __attribute__((dllimport))
#else
#define __dll_import __declspec(dllimport)
#endif
#elif defined(__GNUC__) || __has_attribute(__visibility__)
#define __dll_import __attribute__((__visibility__("default")))
#else
#define __dll_import
#endif
#endif /* __dll_import */

#ifndef __force_inline
#ifdef _MSC_VER
#define __force_inline __forceinline
#elif __GNUC_PREREQ(3, 2) || __has_attribute(__always_inline__)
#define __force_inline __inline __attribute__((__always_inline__))
#else
#define __force_inline __inline
#endif
#endif /* __force_inline */

#ifndef T1HA_API
#if defined(t1ha_EXPORTS)
#define T1HA_API __dll_export
#elif defined(t1ha_IMPORTS)
#define T1HA_API __dll_import
#else
#define T1HA_API
#endif
#endif /* T1HA_API */

#if defined(_MSC_VER) && defined(__ia32__)
#define T1HA_ALIGN_PREFIX __declspec(align(32)) /* required only for SIMD */
#else
#define T1HA_ALIGN_PREFIX
#endif /* _MSC_VER */

#if defined(__GNUC__) && defined(__ia32__)
#define T1HA_ALIGN_SUFFIX                                                      \
  __attribute__((__aligned__(32))) /* required only for SIMD */
#else
#define T1HA_ALIGN_SUFFIX
#endif /* GCC x86 */

#ifndef T1HA_USE_INDIRECT_FUNCTIONS
/* GNU ELF indirect functions usage control. For more info please see
 * https://en.wikipedia.org/wiki/Executable_and_Linkable_Format
 * and https://sourceware.org/glibc/wiki/GNU_IFUNC */
#if defined(__ELF__) && defined(__amd64__) &&                                  \
    (__has_attribute(__ifunc__) ||                                             \
     (!defined(__clang__) && defined(__GNUC__) && __GNUC__ >= 4 &&             \
      !defined(__SANITIZE_ADDRESS__) && !defined(__SSP_ALL__)))
/* Enable gnu_indirect_function by default if :
 *  - ELF AND x86_64
 *  - attribute(__ifunc__) is available OR
 *    GCC >= 4 WITHOUT -fsanitize=address NOR -fstack-protector-all */
#define T1HA_USE_INDIRECT_FUNCTIONS 1
#else
#define T1HA_USE_INDIRECT_FUNCTIONS 0
#endif
#endif /* T1HA_USE_INDIRECT_FUNCTIONS */

#if __GNUC_PREREQ(4, 0)
#pragma GCC visibility push(hidden)
#endif /* __GNUC_PREREQ(4,0) */

#ifdef __cplusplus
extern "C" {
#endif

typedef union T1HA_ALIGN_PREFIX t1ha_state256 {
  uint8_t bytes[32];
  uint32_t u32[8];
  uint64_t u64[4];
  struct {
    uint64_t a, b, c, d;
  } n;
} t1ha_state256_t T1HA_ALIGN_SUFFIX;

typedef struct t1ha_context {
  t1ha_state256_t state;
  t1ha_state256_t buffer;
  size_t partial;
  uint64_t total;
} t1ha_context_t;

#ifdef _MSC_VER
#pragma warning(pop)
#endif

/******************************************************************************
 *
 * Self-testing API.
 *
 * Unfortunately, some compilers (exactly only Microsoft Visual C/C++) has
 * a bugs which leads t1ha-functions to produce wrong results. This API allows
 * check the correctness of the actual code in runtime.
 *
 * All check-functions returns 0 on success, or -1 in case the corresponding
 * hash-function failed verification. PLEASE, always perform such checking at
 * initialization of your code, if you using MSVC or other troubleful compilers.
 */

T1HA_API int t1ha_selfcheck__all_enabled(void);

#ifndef T1HA2_DISABLED
T1HA_API int t1ha_selfcheck__t1ha2_atonce(void);
T1HA_API int t1ha_selfcheck__t1ha2_atonce128(void);
T1HA_API int t1ha_selfcheck__t1ha2_stream(void);
T1HA_API int t1ha_selfcheck__t1ha2(void);
#endif /* T1HA2_DISABLED */

#ifndef T1HA1_DISABLED
T1HA_API int t1ha_selfcheck__t1ha1_le(void);
T1HA_API int t1ha_selfcheck__t1ha1_be(void);
T1HA_API int t1ha_selfcheck__t1ha1(void);
#endif /* T1HA1_DISABLED */

#ifndef T1HA0_DISABLED
T1HA_API int t1ha_selfcheck__t1ha0_32le(void);
T1HA_API int t1ha_selfcheck__t1ha0_32be(void);
T1HA_API int t1ha_selfcheck__t1ha0(void);

/* Define T1HA0_AESNI_AVAILABLE to 0 for disable AES-NI support. */
#ifndef T1HA0_AESNI_AVAILABLE
#if defined(__e2k__) ||                                                        \
    (defined(__ia32__) && (!defined(_M_IX86) || _MSC_VER > 1800))
#define T1HA0_AESNI_AVAILABLE 1
#else
#define T1HA0_AESNI_AVAILABLE 0
#endif
#endif /* ifndef T1HA0_AESNI_AVAILABLE */

#if T1HA0_AESNI_AVAILABLE
T1HA_API int t1ha_selfcheck__t1ha0_ia32aes_noavx(void);
T1HA_API int t1ha_selfcheck__t1ha0_ia32aes_avx(void);
#ifndef __e2k__
T1HA_API int t1ha_selfcheck__t1ha0_ia32aes_avx2(void);
#endif
#endif /* if T1HA0_AESNI_AVAILABLE */
#endif /* T1HA0_DISABLED */

/******************************************************************************
 *
 *  t1ha2 = 64 and 128-bit, SLIGHTLY MORE ATTENTION FOR QUALITY AND STRENGTH.
 *
 *    - The recommended version of "Fast Positive Hash" with good quality
 *      for checksum, hash tables and fingerprinting.
 *    - Portable and extremely efficiency on modern 64-bit CPUs.
 *      Designed for 64-bit little-endian platforms,
 *      in other cases will runs slowly.
 *    - Great quality of hashing and still faster than other non-t1ha hashes.
 *      Provides streaming mode and 128-bit result.
 *
 * Note: Due performance reason 64- and 128-bit results are completely
 *       different each other, i.e. 64-bit result is NOT any part of 128-bit.
 */
#ifndef T1HA2_DISABLED

/* The at-once variant with 64-bit result */
T1HA_API uint64_t t1ha2_atonce(const void *data, size_t length, uint64_t seed);

/* The at-once variant with 128-bit result.
 * Argument `extra_result` is NOT optional and MUST be valid.
 * The high 64-bit part of 128-bit hash will be always unconditionally
 * stored to the address given by `extra_result` argument. */
T1HA_API uint64_t t1ha2_atonce128(uint64_t *__restrict extra_result,
                                  const void *__restrict data, size_t length,
                                  uint64_t seed);

/* The init/update/final trinity for streaming.
 * Return 64 or 128-bit result depentently from `extra_result` argument. */
T1HA_API void t1ha2_init(t1ha_context_t *ctx, uint64_t seed_x, uint64_t seed_y);
T1HA_API void t1ha2_update(t1ha_context_t *__restrict ctx,
                           const void *__restrict data, size_t length);

/* Argument `extra_result` is optional and MAY be NULL.
 *  - If `extra_result` is NOT NULL then the 128-bit hash will be calculated,
 *    and high 64-bit part of it will be stored to the address given
 *    by `extra_result` argument.
 *  - Otherwise the 64-bit hash will be calculated
 *    and returned from function directly.
 *
 * Note: Due performance reason 64- and 128-bit results are completely
 *       different each other, i.e. 64-bit result is NOT any part of 128-bit. */
T1HA_API uint64_t t1ha2_final(t1ha_context_t *__restrict ctx,
                              uint64_t *__restrict extra_result /* optional */);

#endif /* T1HA2_DISABLED */

/******************************************************************************
 *
 *  t1ha1 = 64-bit, BASELINE FAST PORTABLE HASH:
 *
 *    - Runs faster on 64-bit platforms in other cases may runs slowly.
 *    - Portable and stable, returns same 64-bit result
 *      on all architectures and CPUs.
 *    - Unfortunately it fails the "strict avalanche criteria",
 *      see test results at https://github.com/demerphq/smhasher.
 *
 *      This flaw is insignificant for the t1ha1() purposes and imperceptible
 *      from a practical point of view.
 *      However, nowadays this issue has resolved in the next t1ha2(),
 *      that was initially planned to providing a bit more quality.
 */
#ifndef T1HA1_DISABLED

/* The little-endian variant. */
T1HA_API uint64_t t1ha1_le(const void *data, size_t length, uint64_t seed);

/* The big-endian variant. */
T1HA_API uint64_t t1ha1_be(const void *data, size_t length, uint64_t seed);

#endif /* T1HA1_DISABLED */

/******************************************************************************
 *
 *  t1ha0 = 64-bit, JUST ONLY FASTER:
 *
 *    - Provides fast-as-possible hashing for current CPU, including
 *      32-bit systems and engaging the available hardware acceleration.
 *    - It is a facade that selects most quick-and-dirty hash
 *      for the current processor. For instance, on IA32 (x86) actual function
 *      will be selected in runtime, depending on current CPU capabilities
 *
 * BE CAREFUL!!!  THIS IS MEANS:
 *
 *   1. The quality of hash is a subject for tradeoffs with performance.
 *      So, the quality and strength of t1ha0() may be lower than t1ha1(),
 *      especially on 32-bit targets, but then much faster.
 *      However, guaranteed that it passes all SMHasher tests.
 *
 *   2. No warranty that the hash result will be same for particular
 *      key on another machine or another version of libt1ha.
 *
 *      Briefly, such hash-results and their derivatives, should be
 *      used only in runtime, but should not be persist or transferred
 *      over a network.
 *
 *
 *  When T1HA0_RUNTIME_SELECT is nonzero the t1ha0_resolve() function could
 *  be used to get actual t1ha0() implementation address at runtime. This is
 *  useful for two cases:
 *    - calling by local pointer-to-function usually is little
 *      bit faster (less overhead) than via a PLT thru the DSO boundary.
 *    - GNU Indirect functions (see below) don't supported by environment
 *      and calling by t1ha0_funcptr is not available and/or expensive.
 */

#ifndef T1HA0_DISABLED

/* The little-endian variant for 32-bit CPU. */
uint64_t t1ha0_32le(const void *data, size_t length, uint64_t seed);
/* The big-endian variant for 32-bit CPU. */
uint64_t t1ha0_32be(const void *data, size_t length, uint64_t seed);

/* Define T1HA0_AESNI_AVAILABLE to 0 for disable AES-NI support. */
#ifndef T1HA0_AESNI_AVAILABLE
#if defined(__e2k__) ||                                                        \
    (defined(__ia32__) && (!defined(_M_IX86) || _MSC_VER > 1800))
#define T1HA0_AESNI_AVAILABLE 1
#else
#define T1HA0_AESNI_AVAILABLE 0
#endif
#endif /* T1HA0_AESNI_AVAILABLE */

/* Define T1HA0_RUNTIME_SELECT to 0 for disable dispatching t1ha0 at runtime. */
#ifndef T1HA0_RUNTIME_SELECT
#if T1HA0_AESNI_AVAILABLE && !defined(__e2k__)
#define T1HA0_RUNTIME_SELECT 1
#else
#define T1HA0_RUNTIME_SELECT 0
#endif
#endif /* T1HA0_RUNTIME_SELECT */

#if !T1HA0_RUNTIME_SELECT && !defined(T1HA0_USE_DEFINE)
#if defined(__LCC__)
#define T1HA0_USE_DEFINE 1
#else
#define T1HA0_USE_DEFINE 0
#endif
#endif /* T1HA0_USE_DEFINE */

#if T1HA0_AESNI_AVAILABLE
uint64_t t1ha0_ia32aes_noavx(const void *data, size_t length, uint64_t seed);
uint64_t t1ha0_ia32aes_avx(const void *data, size_t length, uint64_t seed);
#ifndef __e2k__
uint64_t t1ha0_ia32aes_avx2(const void *data, size_t length, uint64_t seed);
#endif
#endif /* T1HA0_AESNI_AVAILABLE */

#if T1HA0_RUNTIME_SELECT
typedef uint64_t (*t1ha0_function_t)(const void *, size_t, uint64_t);
T1HA_API t1ha0_function_t t1ha0_resolve(void);
#if T1HA_USE_INDIRECT_FUNCTIONS
T1HA_API uint64_t t1ha0(const void *data, size_t length, uint64_t seed);
#else
/* Otherwise function pointer will be used.
 * Unfortunately this may cause some overhead calling. */
T1HA_API extern uint64_t (*t1ha0_funcptr)(const void *data, size_t length,
                                          uint64_t seed);
static __force_inline uint64_t t1ha0(const void *data, size_t length,
                                     uint64_t seed) {
  return t1ha0_funcptr(data, length, seed);
}
#endif /* T1HA_USE_INDIRECT_FUNCTIONS */

#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__

#if T1HA0_USE_DEFINE

#if (UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul) &&                \
    (!defined(T1HA1_DISABLED) || !defined(T1HA2_DISABLED))
#if defined(T1HA1_DISABLED)
#define t1ha0 t1ha2_atonce
#else
#define t1ha0 t1ha1_be
#endif /* T1HA1_DISABLED */
#else  /* 32/64 */
#define t1ha0 t1ha0_32be
#endif /* 32/64 */

#else /* T1HA0_USE_DEFINE */

static __force_inline uint64_t t1ha0(const void *data, size_t length,
                                     uint64_t seed) {
#if (UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul) &&                \
    (!defined(T1HA1_DISABLED) || !defined(T1HA2_DISABLED))
#if defined(T1HA1_DISABLED)
  return t1ha2_atonce(data, length, seed);
#else
  return t1ha1_be(data, length, seed);
#endif /* T1HA1_DISABLED */
#else  /* 32/64 */
  return t1ha0_32be(data, length, seed);
#endif /* 32/64 */
}

#endif /* !T1HA0_USE_DEFINE */

#else /* !T1HA0_RUNTIME_SELECT && __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__ */

#if T1HA0_USE_DEFINE

#if (UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul) &&                \
    (!defined(T1HA1_DISABLED) || !defined(T1HA2_DISABLED))
#if defined(T1HA1_DISABLED)
#define t1ha0 t1ha2_atonce
#else
#define t1ha0 t1ha1_le
#endif /* T1HA1_DISABLED */
#else  /* 32/64 */
#define t1ha0 t1ha0_32le
#endif /* 32/64 */

#else

static __force_inline uint64_t t1ha0(const void *data, size_t length,
                                     uint64_t seed) {
#if (UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul) &&                \
    (!defined(T1HA1_DISABLED) || !defined(T1HA2_DISABLED))
#if defined(T1HA1_DISABLED)
  return t1ha2_atonce(data, length, seed);
#else
  return t1ha1_le(data, length, seed);
#endif /* T1HA1_DISABLED */
#else  /* 32/64 */
  return t1ha0_32le(data, length, seed);
#endif /* 32/64 */
}

#endif /* !T1HA0_USE_DEFINE */

#endif /* !T1HA0_RUNTIME_SELECT */

#endif /* T1HA0_DISABLED */

#ifdef __cplusplus
}
#endif

#if __GNUC_PREREQ(4, 0)
#pragma GCC visibility pop
#endif /* __GNUC_PREREQ(4,0) */


================================================
FILE: benchmarks/ustd.h
================================================
/* ustd.h common macros and includes */
#ifndef LIBRHASH_USTD_H
#define LIBRHASH_USTD_H

#if _MSC_VER >= 1300

# define int64_t __int64
# define int32_t __int32
# define int16_t __int16
# define int8_t  __int8
# define uint64_t unsigned __int64
# define uint32_t unsigned __int32
# define uint16_t unsigned __int16
# define uint8_t  unsigned __int8

/* disable warnings: The POSIX name for this item is deprecated. Use the ISO C++ conformant name. */
#pragma warning(disable : 4996)

#else /* _MSC_VER >= 1300 */

# include <stdint.h>
# include <unistd.h>

#endif /* _MSC_VER >= 1300 */

#if _MSC_VER <= 1300
# include <stdlib.h> /* size_t for vc6.0 */
#endif /* _MSC_VER <= 1300 */

#endif /* LIBRHASH_USTD_H */


================================================
FILE: benchmarks/xoroshiro128plus.c
================================================
/*  Written in 2016 by David Blackman and Sebastiano Vigna (vigna@acm.org)

To the extent possible under law, the author has dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.

See <http://creativecommons.org/publicdomain/zero/1.0/>. */

#include <stdint.h>

/* This is the successor to xorshift128+. It is the fastest full-period
   generator passing BigCrush without systematic failures, but due to the
   relatively short period it is acceptable only for applications with a
   mild amount of parallelism; otherwise, use a xorshift1024* generator.

   Beside passing BigCrush, this generator passes the PractRand test suite
   up to (and included) 16TB, with the exception of binary rank tests,
   which fail due to the lowest bit being an LFSR; all other bits pass all
   tests. We suggest to use a sign test to extract a random Boolean value.
   
   Note that the generator uses a simulated rotate operation, which most C
   compilers will turn into a single instruction. In Java, you can use
   Long.rotateLeft(). In languages that do not make low-level rotation
   instructions accessible xorshift128+ could be faster.

   The state must be seeded so that it is not everywhere zero. If you have
   a 64-bit seed, we suggest to seed a splitmix64 generator and use its
   output to fill s. */

uint64_t s[2];

static inline uint64_t rotl(const uint64_t x, int k) {
	return (x << k) | (x >> (64 - k));
}

uint64_t next(void) {
	const uint64_t s0 = s[0];
	uint64_t s1 = s[1];
	const uint64_t result = s0 + s1;

	s1 ^= s0;
	s[0] = rotl(s0, 55) ^ s1 ^ (s1 << 14); // a, b
	s[1] = rotl(s1, 36); // c

	return result;
}


/* This is the jump function for the generator. It is equivalent
   to 2^64 calls to next(); it can be used to generate 2^64
   non-overlapping subsequences for parallel computations. */

void jump(void) {
	static const uint64_t JUMP[] = { 0xbeac0467eba5facb, 0xd86b048b86aa9922 };

	uint64_t s0 = 0;
	uint64_t s1 = 0;
	for(int i = 0; i < sizeof JUMP / sizeof *JUMP; i++)
		for(int b = 0; b < 64; b++) {
			if (JUMP[i] & 1ULL << b) {
				s0 ^= s[0];
				s1 ^= s[1];
			}
			next();
		}

	s[0] = s0;
	s[1] = s1;
}


================================================
FILE: benchmarks/xoroshiro128starstar.c
================================================
/*  Written in 2018 by David Blackman and Sebastiano Vigna (vigna@acm.org)

To the extent possible under law, the author has dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.

See <http://creativecommons.org/publicdomain/zero/1.0/>. */

#include <stdint.h>

/* This is xoroshiro128** 1.0, our all-purpose, rock-solid, small-state
   generator. It is extremely (sub-ns) fast and it passes all tests we are
   aware of, but its state space is large enough only for mild parallelism.

   For generating just floating-point numbers, xoroshiro128+ is even
   faster (but it has a very mild bias, see notes in the comments).

   The state must be seeded so that it is not everywhere zero. If you have
   a 64-bit seed, we suggest to seed a splitmix64 generator and use its
   output to fill s. */


static inline uint64_t rotl(const uint64_t x, int k) {
	return (x << k) | (x >> (64 - k));
}


static uint64_t s[2];

uint64_t next(void) {
	const uint64_t s0 = s[0];
	uint64_t s1 = s[1];
	const uint64_t result = rotl(s0 * 5, 7) * 9;

	s1 ^= s0;
	s[0] = rotl(s0, 24) ^ s1 ^ (s1 << 16); // a, b
	s[1] = rotl(s1, 37); // c

	return result;
}


/* This is the jump function for the generator. It is equivalent
   to 2^64 calls to next(); it can be used to generate 2^64
   non-overlapping subsequences for parallel computations. */

void jump(void) {
	static const uint64_t JUMP[] = { 0xdf900294d8f554a5, 0x170865df4b3201fc };

	uint64_t s0 = 0;
	uint64_t s1 = 0;
	for(int i = 0; i < sizeof JUMP / sizeof *JUMP; i++)
		for(int b = 0; b < 64; b++) {
			if (JUMP[i] & UINT64_C(1) << b) {
				s0 ^= s[0];
				s1 ^= s[1];
			}
			next();
		}

	s[0] = s0;
	s[1] = s1;
}


/* This is the long-jump function for the generator. It is equivalent to
   2^96 calls to next(); it can be used to generate 2^32 starting points,
   from each of which jump() will generate 2^32 non-overlapping
   subsequences for parallel distributed computations. */

void long_jump(void) {
	static const uint64_t LONG_JUMP[] = { 0xd2a98b26625eee7b, 0xdddf9b1090aa7ac1 };

	uint64_t s0 = 0;
	uint64_t s1 = 0;
	for(int i = 0; i < sizeof LONG_JUMP / sizeof *LONG_JUMP; i++)
		for(int b = 0; b < 64; b++) {
			if (LONG_JUMP[i] & UINT64_C(1) << b) {
				s0 ^= s[0];
				s1 ^= s[1];
			}
			next();
		}

	s[0] = s0;
	s[1] = s1;
}


================================================
FILE: benchmarks/xoseed.c
================================================
#include <stdlib.h>
#include <stdio.h>
#include "splitmix64.c"

void main (int argc, char *argv[]) {
  int n = atoi(argv[1]);
  for (int i = 0; i < n; i++)
    printf (" s[%d] = 0x%lx;", i, next());
  printf ("\n");
}


================================================
FILE: benchmarks/xoshiro256plus.c
================================================
/*  Written in 2018 by David Blackman and Sebastiano Vigna (vigna@acm.org)

To the extent possible under law, the author has dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.

See <http://creativecommons.org/publicdomain/zero/1.0/>. */

#include <stdint.h>

/* This is xoshiro256+ 1.0, our best and fastest generator for floating-point
   numbers. We suggest to use its upper bits for floating-point
   generation, as it is slightly faster than xoshiro256**. It passes all
   tests we are aware of except for the lowest three bits, which might
   fail linearity tests (and just those), so if low linear complexity is
   not considered an issue (as it is usually the case) it can be used to
   generate 64-bit outputs, too.

   We suggest to use a sign test to extract a random Boolean value, and
   right shifts to extract subsets of bits.

   The state must be seeded so that it is not everywhere zero. If you have
   a 64-bit seed, we suggest to seed a splitmix64 generator and use its
   output to fill s. */


static inline uint64_t rotl(const uint64_t x, int k) {
	return (x << k) | (x >> (64 - k));
}


static uint64_t s[4];

uint64_t next(void) {
	const uint64_t result_plus = s[0] + s[3];

	const uint64_t t = s[1] << 17;

	s[2] ^= s[0];
	s[3] ^= s[1];
	s[1] ^= s[2];
	s[0] ^= s[3];

	s[2] ^= t;

	s[3] = rotl(s[3], 45);

	return result_plus;
}


/* This is the jump function for the generator. It is equivalent
   to 2^128 calls to next(); it can be used to generate 2^128
   non-overlapping subsequences for parallel computations. */

void jump(void) {
	static const uint64_t JUMP[] = { 0x180ec6d33cfd0aba, 0xd5a61266f0c9392c, 0xa9582618e03fc9aa, 0x39abdc4529b1661c };

	uint64_t s0 = 0;
	uint64_t s1 = 0;
	uint64_t s2 = 0;
	uint64_t s3 = 0;
	for(int i = 0; i < sizeof JUMP / sizeof *JUMP; i++)
		for(int b = 0; b < 64; b++) {
			if (JUMP[i] & UINT64_C(1) << b) {
				s0 ^= s[0];
				s1 ^= s[1];
				s2 ^= s[2];
				s3 ^= s[3];
			}
			next();	
		}
		
	s[0] = s0;
	s[1] = s1;
	s[2] = s2;
	s[3] = s3;
}


/* This is the long-jump function for the generator. It is equivalent to
   2^192 calls to next(); it can be used to generate 2^64 starting points,
   from each of which jump() will generate 2^64 non-overlapping
   subsequences for parallel distributed computations. */

void long_jump(void) {
	static const uint64_t LONG_JUMP[] = { 0x76e15d3efefdcbbf, 0xc5004e441c522fb3, 0x77710069854ee241, 0x39109bb02acbe635 };

	uint64_t s0 = 0;
	uint64_t s1 = 0;
	uint64_t s2 = 0;
	uint64_t s3 = 0;
	for(int i = 0; i < sizeof LONG_JUMP / sizeof *LONG_JUMP; i++)
		for(int b = 0; b < 64; b++) {
			if (LONG_JUMP[i] & UINT64_C(1) << b) {
				s0 ^= s[0];
				s1 ^= s[1];
				s2 ^= s[2];
				s3 ^= s[3];
			}
			next();	
		}
		
	s[0] = s0;
	s[1] = s1;
	s[2] = s2;
	s[3] = s3;
}


================================================
FILE: benchmarks/xoshiro256starstar.c
================================================
/*  Written in 2018 by David Blackman and Sebastiano Vigna (vigna@acm.org)

To the extent possible under law, the author has dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.

See <http://creativecommons.org/publicdomain/zero/1.0/>. */

#include <stdint.h>

/* This is xoshiro256** 1.0, our all-purpose, rock-solid generator. It has
   excellent (sub-ns) speed, a state (256 bits) that is large enough for
   any parallel application, and it passes all tests we are aware of.

   For generating just floating-point numbers, xoshiro256+ is even faster.

   The state must be seeded so that it is not everywhere zero. If you have
   a 64-bit seed, we suggest to seed a splitmix64 generator and use its
   output to fill s. */

static inline uint64_t rotl(const uint64_t x, int k) {
	return (x << k) | (x >> (64 - k));
}


static uint64_t s[4];

uint64_t next(void) {
	const uint64_t result_starstar = rotl(s[1] * 5, 7) * 9;

	const uint64_t t = s[1] << 17;

	s[2] ^= s[0];
	s[3] ^= s[1];
	s[1] ^= s[2];
	s[0] ^= s[3];

	s[2] ^= t;

	s[3] = rotl(s[3], 45);

	return result_starstar;
}


/* This is the jump function for the generator. It is equivalent
   to 2^128 calls to next(); it can be used to generate 2^128
   non-overlapping subsequences for parallel computations. */

void jump(void) {
	static const uint64_t JUMP[] = { 0x180ec6d33cfd0aba, 0xd5a61266f0c9392c, 0xa9582618e03fc9aa, 0x39abdc4529b1661c };

	uint64_t s0 = 0;
	uint64_t s1 = 0;
	uint64_t s2 = 0;
	uint64_t s3 = 0;
	for(int i = 0; i < sizeof JUMP / sizeof *JUMP; i++)
		for(int b = 0; b < 64; b++) {
			if (JUMP[i] & UINT64_C(1) << b) {
				s0 ^= s[0];
				s1 ^= s[1];
				s2 ^= s[2];
				s3 ^= s[3];
			}
			next();	
		}
		
	s[0] = s0;
	s[1] = s1;
	s[2] = s2;
	s[3] = s3;
}


/* This is the long-jump function for the generator. It is equivalent to
   2^192 calls to next(); it can be used to generate 2^64 starting points,
   from each of which jump() will generate 2^64 non-overlapping
   subsequences for parallel distributed computations. */

void long_jump(void) {
	static const uint64_t LONG_JUMP[] = { 0x76e15d3efefdcbbf, 0xc5004e441c522fb3, 0x77710069854ee241, 0x39109bb02acbe635 };

	uint64_t s0 = 0;
	uint64_t s1 = 0;
	uint64_t s2 = 0;
	uint64_t s3 = 0;
	for(int i = 0; i < sizeof LONG_JUMP / sizeof *LONG_JUMP; i++)
		for(int b = 0; b < 64; b++) {
			if (LONG_JUMP[i] & UINT64_C(1) << b) {
				s0 ^= s[0];
				s1 ^= s[1];
				s2 ^= s[2];
				s3 ^= s[3];
			}
			next();	
		}
		
	s[0] = s0;
	s[1] = s1;
	s[2] = s2;
	s[3] = s3;
}


================================================
FILE: benchmarks/xoshiro512plus.c
================================================
/*  Written in 2018 by David Blackman and Sebastiano Vigna (vigna@acm.org)

To the extent possible under law, the author has dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.

See <http://creativecommons.org/publicdomain/zero/1.0/>. */

#include <stdint.h>
#include <string.h>

/* This is xoshiro512+ 1.0, our generator for floating-point numbers with
   increased state size. We suggest to use its upper bits for
   floating-point generation, as it is slightly faster than xoshiro512**.
   It passes all tests we are aware of except for the lowest three bits,
   which might fail linearity tests (and just those), so if low linear
   complexity is not considered an issue (as it is usually the case) it
   can be used to generate 64-bit outputs, too.

   We suggest to use a sign test to extract a random Boolean value, and
   right shifts to extract subsets of bits.

   The state must be seeded so that it is not everywhere zero. If you have
   a 64-bit seed, we suggest to seed a splitmix64 generator and use its
   output to fill s. */

static inline uint64_t rotl(const uint64_t x, int k) {
	return (x << k) | (x >> (64 - k));
}


static uint64_t s[8];

uint64_t next(void) {
	const uint64_t result_plus = s[0] + s[2];

	const uint64_t t = s[1] << 11;

	s[2] ^= s[0];
	s[5] ^= s[1];
	s[1] ^= s[2];
	s[7] ^= s[3];
	s[3] ^= s[4];
	s[4] ^= s[5];
	s[0] ^= s[6];
	s[6] ^= s[7];

	s[6] ^= t;

	s[7] = rotl(s[7], 21);

	return result_plus;
}


/* This is the jump function for the generator. It is equivalent
   to 2^256 calls to next(); it can be used to generate 2^256
   non-overlapping subsequences for parallel computations. */

void jump(void) {
	static const uint64_t JUMP[] = { 0x33ed89b6e7a353f9, 0x760083d7955323be, 0x2837f2fbb5f22fae, 0x4b8c5674d309511c, 0xb11ac47a7ba28c25, 0xf1be7667092bcc1c, 0x53851efdb6df0aaf, 0x1ebbc8b23eaf25db };

	uint64_t t[sizeof s / sizeof *s];
	memset(t, 0, sizeof t);
	for(int i = 0; i < sizeof JUMP / sizeof *JUMP; i++)
		for(int b = 0; b < 64; b++) {
			if (JUMP[i] & UINT64_C(1) << b)
				for(int w = 0; w < sizeof s / sizeof *s; w++)
					t[w] ^= s[w];
			next();
		}
	
	memcpy(s, t, sizeof s);	
}


================================================
FILE: benchmarks/xoshiro512starstar.c
================================================
/*  Written in 2018 by David Blackman and Sebastiano Vigna (vigna@acm.org)

To the extent possible under law, the author has dedicated all copyright
and related and neighboring rights to this software to the public domain
worldwide. This software is distributed without any warranty.

See <http://creativecommons.org/publicdomain/zero/1.0/>. */

#include <stdint.h>
#include <string.h>

/* This is xoshiro512** 1.0, an all-purpose, rock-solid generator. It has
   excellent (about 1ns) speed, an increased state (512 bits) that is
   large enough for any parallel application, and it passes all tests we
   are aware of.

   For generating just floating-point numbers, xoshiro512+ is even faster.

   The state must be seeded so that it is not everywhere zero. If you have
   a 64-bit seed, we suggest to seed a splitmix64 generator and use its
   output to fill s. */

static inline uint64_t rotl(const uint64_t x, int k) {
	return (x << k) | (x >> (64 - k));
}


static uint64_t s[8];

uint64_t next(void) {
	const uint64_t result_starstar = rotl(s[1] * 5, 7) * 9;

	const uint64_t t = s[1] << 11;

	s[2] ^= s[0];
	s[5] ^= s[1];
	s[1] ^= s[2];
	s[7] ^= s[3];
	s[3] ^= s[4];
	s[4] ^= s[5];
	s[0] ^= s[6];
	s[6] ^= s[7];

	s[6] ^= t;

	s[7] = rotl(s[7], 21);

	return result_starstar;
}


/* This is the jump function for the generator. It is equivalent
   to 2^256 calls to next(); it can be used to generate 2^256
   non-overlapping subsequences for parallel computations. */

void jump(void) {
	static const uint64_t JUMP[] = { 0x33ed89b6e7a353f9, 0x760083d7955323be, 0x2837f2fbb5f22fae, 0x4b8c5674d309511c, 0xb11ac47a7ba28c25, 0xf1be7667092bcc1c, 0x53851efdb6df0aaf, 0x1ebbc8b23eaf25db };

	uint64_t t[sizeof s / sizeof *s];
	memset(t, 0, sizeof t);
	for(int i = 0; i < sizeof JUMP / sizeof *JUMP; i++)
		for(int b = 0; b < 64; b++) {
			if (JUMP[i] & UINT64_C(1) << b)
				for(int w = 0; w < sizeof s / sizeof *s; w++)
					t[w] ^= s[w];
			next();
		}
	
	memcpy(s, t, sizeof s);	
}


================================================
FILE: benchmarks/xxh3.h
================================================
/*
 * xxHash - Extremely Fast Hash algorithm
 * Development source file for `xxh3`
 * Copyright (C) 2019-2021 Yann Collet
 *
 * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php)
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 *    * Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    * Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following disclaimer
 *      in the documentation and/or other materials provided with the
 *      distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * You can contact the author at:
 *   - xxHash homepage: https://www.xxhash.com
 *   - xxHash source repository: https://github.com/Cyan4973/xxHash
 */

/*
 * Note: This file used to host the source code of XXH3_* variants.
 * during the development period.
 * The source code is now properly integrated within xxhash.h.
 *
 * xxh3.h is no longer useful,
 * but it is still provided for compatibility with source code
 * which used to include it directly.
 *
 * Programs are now highly discouraged to include xxh3.h.
 * Include `xxhash.h` instead, which is the officially supported interface.
 *
 * In the future, xxh3.h will start to generate warnings, then errors,
 * then it will be removed from source package and from include directory.
 */

/* Simulate the same impact as including the old xxh3.h source file */

#define XXH_INLINE_ALL
#include "xxhash.h"


================================================
FILE: benchmarks/xxhash.c
================================================
/*
 * xxHash - Extremely Fast Hash algorithm
 * Copyright (C) 2012-2023 Yann Collet
 *
 * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php)
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 *    * Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    * Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following disclaimer
 *      in the documentation and/or other materials provided with the
 *      distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * You can contact the author at:
 *   - xxHash homepage: https://www.xxhash.com
 *   - xxHash source repository: https://github.com/Cyan4973/xxHash
 */

/*
 * xxhash.c instantiates functions defined in xxhash.h
 */

#define XXH_STATIC_LINKING_ONLY /* access advanced declarations */
#define XXH_IMPLEMENTATION      /* access definitions */

#include "xxhash.h"


================================================
FILE: benchmarks/xxhash.h
================================================
/*
 * xxHash - Extremely Fast Hash algorithm
 * Header File
 * Copyright (C) 2012-2023 Yann Collet
 *
 * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php)
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 *    * Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    * Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following disclaimer
 *      in the documentation and/or other materials provided with the
 *      distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * You can contact the author at:
 *   - xxHash homepage: https://www.xxhash.com
 *   - xxHash source repository: https://github.com/Cyan4973/xxHash
 */

/*!
 * @mainpage xxHash
 *
 * xxHash is an extremely fast non-cryptographic hash algorithm, working at RAM speed
 * limits.
 *
 * It is proposed in four flavors, in three families:
 * 1. @ref XXH32_family
 *   - Classic 32-bit hash function. Simple, compact, and runs on almost all
 *     32-bit and 64-bit systems.
 * 2. @ref XXH64_family
 *   - Classic 64-bit adaptation of XXH32. Just as simple, and runs well on most
 *     64-bit systems (but _not_ 32-bit systems).
 * 3. @ref XXH3_family
 *   - Modern 64-bit and 128-bit hash function family which features improved
 *     strength and performance across the board, especially on smaller data.
 *     It benefits greatly from SIMD and 64-bit without requiring it.
 *
 * Benchmarks
 * ---
 * The reference system uses an Intel i7-9700K CPU, and runs Ubuntu x64 20.04.
 * The open source benchmark program is compiled with clang v10.0 using -O3 flag.
 *
 * | Hash Name            | ISA ext | Width | Large Data Speed | Small Data Velocity |
 * | -------------------- | ------- | ----: | ---------------: | ------------------: |
 * | XXH3_64bits()        | @b AVX2 |    64 |        59.4 GB/s |               133.1 |
 * | MeowHash             | AES-NI  |   128 |        58.2 GB/s |                52.5 |
 * | XXH3_128bits()       | @b AVX2 |   128 |        57.9 GB/s |               118.1 |
 * | CLHash               | PCLMUL  |    64 |        37.1 GB/s |                58.1 |
 * | XXH3_64bits()        | @b SSE2 |    64 |        31.5 GB/s |               133.1 |
 * | XXH3_128bits()       | @b SSE2 |   128 |        29.6 GB/s |               118.1 |
 * | RAM sequential read  |         |   N/A |        28.0 GB/s |                 N/A |
 * | ahash                | AES-NI  |    64 |        22.5 GB/s |               107.2 |
 * | City64               |         |    64 |        22.0 GB/s |                76.6 |
 * | T1ha2                |         |    64 |        22.0 GB/s |                99.0 |
 * | City128              |         |   128 |        21.7 GB/s |                57.7 |
 * | FarmHash             | AES-NI  |    64 |        21.3 GB/s |                71.9 |
 * | XXH64()              |         |    64 |        19.4 GB/s |                71.0 |
 * | SpookyHash           |         |    64 |        19.3 GB/s |                53.2 |
 * | Mum                  |         |    64 |        18.0 GB/s |                67.0 |
 * | CRC32C               | SSE4.2  |    32 |        13.0 GB/s |                57.9 |
 * | XXH32()              |         |    32 |         9.7 GB/s |                71.9 |
 * | City32               |         |    32 |         9.1 GB/s |                66.0 |
 * | Blake3*              | @b AVX2 |   256 |         4.4 GB/s |                 8.1 |
 * | Murmur3              |         |    32 |         3.9 GB/s |                56.1 |
 * | SipHash*             |         |    64 |         3.0 GB/s |                43.2 |
 * | Blake3*              | @b SSE2 |   256 |         2.4 GB/s |                 8.1 |
 * | HighwayHash          |         |    64 |         1.4 GB/s |                 6.0 |
 * | FNV64                |         |    64 |         1.2 GB/s |                62.7 |
 * | Blake2*              |         |   256 |         1.1 GB/s |                 5.1 |
 * | SHA1*                |         |   160 |         0.8 GB/s |                 5.6 |
 * | MD5*                 |         |   128 |         0.6 GB/s |                 7.8 |
 * @note
 *   - Hashes which require a specific ISA extension are noted. SSE2 is also noted,
 *     even though it is mandatory on x64.
 *   - Hashes with an asterisk are cryptographic. Note that MD5 is non-cryptographic
 *     by modern standards.
 *   - Small data velocity is a rough average of algorithm's efficiency for small
 *     data. For more accurate information, see the wiki.
 *   - More benchmarks and strength tests are found on the wiki:
 *         https://github.com/Cyan4973/xxHash/wiki
 *
 * Usage
 * ------
 * All xxHash variants use a similar API. Changing the algorithm is a trivial
 * substitution.
 *
 * @pre
 *    For functions which take an input and length parameter, the following
 *    requirements are assumed:
 *    - The range from [`input`, `input + length`) is valid, readable memory.
 *      - The only exception is if the `length` is `0`, `input` may be `NULL`.
 *    - For C++, the objects must have the *TriviallyCopyable* property, as the
 *      functions access bytes directly as if it was an array of `unsigned char`.
 *
 * @anchor single_shot_example
 * **Single Shot**
 *
 * These functions are stateless functions which hash a contiguous block of memory,
 * immediately returning the result. They are the easiest and usually the fastest
 * option.
 *
 * XXH32(), XXH64(), XXH3_64bits(), XXH3_128bits()
 *
 * @code{.c}
 *   #include <string.h>
 *   #include "xxhash.h"
 *
 *   // Example for a function which hashes a null terminated string with XXH32().
 *   XXH32_hash_t hash_string(const char* string, XXH32_hash_t seed)
 *   {
 *       // NULL pointers are only valid if the length is zero
 *       size_t length = (string == NULL) ? 0 : strlen(string);
 *       return XXH32(string, length, seed);
 *   }
 * @endcode
 *
 *
 * @anchor streaming_example
 * **Streaming**
 *
 * These groups of functions allow incremental hashing of unknown size, even
 * more than what would fit in a size_t.
 *
 * XXH32_reset(), XXH64_reset(), XXH3_64bits_reset(), XXH3_128bits_reset()
 *
 * @code{.c}
 *   #include <stdio.h>
 *   #include <assert.h>
 *   #include "xxhash.h"
 *   // Example for a function which hashes a FILE incrementally with XXH3_64bits().
 *   XXH64_hash_t hashFile(FILE* f)
 *   {
 *       // Allocate a state struct. Do not just use malloc() or new.
 *       XXH3_state_t* state = XXH3_createState();
 *       assert(state != NULL && "Out of memory!");
 *       // Reset the state to start a new hashing session.
 *       XXH3_64bits_reset(state);
 *       char buffer[4096];
 *       size_t count;
 *       // Read the file in chunks
 *       while ((count = fread(buffer, 1, sizeof(buffer), f)) != 0) {
 *           // Run update() as many times as necessary to process the data
 *           XXH3_64bits_update(state, buffer, count);
 *       }
 *       // Retrieve the finalized hash. This will not change the state.
 *       XXH64_hash_t result = XXH3_64bits_digest(state);
 *       // Free the state. Do not use free().
 *       XXH3_freeState(state);
 *       return result;
 *   }
 * @endcode
 *
 * Streaming functions generate the xxHash value from an incremental input.
 * This method is slower than single-call functions, due to state management.
 * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized.
 *
 * An XXH state must first be allocated using `XXH*_createState()`.
 *
 * Start a new hash by initializing the state with a seed using `XXH*_reset()`.
 *
 * Then, feed the hash state by calling `XXH*_update()` as many times as necessary.
 *
 * The function returns an error code, with 0 meaning OK, and any other value
 * meaning there is an error.
 *
 * Finally, a hash value can be produced anytime, by using `XXH*_digest()`.
 * This function returns the nn-bits hash as an int or long long.
 *
 * It's still possible to continue inserting input into the hash state after a
 * digest, and generate new hash values later on by invoking `XXH*_digest()`.
 *
 * When done, release the state using `XXH*_freeState()`.
 *
 *
 * @anchor canonical_representation_example
 * **Canonical Representation**
 *
 * The default return values from XXH functions are unsigned 32, 64 and 128 bit
 * integers.
 * This the simplest and fastest format for further post-processing.
 *
 * However, this leaves open the question of what is the order on the byte level,
 * since little and big endian conventions will store the same number differently.
 *
 * The canonical representation settles this issue by mandating big-endian
 * convention, the same convention as human-readable numbers (large digits first).
 *
 * When writing hash values to storage, sending them over a network, or printing
 * them, it's highly recommended to use the canonical representation to ensure
 * portability across a wider range of systems, present and future.
 *
 * The following functions allow transformation of hash values to and from
 * canonical format.
 *
 * XXH32_canonicalFromHash(), XXH32_hashFromCanonical(),
 * XXH64_canonicalFromHash(), XXH64_hashFromCanonical(),
 * XXH128_canonicalFromHash(), XXH128_hashFromCanonical(),
 *
 * @code{.c}
 *   #include <stdio.h>
 *   #include "xxhash.h"
 *
 *   // Example for a function which prints XXH32_hash_t in human readable format
 *   void printXxh32(XXH32_hash_t hash)
 *   {
 *       XXH32_canonical_t cano;
 *       XXH32_canonicalFromHash(&cano, hash);
 *       size_t i;
 *       for(i = 0; i < sizeof(cano.digest); ++i) {
 *           printf("%02x", cano.digest[i]);
 *       }
 *       printf("\n");
 *   }
 *
 *   // Example for a function which converts XXH32_canonical_t to XXH32_hash_t
 *   XXH32_hash_t convertCanonicalToXxh32(XXH32_canonical_t cano)
 *   {
 *       XXH32_hash_t hash = XXH32_hashFromCanonical(&cano);
 *       return hash;
 *   }
 * @endcode
 *
 *
 * @file xxhash.h
 * xxHash prototypes and implementation
 */

#if defined (__cplusplus)
extern "C" {
#endif

/* ****************************
 *  INLINE mode
 ******************************/
/*!
 * @defgroup public Public API
 * Contains details on the public xxHash functions.
 * @{
 */
#ifdef XXH_DOXYGEN
/*!
 * @brief Gives access to internal state declaration, required for static allocation.
 *
 * Incompatible with dynamic linking, due to risks of ABI changes.
 *
 * Usage:
 * @code{.c}
 *     #define XXH_STATIC_LINKING_ONLY
 *     #include "xxhash.h"
 * @endcode
 */
#  define XXH_STATIC_LINKING_ONLY
/* Do not undef XXH_STATIC_LINKING_ONLY for Doxygen */

/*!
 * @brief Gives access to internal definitions.
 *
 * Usage:
 * @code{.c}
 *     #define XXH_STATIC_LINKING_ONLY
 *     #define XXH_IMPLEMENTATION
 *     #include "xxhash.h"
 * @endcode
 */
#  define XXH_IMPLEMENTATION
/* Do not undef XXH_IMPLEMENTATION for Doxygen */

/*!
 * @brief Exposes the implementation and marks all functions as `inline`.
 *
 * Use these build macros to inline xxhash into the target unit.
 * Inlining improves performance on small inputs, especially when the length is
 * expressed as a compile-time constant:
 *
 *  https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html
 *
 * It also keeps xxHash symbols private to the unit, so they are not exported.
 *
 * Usage:
 * @code{.c}
 *     #define XXH_INLINE_ALL
 *     #include "xxhash.h"
 * @endcode
 * Do not compile and link xxhash.o as a separate object, as it is not useful.
 */
#  define XXH_INLINE_ALL
#  undef XXH_INLINE_ALL
/*!
 * @brief Exposes the implementation without marking functions as inline.
 */
#  define XXH_PRIVATE_API
#  undef XXH_PRIVATE_API
/*!
 * @brief Emulate a namespace by transparently prefixing all symbols.
 *
 * If you want to include _and expose_ xxHash functions from within your own
 * library, but also want to avoid symbol collisions with other libraries which
 * may also include xxHash, you can use @ref XXH_NAMESPACE to automatically prefix
 * any public symbol from xxhash library with the value of @ref XXH_NAMESPACE
 * (therefore, avoid empty or numeric values).
 *
 * Note that no change is required within the calling program as long as it
 * includes `xxhash.h`: Regular symbol names will be automatically translated
 * by this header.
 */
#  define XXH_NAMESPACE /* YOUR NAME HERE */
#  undef XXH_NAMESPACE
#endif

#if (defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)) \
    && !defined(XXH_INLINE_ALL_31684351384)
   /* this section should be traversed only once */
#  define XXH_INLINE_ALL_31684351384
   /* give access to the advanced API, required to compile implementations */
#  undef XXH_STATIC_LINKING_ONLY   /* avoid macro redef */
#  define XXH_STATIC_LINKING_ONLY
   /* make all functions private */
#  undef XXH_PUBLIC_API
#  if defined(__GNUC__)
#    define XXH_PUBLIC_API static __inline __attribute__((__unused__))
#  elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
#    define XXH_PUBLIC_API static inline
#  elif defined(_MSC_VER)
#    define XXH_PUBLIC_API static __inline
#  else
     /* note: this version may generate warnings for unused static functions */
#    define XXH_PUBLIC_API static
#  endif

   /*
    * This part deals with the special case where a unit wants to inline xxHash,
    * but "xxhash.h" has previously been included without XXH_INLINE_ALL,
    * such as part of some previously included *.h header file.
    * Without further action, the new include would just be ignored,
    * and functions would effectively _not_ be inlined (silent failure).
    * The following macros solve this situation by prefixing all inlined names,
    * avoiding naming collision with previous inclusions.
    */
   /* Before that, we unconditionally #undef all symbols,
    * in case they were already defined with XXH_NAMESPACE.
    * They will then be redefined for XXH_INLINE_ALL
    */
#  undef XXH_versionNumber
    /* XXH32 */
#  undef XXH32
#  undef XXH32_createState
#  undef XXH32_freeState
#  undef XXH32_reset
#  undef XXH32_update
#  undef XXH32_digest
#  undef XXH32_copyState
#  undef XXH32_canonicalFromHash
#  undef XXH32_hashFromCanonical
    /* XXH64 */
#  undef XXH64
#  undef XXH64_createState
#  undef XXH64_freeState
#  undef XXH64_reset
#  undef XXH64_update
#  undef XXH64_digest
#  undef XXH64_copyState
#  undef XXH64_canonicalFromHash
#  undef XXH64_hashFromCanonical
    /* XXH3_64bits */
#  undef XXH3_64bits
#  undef XXH3_64bits_withSecret
#  undef XXH3_64bits_withSeed
#  undef XXH3_64bits_withSecretandSeed
#  undef XXH3_createState
#  undef XXH3_freeState
#  undef XXH3_copyState
#  undef XXH3_64bits_reset
#  undef XXH3_64bits_reset_withSeed
#  undef XXH3_64bits_reset_withSecret
#  undef XXH3_64bits_update
#  undef XXH3_64bits_digest
#  undef XXH3_generateSecret
    /* XXH3_128bits */
#  undef XXH128
#  undef XXH3_128bits
#  undef XXH3_128bits_withSeed
#  undef XXH3_128bits_withSecret
#  undef XXH3_128bits_reset
#  undef XXH3_128bits_reset_withSeed
#  undef XXH3_128bits_reset_withSecret
#  undef XXH3_128bits_reset_withSecretandSeed
#  undef XXH3_128bits_update
#  undef XXH3_128bits_digest
#  undef XXH128_isEqual
#  undef XXH128_cmp
#  undef XXH128_canonicalFromHash
#  undef XXH128_hashFromCanonical
    /* Finally, free the namespace itself */
#  undef XXH_NAMESPACE

    /* employ the namespace for XXH_INLINE_ALL */
#  define XXH_NAMESPACE XXH_INLINE_
   /*
    * Some identifiers (enums, type names) are not symbols,
    * but they must nonetheless be renamed to avoid redeclaration.
    * Alternative solution: do not redeclare them.
    * However, this requires some #ifdefs, and has a more dispersed impact.
    * Meanwhile, renaming can be achieved in a single place.
    */
#  define XXH_IPREF(Id)   XXH_NAMESPACE ## Id
#  define XXH_OK XXH_IPREF(XXH_OK)
#  define XXH_ERROR XXH_IPREF(XXH_ERROR)
#  define XXH_errorcode XXH_IPREF(XXH_errorcode)
#  define XXH32_canonical_t  XXH_IPREF(XXH32_canonical_t)
#  define XXH64_canonical_t  XXH_IPREF(XXH64_canonical_t)
#  define XXH128_canonical_t XXH_IPREF(XXH128_canonical_t)
#  define XXH32_state_s XXH_IPREF(XXH32_state_s)
#  define XXH32_state_t XXH_IPREF(XXH32_state_t)
#  define XXH64_state_s XXH_IPREF(XXH64_state_s)
#  define XXH64_state_t XXH_IPREF(XXH64_state_t)
#  define XXH3_state_s  XXH_IPREF(XXH3_state_s)
#  define XXH3_state_t  XXH_IPREF(XXH3_state_t)
#  define XXH128_hash_t XXH_IPREF(XXH128_hash_t)
   /* Ensure the header is parsed again, even if it was previously included */
#  undef XXHASH_H_5627135585666179
#  undef XXHASH_H_STATIC_13879238742
#endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */

/* ****************************************************************
 *  Stable API
 *****************************************************************/
#ifndef XXHASH_H_5627135585666179
#define XXHASH_H_5627135585666179 1

/*! @brief Marks a global symbol. */
#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API)
#  if defined(_WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT))
#    ifdef XXH_EXPORT
#      define XXH_PUBLIC_API __declspec(dllexport)
#    elif XXH_IMPORT
#      define XXH_PUBLIC_API __declspec(dllimport)
#    endif
#  else
#    define XXH_PUBLIC_API   /* do nothing */
#  endif
#endif

#ifdef XXH_NAMESPACE
#  define XXH_CAT(A,B) A##B
#  define XXH_NAME2(A,B) XXH_CAT(A,B)
#  define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber)
/* XXH32 */
#  define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32)
#  define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState)
#  define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState)
#  define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset)
#  define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update)
#  define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest)
#  define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState)
#  define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash)
#  define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical)
/* XXH64 */
#  define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64)
#  define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState)
#  define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState)
#  define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset)
#  define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update)
#  define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest)
#  define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState)
#  define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash)
#  define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical)
/* XXH3_64bits */
#  define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits)
#  define XXH3_64bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecret)
#  define XXH3_64bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSeed)
#  define XXH3_64bits_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecretandSeed)
#  define XXH3_createState XXH_NAME2(XXH_NAMESPACE, XXH3_createState)
#  define XXH3_freeState XXH_NAME2(XXH_NAMESPACE, XXH3_freeState)
#  define XXH3_copyState XXH_NAME2(XXH_NAMESPACE, XXH3_copyState)
#  define XXH3_64bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset)
#  define XXH3_64bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSeed)
#  define XXH3_64bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecret)
#  define XXH3_64bits_reset_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecretandSeed)
#  define XXH3_64bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_update)
#  define XXH3_64bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_digest)
#  define XXH3_generateSecret XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret)
#  define XXH3_generateSecret_fromSeed XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret_fromSeed)
/* XXH3_128bits */
#  define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128)
#  define XXH3_128bits XXH_NAME2(XXH_NAMESPACE, XXH3_128bits)
#  define XXH3_128bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed)
#  define XXH3_128bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecret)
#  define XXH3_128bits_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecretandSeed)
#  define XXH3_128bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset)
#  define XXH3_128bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSeed)
#  define XXH3_128bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecret)
#  define XXH3_128bits_reset_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecretandSeed)
#  define XXH3_128bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_update)
#  define XXH3_128bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_digest)
#  define XXH128_isEqual XXH_NAME2(XXH_NAMESPACE, XXH128_isEqual)
#  define XXH128_cmp     XXH_NAME2(XXH_NAMESPACE, XXH128_cmp)
#  define XXH128_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH128_canonicalFromHash)
#  define XXH128_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH128_hashFromCanonical)
#endif


/* *************************************
*  Compiler specifics
***************************************/

/* specific declaration modes for Windows */
#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API)
#  if defined(_WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT))
#    ifdef XXH_EXPORT
#      define XXH_PUBLIC_API __declspec(dllexport)
#    elif XXH_IMPORT
#      define XXH_PUBLIC_API __declspec(dllimport)
#    endif
#  else
#    define XXH_PUBLIC_API   /* do nothing */
#  endif
#endif

#if defined (__GNUC__)
# define XXH_CONSTF  __attribute__((__const__))
# define XXH_PUREF   __attribute__((__pure__))
# define XXH_MALLOCF __attribute__((__malloc__))
#else
# define XXH_CONSTF  /* disable */
# define XXH_PUREF
# define XXH_MALLOCF
#endif

/* *************************************
*  Version
***************************************/
#define XXH_VERSION_MAJOR    0
#define XXH_VERSION_MINOR    8
#define XXH_VERSION_RELEASE  3
/*! @brief Version number, encoded as two digits each */
#define XXH_VERSION_NUMBER  (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE)

/*!
 * @brief Obtains the xxHash version.
 *
 * This is mostly useful when xxHash is compiled as a shared library,
 * since the returned value comes from the library, as opposed to header file.
 *
 * @return @ref XXH_VERSION_NUMBER of the invoked library.
 */
XXH_PUBLIC_API XXH_CONSTF unsigned XXH_versionNumber (void);


/* ****************************
*  Common basic types
******************************/
#include <stddef.h>   /* size_t */
/*!
 * @brief Exit code for the streaming API.
 */
typedef enum {
    XXH_OK = 0, /*!< OK */
    XXH_ERROR   /*!< Error */
} XXH_errorcode;


/*-**********************************************************************
*  32-bit hash
************************************************************************/
#if defined(XXH_DOXYGEN) /* Don't show <stdint.h> include */
/*!
 * @brief An unsigned 32-bit integer.
 *
 * Not necessarily defined to `uint32_t` but functionally equivalent.
 */
typedef uint32_t XXH32_hash_t;

#elif !defined (__VMS) \
  && (defined (__cplusplus) \
  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
#   ifdef _AIX
#     include <inttypes.h>
#   else
#     include <stdint.h>
#   endif
    typedef uint32_t XXH32_hash_t;

#else
#   include <limits.h>
#   if UINT_MAX == 0xFFFFFFFFUL
      typedef unsigned int XXH32_hash_t;
#   elif ULONG_MAX == 0xFFFFFFFFUL
      typedef unsigned long XXH32_hash_t;
#   else
#     error "unsupported platform: need a 32-bit type"
#   endif
#endif

/*!
 * @}
 *
 * @defgroup XXH32_family XXH32 family
 * @ingroup public
 * Contains functions used in the classic 32-bit xxHash algorithm.
 *
 * @note
 *   XXH32 is useful for older platforms, with no or poor 64-bit performance.
 *   Note that the @ref XXH3_family provides competitive speed for both 32-bit
 *   and 64-bit systems, and offers true 64/128 bit hash results.
 *
 * @see @ref XXH64_family, @ref XXH3_family : Other xxHash families
 * @see @ref XXH32_impl for implementation details
 * @{
 */

/*!
 * @brief Calculates the 32-bit hash of @p input using xxHash32.
 *
 * @param input The block of data to be hashed, at least @p length bytes in size.
 * @param length The length of @p input, in bytes.
 * @param seed The 32-bit seed to alter the hash's output predictably.
 *
 * @pre
 *   The memory between @p input and @p input + @p length must be valid,
 *   readable, contiguous memory. However, if @p length is `0`, @p input may be
 *   `NULL`. In C++, this also must be *TriviallyCopyable*.
 *
 * @return The calculated 32-bit xxHash32 value.
 *
 * @see @ref single_shot_example "Single Shot Example" for an example.
 */
XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32 (const void* input, size_t length, XXH32_hash_t seed);

#ifndef XXH_NO_STREAM
/*!
 * @typedef struct XXH32_state_s XXH32_state_t
 * @brief The opaque state struct for the XXH32 streaming API.
 *
 * @see XXH32_state_s for details.
 * @see @ref streaming_example "Streaming Example"
 */
typedef struct XXH32_state_s XXH32_state_t;

/*!
 * @brief Allocates an @ref XXH32_state_t.
 *
 * @return An allocated pointer of @ref XXH32_state_t on success.
 * @return `NULL` on failure.
 *
 * @note Must be freed with XXH32_freeState().
 *
 * @see @ref streaming_example "Streaming Example"
 */
XXH_PUBLIC_API XXH_MALLOCF XXH32_state_t* XXH32_createState(void);
/*!
 * @brief Frees an @ref XXH32_state_t.
 *
 * @param statePtr A pointer to an @ref XXH32_state_t allocated with @ref XXH32_createState().
 *
 * @return @ref XXH_OK.
 *
 * @note @p statePtr must be allocated with XXH32_createState().
 *
 * @see @ref streaming_example "Streaming Example"
 *
 */
XXH_PUBLIC_API XXH_errorcode  XXH32_freeState(XXH32_state_t* statePtr);
/*!
 * @brief Copies one @ref XXH32_state_t to another.
 *
 * @param dst_state The state to copy to.
 * @param src_state The state to copy from.
 * @pre
 *   @p dst_state and @p src_state must not be `NULL` and must not overlap.
 */
XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dst_state, const XXH32_state_t* src_state);

/*!
 * @brief Resets an @ref XXH32_state_t to begin a new hash.
 *
 * @param statePtr The state struct to reset.
 * @param seed The 32-bit seed to alter the hash result predictably.
 *
 * @pre
 *   @p statePtr must not be `NULL`.
 *
 * @return @ref XXH_OK on success.
 * @return @ref XXH_ERROR on failure.
 *
 * @note This function resets and seeds a state. Call it before @ref XXH32_update().
 *
 * @see @ref streaming_example "Streaming Example"
 */
XXH_PUBLIC_API XXH_errorcode XXH32_reset  (XXH32_state_t* statePtr, XXH32_hash_t seed);

/*!
 * @brief Consumes a block of @p input to an @ref XXH32_state_t.
 *
 * @param statePtr The state struct to update.
 * @param input The block of data to be hashed, at least @p length bytes in size.
 * @param length The length of @p input, in bytes.
 *
 * @pre
 *   @p statePtr must not be `NULL`.
 * @pre
 *   The memory between @p input and @p input + @p length must be valid,
 *   readable, contiguous memory. However, if @p length is `0`, @p input may be
 *   `NULL`. In C++, this also must be *TriviallyCopyable*.
 *
 * @return @ref XXH_OK on success.
 * @return @ref XXH_ERROR on failure.
 *
 * @note Call this to incrementally consume blocks of data.
 *
 * @see @ref streaming_example "Streaming Example"
 */
XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length);

/*!
 * @brief Returns the calculated hash value from an @ref XXH32_state_t.
 *
 * @param statePtr The state struct to calculate the hash from.
 *
 * @pre
 *  @p statePtr must not be `NULL`.
 *
 * @return The calculated 32-bit xxHash32 value from that state.
 *
 * @note
 *   Calling XXH32_digest() will not affect @p statePtr, so you can update,
 *   digest, and update again.
 *
 * @see @ref streaming_example "Streaming Example"
 */
XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_digest (const XXH32_state_t* statePtr);
#endif /* !XXH_NO_STREAM */

/*******   Canonical representation   *******/

/*!
 * @brief Canonical (big endian) representation of @ref XXH32_hash_t.
 */
typedef struct {
    unsigned char digest[4]; /*!< Hash bytes, big endian */
} XXH32_canonical_t;

/*!
 * @brief Converts an @ref XXH32_hash_t to a big endian @ref XXH32_canonical_t.
 *
 * @param dst  The @ref XXH32_canonical_t pointer to be stored to.
 * @param hash The @ref XXH32_hash_t to be converted.
 *
 * @pre
 *   @p dst must not be `NULL`.
 *
 * @see @ref canonical_representation_example "Canonical Representation Example"
 */
XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash);

/*!
 * @brief Converts an @ref XXH32_canonical_t to a native @ref XXH32_hash_t.
 *
 * @param src The @ref XXH32_canonical_t to convert.
 *
 * @pre
 *   @p src must not be `NULL`.
 *
 * @return The converted hash.
 *
 * @see @ref canonical_representation_example "Canonical Representation Example"
 */
XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src);


/*! @cond Doxygen ignores this part */
#ifdef __has_attribute
# define XXH_HAS_ATTRIBUTE(x) __has_attribute(x)
#else
# define XXH_HAS_ATTRIBUTE(x) 0
#endif
/*! @endcond */

/*! @cond Doxygen ignores this part */
/*
 * C23 __STDC_VERSION__ number hasn't been specified yet. For now
 * leave as `201711L` (C17 + 1).
 * TODO: Update to correct value when its been specified.
 */
#define XXH_C23_VN 201711L
/*! @endcond */

/*! @cond Doxygen ignores this part */
/* C-language Attributes are added in C23. */
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= XXH_C23_VN) && defined(__has_c_attribute)
# define XXH_HAS_C_ATTRIBUTE(x) __has_c_attribute(x)
#else
# define XXH_HAS_C_ATTRIBUTE(x) 0
#endif
/*! @endcond */

/*! @cond Doxygen ignores this part */
#if defined(__cplusplus) && defined(__has_cpp_attribute)
# define XXH_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x)
#else
# define XXH_HAS_CPP_ATTRIBUTE(x) 0
#endif
/*! @endcond */

/*! @cond Doxygen ignores this part */
/*
 * Define XXH_FALLTHROUGH macro for annotating switch case with the 'fallthrough' attribute
 * introduced in CPP17 and C23.
 * CPP17 : https://en.cppreference.com/w/cpp/language/attributes/fallthrough
 * C23   : https://en.cppreference.com/w/c/language/attributes/fallthrough
 */
#if XXH_HAS_C_ATTRIBUTE(fallthrough) || XXH_HAS_CPP_ATTRIBUTE(fallthrough)
# define XXH_FALLTHROUGH [[fallthrough]]
#elif XXH_HAS_ATTRIBUTE(__fallthrough__)
# define XXH_FALLTHROUGH __attribute__ ((__fallthrough__))
#else
# define XXH_FALLTHROUGH /* fallthrough */
#endif
/*! @endcond */

/*! @cond Doxygen ignores this part */
/*
 * Define XXH_NOESCAPE for annotated pointers in public API.
 * https://clang.llvm.org/docs/AttributeReference.html#noescape
 * As of writing this, only supported by clang.
 */
#if XXH_HAS_ATTRIBUTE(noescape)
# define XXH_NOESCAPE __attribute__((__noescape__))
#else
# define XXH_NOESCAPE
#endif
/*! @endcond */


/*!
 * @}
 * @ingroup public
 * @{
 */

#ifndef XXH_NO_LONG_LONG
/*-**********************************************************************
*  64-bit hash
************************************************************************/
#if defined(XXH_DOXYGEN) /* don't include <stdint.h> */
/*!
 * @brief An unsigned 64-bit integer.
 *
 * Not necessarily defined to `uint64_t` but functionally equivalent.
 */
typedef uint64_t XXH64_hash_t;
#elif !defined (__VMS) \
  && (defined (__cplusplus) \
  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
#   ifdef _AIX
#     include <inttypes.h>
#   else
#     include <stdint.h>
#   endif
   typedef uint64_t XXH64_hash_t;
#else
#  include <limits.h>
#  if defined(__LP64__) && ULONG_MAX == 0xFFFFFFFFFFFFFFFFULL
     /* LP64 ABI says uint64_t is unsigned long */
     typedef unsigned long XXH64_hash_t;
#  else
     /* the following type must have a width of 64-bit */
     typedef unsigned long long XXH64_hash_t;
#  endif
#endif

/*!
 * @}
 *
 * @defgroup XXH64_family XXH64 family
 * @ingroup public
 * @{
 * Contains functions used in the classic 64-bit xxHash algorithm.
 *
 * @note
 *   XXH3 provides competitive speed for both 32-bit and 64-bit systems,
 *   and offers true 64/128 bit hash results.
 *   It provides better speed for systems with vector processing capabilities.
 */

/*!
 * @brief Calculates the 64-bit hash of @p input using xxHash64.
 *
 * @param input The block of data to be hashed, at least @p length bytes in size.
 * @param length The length of @p input, in bytes.
 * @param seed The 64-bit seed to alter the hash's output predictably.
 *
 * @pre
 *   The memory between @p input and @p input + @p length must be valid,
 *   readable, contiguous memory. However, if @p length is `0`, @p input may be
 *   `NULL`. In C++, this also must be *TriviallyCopyable*.
 *
 * @return The calculated 64-bit xxHash64 value.
 *
 * @see @ref single_shot_example "Single Shot Example" for an example.
 */
XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed);

/*******   Streaming   *******/
#ifndef XXH_NO_STREAM
/*!
 * @brief The opaque state struct for the XXH64 streaming API.
 *
 * @see XXH64_state_s for details.
 * @see @ref streaming_example "Streaming Example"
 */
typedef struct XXH64_state_s XXH64_state_t;   /* incomplete type */

/*!
 * @brief Allocates an @ref XXH64_state_t.
 *
 * @return An allocated pointer of @ref XXH64_state_t on success.
 * @return `NULL` on failure.
 *
 * @note Must be freed with XXH64_freeState().
 *
 * @see @ref streaming_example "Streaming Example"
 */
XXH_PUBLIC_API XXH_MALLOCF XXH64_state_t* XXH64_createState(void);

/*!
 * @brief Frees an @ref XXH64_state_t.
 *
 * @param statePtr A pointer to an @ref XXH64_state_t allocated with @ref XXH64_createState().
 *
 * @return @ref XXH_OK.
 *
 * @note @p statePtr must be allocated with XXH64_createState().
 *
 * @see @ref streaming_example "Streaming Example"
 */
XXH_PUBLIC_API XXH_errorcode  XXH64_freeState(XXH64_state_t* statePtr);

/*!
 * @brief Copies one @ref XXH64_state_t to another.
 *
 * @param dst_state The state to copy to.
 * @param src_state The state to copy from.
 * @pre
 *   @p dst_state and @p src_state must not be `NULL` and must not overlap.
 */
XXH_PUBLIC_API void XXH64_copyState(XXH_NOESCAPE XXH64_state_t* dst_state, const XXH64_state_t* src_state);

/*!
 * @brief Resets an @ref XXH64_state_t to begin a new hash.
 *
 * @param statePtr The state struct to reset.
 * @param seed The 64-bit seed to alter the hash result predictably.
 *
 * @pre
 *   @p statePtr must not be `NULL`.
 *
 * @return @ref XXH_OK on success.
 * @return @ref XXH_ERROR on failure.
 *
 * @note This function resets and seeds a state. Call it before @ref XXH64_update().
 *
 * @see @ref streaming_example "Streaming Example"
 */
XXH_PUBLIC_API XXH_errorcode XXH64_reset  (XXH_NOESCAPE XXH64_state_t* statePtr, XXH64_hash_t seed);

/*!
 * @brief Consumes a block of @p input to an @ref XXH64_state_t.
 *
 * @param statePtr The state struct to update.
 * @param input The block of data to be hashed, at least @p length bytes in size.
 * @param length The length of @p input, in bytes.
 *
 * @pre
 *   @p statePtr must not be `NULL`.
 * @pre
 *   The memory between @p input and @p input + @p length must be valid,
 *   readable, contiguous memory. However, if @p length is `0`, @p input may be
 *   `NULL`. In C++, this also must be *TriviallyCopyable*.
 *
 * @return @ref XXH_OK on success.
 * @return @ref XXH_ERROR on failure.
 *
 * @note Call this to incrementally consume blocks of data.
 *
 * @see @ref streaming_example "Streaming Example"
 */
XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH_NOESCAPE XXH64_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length);

/*!
 * @brief Returns the calculated hash value from an @ref XXH64_state_t.
 *
 * @param statePtr The state struct to calculate the hash from.
 *
 * @pre
 *  @p statePtr must not be `NULL`.
 *
 * @return The calculated 64-bit xxHash64 value from that state.
 *
 * @note
 *   Calling XXH64_digest() will not affect @p statePtr, so you can update,
 *   digest, and update again.
 *
 * @see @ref streaming_example "Streaming Example"
 */
XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_digest (XXH_NOESCAPE const XXH64_state_t* statePtr);
#endif /* !XXH_NO_STREAM */
/*******   Canonical representation   *******/

/*!
 * @brief Canonical (big endian) representation of @ref XXH64_hash_t.
 */
typedef struct { unsigned char digest[sizeof(XXH64_hash_t)]; } XXH64_canonical_t;

/*!
 * @brief Converts an @ref XXH64_hash_t to a big endian @ref XXH64_canonical_t.
 *
 * @param dst The @ref XXH64_canonical_t pointer to be stored to.
 * @param hash The @ref XXH64_hash_t to be converted.
 *
 * @pre
 *   @p dst must not be `NULL`.
 *
 * @see @ref canonical_representation_example "Canonical Representation Example"
 */
XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH_NOESCAPE XXH64_canonical_t* dst, XXH64_hash_t hash);

/*!
 * @brief Converts an @ref XXH64_canonical_t to a native @ref XXH64_hash_t.
 *
 * @param src The @ref XXH64_canonical_t to convert.
 *
 * @pre
 *   @p src must not be `NULL`.
 *
 * @return The converted hash.
 *
 * @see @ref canonical_representation_example "Canonical Representation Example"
 */
XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_canonical_t* src);

#ifndef XXH_NO_XXH3

/*!
 * @}
 * ************************************************************************
 * @defgroup XXH3_family XXH3 family
 * @ingroup public
 * @{
 *
 * XXH3 is a more recent hash algorithm featuring:
 *  - Improved speed for both small and large inputs
 *  - True 64-bit and 128-bit outputs
 *  - SIMD acceleration
 *  - Improved 32-bit viability
 *
 * Speed analysis methodology is explained here:
 *
 *    https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html
 *
 * Compared to XXH64, expect XXH3 to run approximately
 * ~2x faster on large inputs and >3x faster on small ones,
 * exact differences vary depending on platform.
 *
 * XXH3's speed benefits greatly from SIMD and 64-bit arithmetic,
 * but does not require it.
 * Most 32-bit and 64-bit targets that can run XXH32 smoothly can run XXH3
 * at competitive speeds, even without vector support. Further details are
 * explained in the implementation.
 *
 * XXH3 has a fast scalar implementation, but it also includes accelerated SIMD
 * implementations for many common platforms:
 *   - AVX512
 *   - AVX2
 *   - SSE2
 *   - ARM NEON
 *   - WebAssembly SIMD128
 *   - POWER8 VSX
 *   - s390x ZVector
 * This can be controlled via the @ref XXH_VECTOR macro, but it automatically
 * selects the best version according to predefined macros. For the x86 family, an
 * automatic runtime dispatcher is included separately in @ref xxh_x86dispatch.c.
 *
 * XXH3 implementation is portable:
 * it has a generic C90 formulation that can be compiled on any platform,
 * all implementations generate exactly the same hash value on all platforms.
 * Starting from v0.8.0, it's also labelled "stable", meaning that
 * any future version will also generate the same hash value.
 *
 * XXH3 offers 2 variants, _64bits and _128bits.
 *
 * When only 64 bits are needed, prefer invoking the _64bits variant, as it
 * reduces the amount of mixing, resulting in faster speed on small inputs.
 * It's also generally simpler to manipulate a scalar return type than a struct.
 *
 * The API supports one-shot hashing, streaming mode, and custom secrets.
 */

/*!
 * @ingroup tuning
 * @brief Possible values for @ref XXH_VECTOR.
 *
 * Unless set explicitly, determined automatically.
 */
#  define XXH_SCALAR 0 /*!< Portable scalar version */
#  define XXH_SSE2   1 /*!< SSE2 for Pentium 4, Opteron, all x86_64. */
#  define XXH_AVX2   2 /*!< AVX2 for Haswell and Bulldozer */
#  define XXH_AVX512 3 /*!< AVX512 for Skylake and Icelake */
#  define XXH_NEON   4 /*!< NEON for most ARMv7-A, all AArch64, and WASM SIMD128 */
#  define XXH_VSX    5 /*!< VSX and ZVector for POWER8/z13 (64-bit) */
#  define XXH_SVE    6 /*!< SVE for some ARMv8-A and ARMv9-A */
#  define XXH_LSX    7 /*!< LSX (128-bit SIMD) for LoongArch64 */


/*-**********************************************************************
*  XXH3 64-bit variant
************************************************************************/

/*!
 * @brief Calculates 64-bit unseeded variant of XXH3 hash of @p input.
 *
 * @param input  The block of data to be hashed, at least @p length bytes in size.
 * @param length The length of @p input, in bytes.
 *
 * @pre
 *   The memory between @p input and @p input + @p length must be valid,
 *   readable, contiguous memory. However, if @p length is `0`, @p input may be
 *   `NULL`. In C++, this also must be *TriviallyCopyable*.
 *
 * @return The calculated 64-bit XXH3 hash value.
 *
 * @note
 *   This is equivalent to @ref XXH3_64bits_withSeed() with a seed of `0`, however
 *   it may have slightly better performance due to constant propagation of the
 *   defaults.
 *
 * @see
 *    XXH3_64bits_withSeed(), XXH3_64bits_withSecret(): other seeding variants
 * @see @ref single_shot_example "Single Shot Example" for an example.
 */
XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits(XXH_NOESCAPE const void* input, size_t length);

/*!
 * @brief Calculates 64-bit seeded variant of XXH3 hash of @p input.
 *
 * @param input  The block of data to be hashed, at least @p length bytes in size.
 * @param length The length of @p input, in bytes.
 * @param seed   The 64-bit seed to alter the hash result predictably.
 *
 * @pre
 *   The memory between @p input and @p input + @p length must be valid,
 *   readable, contiguous memory. However, if @p length is `0`, @p input may be
 *   `NULL`. In C++, this also must be *TriviallyCopyable*.
 *
 * @return The calculated 64-bit XXH3 hash value.
 *
 * @note
 *    seed == 0 produces the same results as @ref XXH3_64bits().
 *
 * This variant generates a custom secret on the fly based on default secret
 * altered using the @p seed value.
 *
 * While this operation is decently fast, note that it's not completely free.
 *
 * @see @ref single_shot_example "Single Shot Example" for an example.
 */
XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSeed(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed);

/*!
 * The bare minimum size for a custom secret.
 *
 * @see
 *  XXH3_64bits_withSecret(), XXH3_64bits_reset_withSecret(),
 *  XXH3_128bits_withSecret(), XXH3_128bits_reset_withSecret().
 */
#define XXH3_SECRET_SIZE_MIN 136

/*!
 * @brief Calculates 64-bit variant of XXH3 with a custom "secret".
 *
 * @param data       The block of data to be hashed, at least @p len bytes in size.
 * @param len        The length of @p data, in bytes.
 * @param secret     The secret data.
 * @param secretSize The length of @p secret, in bytes.
 *
 * @return The calculated 64-bit XXH3 hash value.
 *
 * @pre
 *   The memory between @p data and @p data + @p len must be valid,
 *   readable, contiguous memory. However, if @p length is `0`, @p data may be
 *   `NULL`. In C++, this also must be *TriviallyCopyable*.
 *
 * It's possible to provide any blob of bytes as a "secret" to generate the hash.
 * This makes it more difficult for an external actor to prepare an intentional collision.
 * The main condition is that @p secretSize *must* be large enough (>= @ref XXH3_SECRET_SIZE_MIN).
 * However, the quality of the secret impacts the dispersion of the hash algorithm.
 * Therefore, the secret _must_ look like a bunch of random bytes.
 * Avoid "trivial" or structured data such as repeated sequences or a text document.
 * Whenever in doubt about the "randomness" of the blob of bytes,
 * consider employing @ref XXH3_generateSecret() instead (see below).
 * It will generate a proper high entropy secret derived from the blob of bytes.
 * Another advantage of using XXH3_generateSecret() is that
 * it guarantees that all bits within the initial blob of bytes
 * will impact every bit of the output.
 * This is not necessarily the case when using the blob of bytes directly
 * because, when hashing _small_ inputs, only a portion of the secret is employed.
 *
 * @see @ref single_shot_example "Single Shot Example" for an example.
 */
XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSecret(XXH_NOESCAPE const void* data, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize);


/*******   Streaming   *******/
#ifndef XXH_NO_STREAM
/*
 * Streaming requires state maintenance.
 * This operation costs memory and CPU.
 * As a consequence, streaming is slower than one-shot hashing.
 * For better performance, prefer one-shot functions whenever applicable.
 */

/*!
 * @brief The opaque state struct for the XXH3 streaming API.
 *
 * @see XXH3_state_s for details.
 * @see @ref streaming_example "Streaming Example"
 */
typedef struct XXH3_state_s XXH3_state_t;
XXH_PUBLIC_API XXH_MALLOCF XXH3_state_t* XXH3_createState(void);
XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr);

/*!
 * @brief Copies one @ref XXH3_state_t to another.
 *
 * @param dst_state The state to copy to.
 * @param src_state The state to copy from.
 * @pre
 *   @p dst_state and @p src_state must not be `NULL` and must not overlap.
 */
XXH_PUBLIC_API void XXH3_copyState(XXH_NOESCAPE XXH3_state_t* dst_state, XXH_NOESCAPE const XXH3_state_t* src_state);

/*!
 * @brief Resets an @ref XXH3_state_t to begin a new hash.
 *
 * @param statePtr The state struct to reset.
 *
 * @pre
 *   @p statePtr must not be `NULL`.
 *
 * @return @ref XXH_OK on success.
 * @return @ref XXH_ERROR on failure.
 *
 * @note
 *   - This function resets `statePtr` and generate a secret with default parameters.
 *   - Call this function before @ref XXH3_64bits_update().
 *   - Digest will be equivalent to `XXH3_64bits()`.
 *
 * @see @ref streaming_example "Streaming Example"
 *
 */
XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr);

/*!
 * @brief Resets an @ref XXH3_state_t with 64-bit seed to begin a new hash.
 *
 * @param statePtr The state struct to reset.
 * @param seed     The 64-bit seed to alter the hash result predictably.
 *
 * @pre
 *   @p statePtr must not be `NULL`.
 *
 * @return @ref XXH_OK on success.
 * @return @ref XXH_ERROR on failure.
 *
 * @note
 *   - This function resets `statePtr` and generate a secret from `seed`.
 *   - Call this function before @ref XXH3_64bits_update().
 *   - Digest will be equivalent to `XXH3_64bits_withSeed()`.
 *
 * @see @ref streaming_example "Streaming Example"
 *
 */
XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed);

/*!
 * @brief Resets an @ref XXH3_state_t with secret data to begin a new hash.
 *
 * @param statePtr The state struct to reset.
 * @param secret     The secret data.
 * @param secretSize The length of @p secret, in bytes.
 *
 * @pre
 *   @p statePtr must not be `NULL`.
 *
 * @return @ref XXH_OK on success.
 * @return @ref XXH_ERROR on failure.
 *
 * @note
 *   `secret` is referenced, it _must outlive_ the hash streaming session.
 *
 * Similar to one-shot API, `secretSize` must be >= @ref XXH3_SECRET_SIZE_MIN,
 * and the quality of produced hash values depends on secret's entropy
 * (secret's content should look like a bunch of random bytes).
 * When in doubt about the randomness of a candidate `secret`,
 * consider employing `XXH3_generateSecret()` instead (see below).
 *
 * @see @ref streaming_example "Streaming Example"
 */
XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize);

/*!
 * @brief Consumes a block of @p input to an @ref XXH3_state_t.
 *
 * @param statePtr The state struct to update.
 * @param input The block of data to be hashed, at least @p length bytes in size.
 * @param length The length of @p input, in bytes.
 *
 * @pre
 *   @p statePtr must not be `NULL`.
 * @pre
 *   The memory between @p input and @p input + @p length must be valid,
 *   readable, contiguous memory. However, if @p length is `0`, @p input may be
 *   `NULL`. In C++, this also must be *TriviallyCopyable*.
 *
 * @return @ref XXH_OK on success.
 * @return @ref XXH_ERROR on failure.
 *
 * @note Call this to incrementally consume blocks of data.
 *
 * @see @ref streaming_example "Streaming Example"
 */
XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length);

/*!
 * @brief Returns the calculated XXH3 64-bit hash value from an @ref XXH3_state_t.
 *
 * @param statePtr The state struct to calculate the hash from.
 *
 * @pre
 *  @p statePtr must not be `NULL`.
 *
 * @return The calculated XXH3 64-bit hash value from that state.
 *
 * @note
 *   Calling XXH3_64bits_digest() will not affect @p statePtr, so you can update,
 *   digest, and update again.
 *
 * @see @ref streaming_example "Streaming Example"
 */
XXH_PUBLIC_API XXH_PUREF XXH64_hash_t  XXH3_64bits_digest (XXH_NOESCAPE const XXH3_state_t* statePtr);
#endif /* !XXH_NO_STREAM */

/* note : canonical representation of XXH3 is the same as XXH64
 * since they both produce XXH64_hash_t values */


/*-**********************************************************************
*  XXH3 128-bit variant
************************************************************************/

/*!
 * @brief The return value from 128-bit hashes.
 *
 * Stored in little endian order, although the fields themselves are in native
 * endianness.
 */
typedef struct {
    XXH64_hash_t low64;   /*!< `value & 0xFFFFFFFFFFFFFFFF` */
    XXH64_hash_t high64;  /*!< `value >> 64` */
} XXH128_hash_t;

/*!
 * @brief Calculates 128-bit unseeded variant of XXH3 of @p data.
 *
 * @param data The block of data to be hashed, at least @p length bytes in size.
 * @param len  The length of @p data, in bytes.
 *
 * @return The calculated 128-bit variant of XXH3 value.
 *
 * The 128-bit variant of XXH3 has more strength, but it has a bit of overhead
 * for shorter inputs.
 *
 * This is equivalent to @ref XXH3_128bits_withSeed() with a seed of `0`, however
 * it may have slightly better performance due to constant propagation of the
 * defaults.
 *
 * @see XXH3_128bits_withSeed(), XXH3_128bits_withSecret(): other seeding variants
 * @see @ref single_shot_example "Single Shot Example" for an example.
 */
XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits(XXH_NOESCAPE const void* data, size_t len);
/*! @brief Calculates 128-bit seeded variant of XXH3 hash of @p data.
 *
 * @param data The block of data to be hashed, at least @p length bytes in size.
 * @param len  The length of @p data, in bytes.
 * @param seed The 64-bit seed to alter the hash result predictably.
 *
 * @return The calculated 128-bit variant of XXH3 value.
 *
 * @note
 *    seed == 0 produces the same results as @ref XXH3_64bits().
 *
 * This variant generates a custom secret on the fly based on default secret
 * altered using the @p seed value.
 *
 * While this operation is decently fast, note that it's not completely free.
 *
 * @see XXH3_128bits(), XXH3_128bits_withSecret(): other seeding variants
 * @see @ref single_shot_example "Single Shot Example" for an example.
 */
XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSeed(XXH_NOESCAPE const void* data, size_t len, XXH64_hash_t seed);
/*!
 * @brief Calculates 128-bit variant of XXH3 with a custom "secret".
 *
 * @param data       The block of data to be hashed, at least @p len bytes in size.
 * @param len        The length of @p data, in bytes.
 * @param secret     The secret data.
 * @param secretSize The length of @p secret, in bytes.
 *
 * @return The calculated 128-bit variant of XXH3 value.
 *
 * It's possible to provide any blob of bytes as a "secret" to generate the hash.
 * This makes it more difficult for an external actor to prepare an intentional collision.
 * The main condition is that @p secretSize *must* be large enough (>= @ref XXH3_SECRET_SIZE_MIN).
 * However, the quality of the secret impacts the dispersion of the hash algorithm.
 * Therefore, the secret _must_ look like a bunch of random bytes.
 * Avoid "trivial" or structured data such as repeated sequences or a text document.
 * Whenever in doubt about the "randomness" of the blob of bytes,
 * consider employing @ref XXH3_generateSecret() instead (see below).
 * It will generate a proper high entropy secret derived from the blob of bytes.
 * Another advantage of using XXH3_generateSecret() is that
 * it guarantees that all bits within the initial blob of bytes
 * will impact every bit of the output.
 * This is not necessarily the case when using the blob of bytes directly
 * because, when hashing _small_ inputs, only a portion of the secret is employed.
 *
 * @see @ref single_shot_example "Single Shot Example" for an example.
 */
XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSecret(XXH_NOESCAPE const void* data, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize);

/*******   Streaming   *******/
#ifndef XXH_NO_STREAM
/*
 * Streaming requires state maintenance.
 * This operation costs memory and CPU.
 * As a consequence, streaming is slower than one-shot hashing.
 * For better performance, prefer one-shot functions whenever applicable.
 *
 * XXH3_128bits uses the same XXH3_state_t as XXH3_64bits().
 * Use already declared XXH3_createState() and XXH3_freeState().
 *
 * All reset and streaming functions have same meaning as their 64-bit counterpart.
 */

/*!
 * @brief Resets an @ref XXH3_state_t to begin a new hash.
 *
 * @param statePtr The state struct to reset.
 *
 * @pre
 *   @p statePtr must not be `NULL`.
 *
 * @return @ref XXH_OK on success.
 * @return @ref XXH_ERROR on failure.
 *
 * @note
 *   - This function resets `statePtr` and generate a secret with default parameters.
 *   - Call it before @ref XXH3_128bits_update().
 *   - Digest will be equivalent to `XXH3_128bits()`.
 *
 * @see @ref streaming_example "Streaming Example"
 */
XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr);

/*!
 * @brief Resets an @ref XXH3_state_t with 64-bit seed to begin a new hash.
 *
 * @param statePtr The state struct to reset.
 * @param seed     The 64-bit seed to alter the hash result predictably.
 *
 * @pre
 *   @p statePtr must not be `NULL`.
 *
 * @return @ref XXH_OK on success.
 * @return @ref XXH_ERROR on failure.
 *
 * @note
 *   - This function resets `statePtr` and generate a secret from `seed`.
 *   - Call it before @ref XXH3_128bits_update().
 *   - Digest will be equivalent to `XXH3_128bits_withSeed()`.
 *
 * @see @ref streaming_example "Streaming Example"
 */
XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed);
/*!
 * @brief Resets an @ref XXH3_state_t with secret data to begin a new hash.
 *
 * @param statePtr   The state struct to reset.
 * @param secret     The secret data.
 * @param secretSize The length of @p secret, in bytes.
 *
 * @pre
 *   @p statePtr must not be `NULL`.
 *
 * @return @ref XXH_OK on success.
 * @return @ref XXH_ERROR on failure.
 *
 * `secret` is referenced, it _must outlive_ the hash streaming session.
 * Similar to one-shot API, `secretSize` must be >= @ref XXH3_SECRET_SIZE_MIN,
 * and the quality of produced hash values depends on secret's entropy
 * (secret's content should look like a bunch of random bytes).
 * When in doubt about the randomness of a candidate `secret`,
 * consider employing `XXH3_generateSecret()` instead (see below).
 *
 * @see @ref streaming_example "Streaming Example"
 */
XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize);

/*!
 * @brief Consumes a block of @p input to an @ref XXH3_state_t.
 *
 * Call this to incrementally consume blocks of data.
 *
 * @param statePtr The state struct to update.
 * @param input The block of data to be hashed, at least @p length bytes in size.
 * @param length The length of @p input, in bytes.
 *
 * @pre
 *   @p statePtr must not be `NULL`.
 *
 * @return @ref XXH_OK on success.
 * @return @ref XXH_ERROR on failure.
 *
 * @note
 *   The memory between @p input and @p input + @p length must be valid,
 *   readable, contiguous memory. However, if @p length is `0`, @p input may be
 *   `NULL`. In C++, this also must be *TriviallyCopyable*.
 *
 */
XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length);

/*!
 * @brief Returns the calculated XXH3 128-bit hash value from an @ref XXH3_state_t.
 *
 * @param statePtr The state struct to calculate the hash from.
 *
 * @pre
 *  @p statePtr must not be `NULL`.
 *
 * @return The calculated XXH3 128-bit hash value from that state.
 *
 * @note
 *   Calling XXH3_128bits_digest() will not affect @p statePtr, so you can update,
 *   digest, and update again.
 *
 */
XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_digest (XXH_NOESCAPE const XXH3_state_t* statePtr);
#endif /* !XXH_NO_STREAM */

/* Following helper functions make it possible to compare XXH128_hast_t values.
 * Since XXH128_hash_t is a structure, this capability is not offered by the language.
 * Note: For better performance, these functions can be inlined using XXH_INLINE_ALL */

/*!
 * @brief Check equality of two XXH128_hash_t values
 *
 * @param h1 The 128-bit hash value.
 * @param h2 Another 128-bit hash value.
 *
 * @return `1` if `h1` and `h2` are equal.
 * @return `0` if they are not.
 */
XXH_PUBLIC_API XXH_PUREF int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2);

/*!
 * @brief Compares two @ref XXH128_hash_t
 *
 * This comparator is compatible with stdlib's `qsort()`/`bsearch()`.
 *
 * @param h128_1 Left-hand side value
 * @param h128_2 Right-hand side value
 *
 * @return >0 if @p h128_1  > @p h128_2
 * @return =0 if @p h128_1 == @p h128_2
 * @return <0 if @p h128_1  < @p h128_2
 */
XXH_PUBLIC_API XXH_PUREF int XXH128_cmp(XXH_NOESCAPE const void* h128_1, XXH_NOESCAPE const void* h128_2);


/*******   Canonical representation   *******/
typedef struct { unsigned char digest[sizeof(XXH128_hash_t)]; } XXH128_canonical_t;


/*!
 * @brief Converts an @ref XXH128_hash_t to a big endian @ref XXH128_canonical_t.
 *
 * @param dst  The @ref XXH128_canonical_t pointer to be stored to.
 * @param hash The @ref XXH128_hash_t to be converted.
 *
 * @pre
 *   @p dst must not be `NULL`.
 * @see @ref canonical_representation_example "Canonical Representation Example"
 */
XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH_NOESCAPE XXH128_canonical_t* dst, XXH128_hash_t hash);

/*!
 * @brief Converts an @ref XXH128_canonical_t to a native @ref XXH128_hash_t.
 *
 * @param src The @ref XXH128_canonical_t to convert.
 *
 * @pre
 *   @p src must not be `NULL`.
 *
 * @return The converted hash.
 * @see @ref canonical_representation_example "Canonical Representation Example"
 */
XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128_hashFromCanonical(XXH_NOESCAPE const XXH128_canonical_t* src);


#endif  /* !XXH_NO_XXH3 */
#endif  /* XXH_NO_LONG_LONG */

/*!
 * @}
 */
#endif /* XXHASH_H_5627135585666179 */


#if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742)
#define XXHASH_H_STATIC_13879238742
/* ****************************************************************************
 * This section contains declarations which are not guaranteed to remain stable.
 * They may change in future versions, becoming incompatible with a different
 * version of the library.
 * These declarations should only be used with static linking.
 * Never use them in association with dynamic linking!
 ***************************************************************************** */

/*
 * These definitions are only present to allow static allocation
 * of XXH states, on stack or in a struct, for example.
 * Never **ever** access their members directly.
 */

/*!
 * @internal
 * @brief Structure for XXH32 streaming API.
 *
 * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
 * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is
 * an opaque type. This allows fields to safely be changed.
 *
 * Typedef'd to @ref XXH32_state_t.
 * Do not access the members of this struct directly.
 * @see XXH64_state_s, XXH3_state_s
 */
struct XXH32_state_s {
   XXH32_hash_t total_len_32; /*!< Total length hashed, modulo 2^32 */
   XXH32_hash_t large_len;    /*!< Whether the hash is >= 16 (handles @ref total_len_32 overflow) */
   XXH32_hash_t acc[4];       /*!< Accumulator lanes */
   unsigned char buffer[16];  /*!< Internal buffer for partial reads. */
   XXH32_hash_t bufferedSize; /*!< Amount of data in @ref buffer */
   XXH32_hash_t reserved;     /*!< Reserved field. Do not read nor write to it. */
};   /* typedef'd to XXH32_state_t */


#ifndef XXH_NO_LONG_LONG  /* defined when there is no 64-bit support */

/*!
 * @internal
 * @brief Structure for XXH64 streaming API.
 *
 * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
 * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is
 * an opaque type. This allows fields to safely be changed.
 *
 * Typedef'd to @ref XXH64_state_t.
 * Do not access the members of this struct directly.
 * @see XXH32_state_s, XXH3_state_s
 */
struct XXH64_state_s {
   XXH64_hash_t total_len;    /*!< Total length hashed. This is always 64-bit. */
   XXH64_hash_t acc[4];       /*!< Accumulator lanes */
   unsigned char buffer[32];  /*!< Internal buffer for partial reads.. */
   XXH32_hash_t bufferedSize; /*!< Amount of data in @ref buffer */
   XXH32_hash_t reserved32;   /*!< Reserved field, needed for padding anyways*/
   XXH64_hash_t reserved64;   /*!< Reserved field. Do not read or write to it. */
};   /* typedef'd to XXH64_state_t */

#ifndef XXH_NO_XXH3

#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* >= C11 */
#  define XXH_ALIGN(n)      _Alignas(n)
#elif defined(__cplusplus) && (__cplusplus >= 201103L) /* >= C++11 */
/* In C++ alignas() is a keyword */
#  define XXH_ALIGN(n)      alignas(n)
#elif defined(__GNUC__)
#  define XXH_ALIGN(n)      __attribute__ ((aligned(n)))
#elif defined(_MSC_VER)
#  define XXH_ALIGN(n)      __declspec(align(n))
#else
#  define XXH_ALIGN(n)   /* disabled */
#endif

/* Old GCC versions only accept the attribute after the type in structures. */
#if !(defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L))   /* C11+ */ \
    && ! (defined(__cplusplus) && (__cplusplus >= 201103L)) /* >= C++11 */ \
    && defined(__GNUC__)
#   define XXH_ALIGN_MEMBER(align, type) type XXH_ALIGN(align)
#else
#   define XXH_ALIGN_MEMBER(align, type) XXH_ALIGN(align) type
#endif

/*!
 * @brief The size of the internal XXH3 buffer.
 *
 * This is the optimal update size for incremental hashing.
 *
 * @see XXH3_64b_update(), XXH3_128b_update().
 */
#define XXH3_INTERNALBUFFER_SIZE 256

/*!
 * @internal
 * @brief Default size of the secret buffer (and @ref XXH3_kSecret).
 *
 * This is the size used in @ref XXH3_kSecret and the seeded functions.
 *
 * Not to be confused with @ref XXH3_SECRET_SIZE_MIN.
 */
#define XXH3_SECRET_DEFAULT_SIZE 192

/*!
 * @internal
 * @brief Structure for XXH3 streaming API.
 *
 * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
 * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined.
 * Otherwise it is an opaque type.
 * Never use this definition in combination with dynamic library.
 * This allows fields to safely be changed in the future.
 *
 * @note ** This structure has a strict alignment requirement of 64 bytes!! **
 * Do not allocate this with `malloc()` or `new`,
 * it will not be sufficiently aligned.
 * Use @ref XXH3_createState() and @ref XXH3_freeState(), or stack allocation.
 *
 * Typedef'd to @ref XXH3_state_t.
 * Do never access the members of this struct directly.
 *
 * @see XXH3_INITSTATE() for stack initialization.
 * @see XXH3_createState(), XXH3_freeState().
 * @see XXH32_state_s, XXH64_state_s
 */
struct XXH3_state_s {
   XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]);
       /*!< The 8 accumulators. See @ref XXH32_state_s::v and @ref XXH64_state_s::v */
   XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]);
       /*!< Used to store a custom secret generated from a seed. */
   XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]);
       /*!< The internal buffer. @see XXH32_state_s::mem32 */
   XXH32_hash_t bufferedSize;
       /*!< The amount of memory in @ref buffer, @see XXH32_state_s::memsize */
   XXH32_hash_t useSeed;
       /*!< Reserved field. Needed for padding on 64-bit. */
   size_t nbStripesSoFar;
       /*!< Number or stripes processed. */
   XXH64_hash_t totalLen;
       /*!< Total length hashed. 64-bit even on 32-bit targets. */
   size_t nbStripesPerBlock;
       /*!< Number of stripes per block. */
   size_t secretLimit;
       /*!< Size of @ref customSecret or @ref extSecret */
   XXH64_hash_t seed;
       /*!< Seed for _withSeed variants. Must be zero otherwise, @see XXH3_INITSTATE() */
   XXH64_hash_t reserved64;
       /*!< Reserved field. */
   const unsigned char* extSecret;
       /*!< Reference to an external secret for the _withSecret variants, NULL
        *   for other variants. */
   /* note: there may be some padding at the end due to alignment on 64 bytes */
}; /* typedef'd to XXH3_state_t */

#undef XXH_ALIGN_MEMBER

/*!
 * @brief Initializes a stack-allocated `XXH3_state_s`.
 *
 * When the @ref XXH3_state_t structure is merely emplaced on stack,
 * it should be initialized with XXH3_INITSTATE() or a memset()
 * in case its first reset uses XXH3_NNbits_reset_withSeed().
 * This init can be omitted if the first reset uses default or _withSecret mode.
 * This operation isn't necessary when the state is created with XXH3_createState().
 * Note that this doesn't prepare the state for a streaming operation,
 * it's still necessary to use XXH3_NNbits_reset*() afterwards.
 */
#define XXH3_INITSTATE(XXH3_state_ptr)                       \
    do {                                                     \
        XXH3_state_t* tmp_xxh3_state_ptr = (XXH3_state_ptr); \
        tmp_xxh3_state_ptr->seed = 0;                        \
        tmp_xxh3_state_ptr->extSecret = NULL;                \
    } while(0)


/*!
 * @brief Calculates the 128-bit hash of @p data using XXH3.
 *
 * @param data The block of data to be hashed, at least @p len bytes in size.
 * @param len  The length of @p data, in bytes.
 * @param seed The 64-bit seed to alter the hash's output predictably.
 *
 * @pre
 *   The memory between @p data and @p data + @p len must be valid,
 *   readable, contiguous memory. However, if @p len is `0`, @p data may be
 *   `NULL`. In C++, this also must be *TriviallyCopyable*.
 *
 * @return The calculated 128-bit XXH3 value.
 *
 * @see @ref single_shot_example "Single Shot Example" for an example.
 */
XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128(XXH_NOESCAPE const void* data, size_t len, XXH64_hash_t seed);


/* ===   Experimental API   === */
/* Symbols defined below must be considered tied to a specific library version. */

/*!
 * @brief Derive a high-entropy secret from any user-defined content, named customSeed.
 *
 * @param secretBuffer    A writable buffer for derived high-entropy secret data.
 * @param secretSize      Size of secretBuffer, in bytes.  Must be >= XXH3_SECRET_SIZE_MIN.
 * @param customSeed      A user-defined content.
 * @param customSeedSize  Size of customSeed, in bytes.
 *
 * @return @ref XXH_OK on success.
 * @return @ref XXH_ERROR on failure.
 *
 * The generated secret can be used in combination with `*_withSecret()` functions.
 * The `_withSecret()` variants are useful to provide a higher level of protection
 * than 64-bit seed, as it becomes much more difficult for an external actor to
 * guess how to impact the calculation logic.
 *
 * The function accepts as input a custom seed of any length and any content,
 * and derives from it a high-entropy secret of length @p secretSize into an
 * already allocated buffer @p secretBuffer.
 *
 * The generated secret can then be used with any `*_withSecret()` variant.
 * The functions @ref XXH3_128bits_withSecret(), @ref XXH3_64bits_withSecret(),
 * @ref XXH3_128bits_reset_withSecret() and @ref XXH3_64bits_reset_withSecret()
 * are part of this list. They all accept a `secret` parameter
 * which must be large enough for implementation reasons (>= @ref XXH3_SECRET_SIZE_MIN)
 * _and_ feature very high entropy (consist of random-looking bytes).
 * These conditions can be a high bar to meet, so @ref XXH3_generateSecret() can
 * be employed to ensure proper quality.
 *
 * @p customSeed can be anything. It can have any size, even small ones,
 * and its content can be anything, even "poor entropy" sources such as a bunch
 * of zeroes. The resulting `secret` will nonetheless provide all required qualities.
 *
 * @pre
 *   - @p secretSize must be >= @ref XXH3_SECRET_SIZE_MIN
 *   - When @p customSeedSize > 0, supplying NULL as customSeed is undefined behavior.
 *
 * Example code:
 * @code{.c}
 *    #include <stdio.h>
 *    #include <stdlib.h>
 *    #include <string.h>
 *    #define XXH_STATIC_LINKING_ONLY // expose unstable API
 *    #include "xxhash.h"
 *    // Hashes argv[2] using the entropy from argv[1].
 *    int main(int argc, char* argv[])
 *    {
 *        char secret[XXH3_SECRET_SIZE_MIN];
 *        if (argv != 3) { return 1; }
 *        XXH3_generateSecret(secret, sizeof(secret), argv[1], strlen(argv[1]));
 *        XXH64_hash_t h = XXH3_64bits_withSecret(
 *             argv[2], strlen(argv[2]),
 *             secret, sizeof(secret)
 *        );
 *        printf("%016llx\n", (unsigned long long) h);
 *    }
 * @endcode
 */
XXH_PUBLIC_API XXH_errorcode XXH3_generateSecret(XXH_NOESCAPE void* secretBuffer, size_t secretSize, XXH_NOESCAPE const void* customSeed, size_t customSeedSize);

/*!
 * @brief Generate the same secret as the _withSeed() variants.
 *
 * @param secretBuffer A writable buffer of @ref XXH3_SECRET_DEFAULT_SIZE bytes
 * @param seed         The 64-bit seed to alter the hash result predictably.
 *
 * The generated secret can be used in combination with
 *`*_withSecret()` and `_withSecretandSeed()` variants.
 *
 * Example C++ `std::string` hash class:
 * @code{.cpp}
 *    #include <string>
 *    #define XXH_STATIC_LINKING_ONLY // expose unstable API
 *    #include "xxhash.h"
 *    // Slow, seeds each time
 *    class HashSlow {
 *        XXH64_hash_t seed;
 *    public:
 *        HashSlow(XXH64_hash_t s) : seed{s} {}
 *        size_t operator()(const std::string& x) const {
 *            return size_t{XXH3_64bits_withSeed(x.c_str(), x.length(), seed)};
 *        }
 *    };
 *    // Fast, caches the seeded secret for future uses.
 *    class HashFast {
 *        unsigned char secret[XXH3_SECRET_DEFAULT_SIZE];
 *    public:
 *        HashFast(XXH64_hash_t s) {
 *            XXH3_generateSecret_fromSeed(secret, seed);
 *        }
 *        size_t operator()(const std::string& x) const {
 *            return size_t{
 *                XXH3_64bits_withSecret(x.c_str(), x.length(), secret, sizeof(secret))
 *            };
 *        }
 *    };
 * @endcode
 */
XXH_PUBLIC_API void XXH3_generateSecret_fromSeed(XXH_NOESCAPE void* secretBuffer, XXH64_hash_t seed);

/*!
 * @brief Maximum size of "short" key in bytes.
 */
#define XXH3_MIDSIZE_MAX 240

/*!
 * @brief Calculates 64/128-bit seeded variant of XXH3 hash of @p data.
 *
 * @param data       The block of data to be hashed, at least @p len bytes in size.
 * @param len        The length of @p data, in bytes.
 * @param secret     The secret data.
 * @param secretSize The length of @p secret, in bytes.
 * @param seed       The 64-bit seed to alter the hash result predictably.
 *
 * These variants generate hash values using either:
 * - @p seed for "short" keys (< @ref XXH3_MIDSIZE_MAX = 240 bytes)
 * - @p secret for "large" keys (>= @ref XXH3_MIDSIZE_MAX).
 *
 * This generally benefits speed, compared to `_withSeed()` or `_withSecret()`.
 * `_withSeed()` has to generate the secret on the fly for "large" keys.
 * It's fast, but can be perceptible for "not so large" keys (< 1 KB).
 * `_withSecret()` has to generate the masks on the fly for "small" keys,
 * which requires more instructions than _withSeed() variants.
 * Therefore, _withSecretandSeed variant combines the best of both worlds.
 *
 * When @p secret has been generated by XXH3_generateSecret_fromSeed(),
 * this variant produces *exactly* the same results as `_withSeed()` variant,
 * hence offering only a pure speed benefit on "large" input,
 * by skipping the need to regenerate the secret for every large input.
 *
 * Another usage scenario is to hash the secret to a 64-bit hash value,
 * for example with XXH3_64bits(), which then becomes the seed,
 * and then employ both the seed and the secret in _withSecretandSeed().
 * On top of speed, an added benefit is that each bit in the secret
 * has a 50% chance to swap each bit in the output, via its impact to the seed.
 *
 * This is not guaranteed when using the secret directly in "small data" scenarios,
 * because only portions of the secret are employed for small data.
 */
XXH_PUBLIC_API XXH_PUREF XXH64_hash_t
XXH3_64bits_withSecretandSeed(XXH_NOESCAPE const void* data, size_t len,
                              XXH_NOESCAPE const void* secret, size_t secretSize,
                              XXH64_hash_t seed);

/*!
 * @brief Calculates 128-bit seeded variant of XXH3 hash of @p data.
 *
 * @param data       The memory segment to be hashed, at least @p len bytes in size.
 * @param length     The length of @p data, in bytes.
 * @param secret     The secret used to alter hash result predictably.
 * @param secretSize The length of @p secret, in bytes (must be >= XXH3_SECRET_SIZE_MIN)
 * @param seed64     The 64-bit seed to alter the hash result predictably.
 *
 * @return @ref XXH_OK on success.
 * @return @ref XXH_ERROR on failure.
 *
 * @see XXH3_64bits_withSecretandSeed(): contract is the same.
 */
XXH_PUBLIC_API XXH_PUREF XXH128_hash_t
XXH3_128bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t length,
                               XXH_NOESCAPE const void* secret, size_t secretSize,
                               XXH64_hash_t seed64);

#ifndef XXH_NO_STREAM
/*!
 * @brief Resets an @ref XXH3_state_t with secret data to begin a new hash.
 *
 * @param statePtr   A pointer to an @ref XXH3_state_t allocated with @ref XXH3_createState().
 * @param secret     The secret data.
 * @param secretSize The length of @p secret, in bytes.
 * @param seed64     The 64-bit seed to alter the hash result predictably.
 *
 * @return @ref XXH_OK on success.
 * @return @ref XXH_ERROR on failure.
 *
 * @see XXH3_64bits_withSecretandSeed(). Contract is identical.
 */
XXH_PUBLIC_API XXH_errorcode
XXH3_64bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr,
                                    XXH_NOESCAPE const void* secret, size_t secretSize,
                                    XXH64_hash_t seed64);

/*!
 * @brief Resets an @ref XXH3_state_t with secret data to begin a new hash.
 *
 * @param statePtr   A pointer to an @ref XXH3_state_t allocated with @ref XXH3_createState().
 * @param secret     The secret data.
 * @param secretSize The length of @p secret, in bytes.
 * @param seed64     The 64-bit seed to alter the hash result predictably.
 *
 * @return @ref XXH_OK on success.
 * @return @ref XXH_ERROR on failure.
 *
 * @see XXH3_64bits_withSecretandSeed(). Contract is identical.
 *
 * Note: there was a bug in an earlier version of this function (<= v0.8.2)
 * that would make it generate an incorrect hash value
 * when @p seed == 0 and @p length < XXH3_MIDSIZE_MAX
 * and @p secret is different from XXH3_generateSecret_fromSeed().
 * As stated in the contract, the correct hash result must be
 * the same as XXH3_128bits_withSeed() when @p length <= XXH3_MIDSIZE_MAX.
 * Results generated by this older version are wrong, hence not comparable.
 */
XXH_PUBLIC_API XXH_errorcode
XXH3_128bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr,
                                     XXH_NOESCAPE const void* secret, size_t secretSize,
                                     XXH64_hash_t seed64);

#endif /* !XXH_NO_STREAM */

#endif  /* !XXH_NO_XXH3 */
#endif  /* XXH_NO_LONG_LONG */
#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)
#  define XXH_IMPLEMENTATION
#endif

#endif  /* defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) */


/* ======================================================================== */
/* ======================================================================== */
/* ======================================================================== */


/*-**********************************************************************
 * xxHash implementation
 *-**********************************************************************
 * xxHash's implementation used to be hosted inside xxhash.c.
 *
 * However, inlining requires implementation to be visible to the compiler,
 * hence be included alongside the header.
 * Previously, implementation was hosted inside xxhash.c,
 * which was then #included when inlining was activated.
 * This construction created issues with a few build and install systems,
 * as it required xxhash.c to be stored in /include directory.
 *
 * xxHash implementation is now directly integrated within xxhash.h.
 * As a consequence, xxhash.c is no longer needed in /include.
 *
 * xxhash.c is still available and is still useful.
 * In a "normal" setup, when xxhash is not inlined,
 * xxhash.h only exposes the prototypes and public symbols,
 * while xxhash.c can be built into an object file xxhash.o
 * which can then be linked into the final binary.
 ************************************************************************/

#if ( defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) \
   || defined(XXH_IMPLEMENTATION) ) && !defined(XXH_IMPLEM_13a8737387)
#  define XXH_IMPLEM_13a8737387

/* *************************************
*  Tuning parameters
***************************************/

/*!
 * @defgroup tuning Tuning parameters
 * @{
 *
 * Various macros to control xxHash's behavior.
 */
#ifdef XXH_DOXYGEN
/*!
 * @brief Define this to disable 64-bit code.
 *
 * Useful if only using the @ref XXH32_family and you have a strict C90 compiler.
 */
#  define XXH_NO_LONG_LONG
#  undef XXH_NO_LONG_LONG /* don't actually */
/*!
 * @brief Controls how unaligned memory is accessed.
 *
 * By default, access to unaligned memory is controlled by `memcpy()`, which is
 * safe and portable.
 *
 * Unfortunately, on some target/compiler combinations, the generated assembly
 * is sub-optimal.
 *
 * The below switch allow selection of a different access method
 * in the search for improved performance.
 *
 * @par Possible options:
 *
 *  - `XXH_FORCE_MEMORY_ACCESS=0` (default): `memcpy`
 *   @par
 *     Use `memcpy()`. Safe and portable. Note that most modern compilers will
 *     eliminate the function call and treat it as an unaligned access.
 *
 *  - `XXH_FORCE_MEMORY_ACCESS=1`: `__attribute__((aligned(1)))`
 *   @par
 *     Depends on compiler extensions and is therefore not portable.
 *     This method is safe _if_ your compiler supports it,
 *     and *generally* as fast or faster than `memcpy`.
 *
 *  - `XXH_FORCE_MEMORY_ACCESS=2`: Direct cast
 *  @par
 *     Casts directly and dereferences. This method doesn't depend on the
 *     compiler, but it violates the C standard as it directly dereferences an
 *     unaligned pointer. It can generate buggy code on targets which do not
 *     support unaligned memory accesses, but in some circumstances, it's the
 *     only known way to get the most performance.
 *
 *  - `XXH_FORCE_MEMORY_ACCESS=3`: Byteshift
 *  @par
 *     Also portable. This can generate the best code on old compilers which don't
 *     inline small `memcpy()` calls, and it might also be faster on big-endian
 *     systems which lack a native byteswap instruction. However, some compilers
 *     will emit literal byteshifts even if the target supports unaligned access.
 *
 *
 * @warning
 *   Methods 1 and 2 rely on implementation-defined behavior. Use these with
 *   care, as what works on one compiler/platform/optimization level may cause
 *   another to read garbage data or even crash.
 *
 * See https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html for details.
 *
 * Prefer these methods in priority order (0 > 3 > 1 > 2)
 */
#  define XXH_FORCE_MEMORY_ACCESS 0

/*!
 * @def XXH_SIZE_OPT
 * @brief Controls how much xxHash optimizes for size.
 *
 * xxHash, when compiled, tends to result in a rather large binary size. This
 * is mostly due to heavy usage to forced inlining and constant folding of the
 * @ref XXH3_family to increase performance.
 *
 * However, some developers prefer size over speed. This option can
 * significantly reduce the size of the generated code. When using the `-Os`
 * or `-Oz` options on GCC or Clang, this is defined to 1 by default,
 * otherwise it is defined to 0.
 *
 * Most of these size optimizations can be controlled manually.
 *
 * This is a number from 0-2.
 *  - `XXH_SIZE_OPT` == 0: Default. xxHash makes no size optimizations. Speed
 *    comes first.
 *  - `XXH_SIZE_OPT` == 1: Default for `-Os` and `-Oz`. xxHash is more
 *    conservative and disables hacks that increase code size. It implies the
 *    options @ref XXH_NO_INLINE_HINTS == 1, @ref XXH_FORCE_ALIGN_CHECK == 0,
 *    and @ref XXH3_NEON_LANES == 8 if they are not already defined.
 *  - `XXH_SIZE_OPT` == 2: xxHash tries to make itself as small as possible.
 *    Performance may cry. For example, the single shot functions just use the
 *    streaming API.
 */
#  define XXH_SIZE_OPT 0

/*!
 * @def XXH_FORCE_ALIGN_CHECK
 * @brief If defined to non-zero, adds a special path for aligned inputs (XXH32()
 * and XXH64() only).
 *
 * This is an important performance trick for architectures without decent
 * unaligned memory access performance.
 *
 * It checks for input alignment, and when conditions are met, uses a "fast
 * path" employing direct 32-bit/64-bit reads, resulting in _dramatically
 * faster_ read speed.
 *
 * The check costs one initial branch per hash, which is generally negligible,
 * but not zero.
 *
 * Moreover, it's not useful to generate an additional code path if memory
 * access uses the same instruction for both aligned and unaligned
 * addresses (e.g. x86 and aarch64).
 *
 * In these cases, the alignment check can be removed by setting this macro to 0.
 * Then the code will always use unaligned memory access.
 * Align check is automatically disabled on x86, x64, ARM64, and some ARM chips
 * which are platforms known to offer good unaligned memory accesses performance.
 *
 * It is also disabled by default when @ref XXH_SIZE_OPT >= 1.
 *
 * This option does not affect XXH3 (only XXH32 and XXH64).
 */
#  define XXH_FORCE_ALIGN_CHECK 0

/*!
 * @def XXH_NO_INLINE_HINTS
 * @brief When non-zero, sets all functions to `static`.
 *
 * By default, xxHash tries to force the compiler to inline almost all internal
 * functions.
 *
 * This can usually improve performance due to reduced jumping and improved
 * constant folding, but significantly increases the size of the binary which
 * might not be favorable.
 *
 * Additionally, sometimes the forced inlining can be detrimental to performance,
 * depending on the architecture.
 *
 * XXH_NO_INLINE_HINTS marks all internal functions as static, giving the
 * compiler full control on whether to inline or not.
 *
 * When not optimizing (-O0), using `-fno-inline` with GCC or Clang, or if
 * @ref XXH_SIZE_OPT >= 1, this will automatically be defined.
 */
#  define XXH_NO_INLINE_HINTS 0

/*!
 * @def XXH3_INLINE_SECRET
 * @brief Determines whether to inline the XXH3 withSecret code.
 *
 * When the secret size is known, the compiler can improve the performance
 * of XXH3_64bits_withSecret() and XXH3_128bits_withSecret().
 *
 * However, if the secret size is not known, it doesn't have any benefit. This
 * happens when xxHash is compiled into a global symbol. Therefore, if
 * @ref XXH_INLINE_ALL is *not* defined, this will be defined to 0.
 *
 * Additionally, this defaults to 0 on GCC 12+, which has an issue with function pointers
 * that are *sometimes* force inline on -Og, and it is impossible to automatically
 * detect this optimization level.
 */
#  define XXH3_INLINE_SECRET 0

/*!
 * @def XXH32_ENDJMP
 * @brief Whether to use a jump for `XXH32_finalize`.
 *
 * For performance, `XXH32_finalize` uses multiple branches in the finalizer.
 * This is generally preferable for performance,
 * but depending on exact architecture, a jmp may be preferable.
 *
 * This setting is only possibly making a difference for very small inputs.
 */
#  define XXH32_ENDJMP 0

/*!
 * @internal
 * @brief Redefines old internal names.
 *
 * For compatibility with code that uses xxHash's internals before the names
 * were changed to improve namespacing. There is no other reason to use this.
 */
#  define XXH_OLD_NAMES
#  undef XXH_OLD_NAMES /* don't actually use, it is ugly. */

/*!
 * @def XXH_NO_STREAM
 * @brief Disables the streaming API.
 *
 * When xxHash is not inlined and the streaming functions are not used, disabling
 * the streaming functions can improve code size significantly, especially with
 * the @ref XXH3_family which tends to make constant folded copies of itself.
 */
#  define XXH_NO_STREAM
#  undef XXH_NO_STREAM /* don't actually */
#endif /* XXH_DOXYGEN */
/*!
 * @}
 */

#ifndef XXH_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
   /* prefer __packed__ structures (method 1) for GCC
    * < ARMv7 with unaligned access (e.g. Raspbian armhf) still uses byte shifting, so we use memcpy
    * which for some reason does unaligned loads. */
#  if defined(__GNUC__) && !(defined(__ARM_ARCH) && __ARM_ARCH < 7 && defined(__ARM_FEATURE_UNALIGNED))
#    define XXH_FORCE_MEMORY_ACCESS 1
#  endif
#endif

#ifndef XXH_SIZE_OPT
   /* default to 1 for -Os or -Oz */
#  if (defined(__GNUC__) || defined(__clang__)) && defined(__OPTIMIZE_SIZE__)
#    define XXH_SIZE_OPT 1
#  else
#    define XXH_SIZE_OPT 0
#  endif
#endif

#ifndef XXH_FORCE_ALIGN_CHECK  /* can be defined externally */
   /* don't check on sizeopt, x86, aarch64, or arm when unaligned access is available */
#  if XXH_SIZE_OPT >= 1 || \
      defined(__i386)  || defined(__x86_64__) || defined(__aarch64__) || defined(__ARM_FEATURE_UNALIGNED) \
   || defined(_M_IX86) || defined(_M_X64)     || defined(_M_ARM64)    || defined(_M_ARM) /* visual */
#    define XXH_FORCE_ALIGN_CHECK 0
#  else
#    define XXH_FORCE_ALIGN_CHECK 1
#  endif
#endif

#ifndef XXH_NO_INLINE_HINTS
#  if XXH_SIZE_OPT >= 1 || defined(__NO_INLINE__)  /* -O0, -fno-inline */
#    define XXH_NO_INLINE_HINTS 1
#  else
#    define XXH_NO_INLINE_HINTS 0
#  endif
#endif

#ifndef XXH3_INLINE_SECRET
#  if (defined(__GNUC__) && !defined(__clang__) && __GNUC__ >= 12) \
     || !defined(XXH_INLINE_ALL)
#    define XXH3_INLINE_SECRET 0
#  else
#    define XXH3_INLINE_SECRET 1
#  endif
#endif

#ifndef XXH32_ENDJMP
/* generally preferable for performance */
#  define XXH32_ENDJMP 0
#endif

/*!
 * @defgroup impl Implementation
 * @{
 */


/* *************************************
*  Includes & Memory related functions
***************************************/
#if defined(XXH_NO_STREAM)
/* nothing */
#elif defined(XXH_NO_STDLIB)

/* When requesting to disable any mention of stdlib,
 * the library loses the ability to invoked malloc / free.
 * In practice, it means that functions like `XXH*_createState()`
 * will always fail, and return NULL.
 * This flag is useful in situations where
 * xxhash.h is integrated into some kernel, embedded or limited environment
 * without access to dynamic allocation.
 */

static XXH_CONSTF void* XXH_malloc(size_t s) { (void)s; return NULL; }
static void XXH_free(void* p) { (void)p; }

#else

/*
 * Modify the local functions below should you wish to use
 * different memory routines for malloc() and free()
 */
#include <stdlib.h>

/*!
 * @internal
 * @brief Modify this function to use a different routine than malloc().
 */
static XXH_MALLOCF void* XXH_malloc(size_t s) { return malloc(s); }

/*!
 * @internal
 * @brief Modify this function to use a different routine than free().
 */
static void XXH_free(void* p) { free(p); }

#endif  /* XXH_NO_STDLIB */

#include <string.h>

/*!
 * @internal
 * @brief Modify this function to use a different routine than memcpy().
 */
static void* XXH_memcpy(void* dest, const void* src, size_t size)
{
    return memcpy(dest,src,size);
}

#include <limits.h>   /* ULLONG_MAX */


/* *************************************
*  Compiler Specific Options
***************************************/
#ifdef _MSC_VER /* Visual Studio warning fix */
#  pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */
#endif

#if XXH_NO_INLINE_HINTS  /* disable inlining hints */
#  if defined(__GNUC__) || defined(__clang__)
#    define XXH_FORCE_INLINE static __attribute__((__unused__))
#  else
#    define XXH_FORCE_INLINE static
#  endif
#  define XXH_NO_INLINE static
/* enable inlining hints */
#elif defined(__GNUC__) || defined(__clang__)
#  define XXH_FORCE_INLINE static __inline__ __attribute__((__always_inline__, __unused__))
#  define XXH_NO_INLINE static __attribute__((__noinline__))
#elif defined(_MSC_VER)  /* Visual Studio */
#  define XXH_FORCE_INLINE static __forceinline
#  define XXH_NO_INLINE static __declspec(noinline)
#elif defined (__cplusplus) \
  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L))   /* C99 */
#  define XXH_FORCE_INLINE static inline
#  define XXH_NO_INLINE static
#else
#  define XXH_FORCE_INLINE static
#  define XXH_NO_INLINE static
#endif

#if defined(XXH_INLINE_ALL)
#  define XXH_STATIC XXH_FORCE_INLINE
#else
#  define XXH_STATIC static
#endif

#if XXH3_INLINE_SECRET
#  define XXH3_WITH_SECRET_INLINE XXH_FORCE_INLINE
#else
#  define XXH3_WITH_SECRET_INLINE XXH_NO_INLINE
#endif

#if ((defined(sun) || defined(__sun)) && __cplusplus) /* Solaris includes __STDC_VERSION__ with C++. Tested with GCC 5.5 */
#  define XXH_RESTRICT   /* disable */
#elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* >= C99 */
#  define XXH_RESTRICT   restrict
#elif (defined (__GNUC__) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))) \
   || (defined (__clang__)) \
   || (defined (_MSC_VER) && (_MSC_VER >= 1400)) \
   || (defined (__INTEL_COMPILER) && (__INTEL_COMPILER >= 1300))
/*
 * There are a LOT more compilers that recognize __restrict but this
 * covers the major ones.
 */
#  define XXH_RESTRICT   __restrict
#else
#  define XXH_RESTRICT   /* disable */
#endif

/* *************************************
*  Debug
***************************************/
/*!
 * @ingroup tuning
 * @def XXH_DEBUGLEVEL
 * @brief Sets the debugging level.
 *
 * XXH_DEBUGLEVEL is expected to be defined externally, typically via the
 * compiler's command line options. The value must be a number.
 */
#ifndef XXH_DEBUGLEVEL
#  ifdef DEBUGLEVEL /* backwards compat */
#    define XXH_DEBUGLEVEL DEBUGLEVEL
#  else
#    define XXH_DEBUGLEVEL 0
#  endif
#endif

#if (XXH_DEBUGLEVEL>=1)
#  include <assert.h>   /* note: can still be disabled with NDEBUG */
#  define XXH_ASSERT(c)   assert(c)
#else
#  if defined(__INTEL_COMPILER)
#    define XXH_ASSERT(c)   XXH_ASSUME((unsigned char) (c))
#  else
#    define XXH_ASSERT(c)   XXH_ASSUME(c)
#  endif
#endif

/* note: use after variable declarations */
#ifndef XXH_STATIC_ASSERT
#  if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)    /* C11 */
#    define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { _Static_assert((c),m); } while(0)
#  elif defined(__cplusplus) && (__cplusplus >= 201103L)            /* C++11 */
#    define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { static_assert((c),m); } while(0)
#  else
#    define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { struct xxh_sa { char x[(c) ? 1 : -1]; }; } while(0)
#  endif
#  define XXH_STATIC_ASSERT(c) XXH_STATIC_ASSERT_WITH_MESSAGE((c),#c)
#endif

/*!
 * @internal
 * @def XXH_COMPILER_GUARD(var)
 * @brief Used to prevent unwanted optimizations for @p var.
 *
 * It uses an empty GCC inline assembly statement with a register constraint
 * which forces @p var into a general purpose register (eg eax, ebx, ecx
 * on x86) and marks it as modified.
 *
 * This is used in a few places to avoid unwanted autovectorization (e.g.
 * XXH32_round()). All vectorization we want is explicit via intrinsics,
 * and _usually_ isn't wanted elsewhere.
 *
 * We also use it to prevent unwanted constant folding for AArch64 in
 * XXH3_initCustomSecret_scalar().
 */
#if defined(__GNUC__) || defined(__clang__)
#  define XXH_COMPILER_GUARD(var) __asm__("" : "+r" (var))
#else
#  define XXH_COMPILER_GUARD(var) ((void)0)
#endif

/* Specifically for NEON vectors which use the "w" constraint, on
 * Clang. */
#if defined(__clang__) && defined(__ARM_ARCH) && !defined(__wasm__)
#  define XXH_COMPILER_GUARD_CLANG_NEON(var) __asm__("" : "+w" (var))
#else
#  define XXH_COMPILER_GUARD_CLANG_NEON(var) ((void)0)
#endif

/* *************************************
*  Basic Types
***************************************/
#if !defined (__VMS) \
 && (defined (__cplusplus) \
 || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
#   ifdef _AIX
#     include <inttypes.h>
#   else
#     include <stdint.h>
#   endif
    typedef uint8_t xxh_u8;
#else
    typedef unsigned char xxh_u8;
#endif
typedef XXH32_hash_t xxh_u32;

#ifdef XXH_OLD_NAMES
#  warning "XXH_OLD_NAMES is planned to be removed starting v0.9. If the program depends on it, consider moving away from it by employing newer type names directly"
#  define BYTE xxh_u8
#  define U8   xxh_u8
#  define U32  xxh_u32
#endif

/* ***   Memory access   *** */

/*!
 * @internal
 * @fn xxh_u32 XXH_read32(const void* ptr)
 * @brief Reads an unaligned 32-bit integer from @p ptr in native endianness.
 *
 * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
 *
 * @param ptr The pointer to read from.
 * @return The 32-bit native endian integer from the bytes at @p ptr.
 */

/*!
 * @internal
 * @fn xxh_u32 XXH_readLE32(const void* ptr)
 * @brief Reads an unaligned 32-bit little endian integer from @p ptr.
 *
 * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
 *
 * @param ptr The pointer to read from.
 * @return The 32-bit little endian integer from the bytes at @p ptr.
 */

/*!
 * @internal
 * @fn xxh_u32 XXH_readBE32(const void* ptr)
 * @brief Reads an unaligned 32-bit big endian integer from @p ptr.
 *
 * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
 *
 * @param ptr The pointer to read from.
 * @return The 32-bit big endian integer from the bytes at @p ptr.
 */

/*!
 * @internal
 * @fn xxh_u32 XXH_readLE32_align(const void* ptr, XXH_alignment align)
 * @brief Like @ref XXH_readLE32(), but has an option for aligned reads.
 *
 * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
 * Note that when @ref XXH_FORCE_ALIGN_CHECK == 0, the @p align parameter is
 * always @ref XXH_alignment::XXH_unaligned.
 *
 * @param ptr The pointer to read from.
 * @param align Whether @p ptr is aligned.
 * @pre
 *   If @p align == @ref XXH_alignment::XXH_aligned, @p ptr must be 4 byte
 *   aligned.
 * @return The 32-bit little endian integer from the bytes at @p ptr.
 */

#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
/*
 * Manual byteshift. Best for old compilers which don't inline memcpy.
 * We actually directly use XXH_readLE32 and XXH_readBE32.
 */
#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))

/*
 * Force direct memory access. Only works on CPU which support unaligned memory
 * access in hardware.
 */
static xxh_u32 XXH_read32(const void* memPtr) { return *(const xxh_u32*) memPtr; }

#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))

/*
 * __attribute__((aligned(1))) is supported by gcc and clang. Originally the
 * documentation claimed that it only increased the alignment, but actually it
 * can decrease it on gcc, clang, and icc:
 * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69502,
 * https://gcc.godbolt.org/z/xYez1j67Y.
 */
#ifdef XXH_OLD_NAMES
typedef union { xxh_u32 u32; } __attribute__((__packed__)) unalign;
#endif
static xxh_u32 XXH_read32(const void* ptr)
{
    typedef __attribute__((__aligned__(1))) xxh_u32 xxh_unalign32;
    return *((const xxh_unalign32*)ptr);
}

#else

/*
 * Portable and safe solution. Generally efficient.
 * see: https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html
 */
static xxh_u32 XXH_read32(const void* memPtr)
{
    xxh_u32 val;
    XXH_memcpy(&val, memPtr, sizeof(val));
    return val;
}

#endif   /* XXH_FORCE_DIRECT_MEMORY_ACCESS */


/* ***   Endianness   *** */

/*!
 * @ingroup tuning
 * @def XXH_CPU_LITTLE_ENDIAN
 * @brief Whether the target is little endian.
 *
 * Defined to 1 if the target is little endian, or 0 if it is big endian.
 * It can be defined externally, for example on the compiler command line.
 *
 * If it is not defined,
 * a runtime check (which is usually constant folded) is used instead.
 *
 * @note
 *   This is not necessarily defined to an integer constant.
 *
 * @see XXH_isLittleEndian() for the runtime check.
 */
#ifndef XXH_CPU_LITTLE_ENDIAN
/*
 * Try to detect endianness automatically, to avoid the nonstandard behavior
 * in `XXH_isLittleEndian()`
 */
#  if defined(_WIN32) /* Windows is always little endian */ \
     || defined(__LITTLE_ENDIAN__) \
     || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
#    define XXH_CPU_LITTLE_ENDIAN 1
#  elif defined(__BIG_ENDIAN__) \
     || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
#    define XXH_CPU_LITTLE_ENDIAN 0
#  else
/*!
 * @internal
 * @brief Runtime check for @ref XXH_CPU_LITTLE_ENDIAN.
 *
 * Most compilers will constant fold this.
 */
static int XXH_isLittleEndian(void)
{
    /*
     * Portable and well-defined behavior.
     * Don't use static: it is detrimental to performance.
     */
    const union { xxh_u32 u; xxh_u8 c[4]; } one = { 1 };
    return one.c[0];
}
#   define XXH_CPU_LITTLE_ENDIAN   XXH_isLittleEndian()
#  endif
#endif


/* ****************************************
*  Compiler-specific Functions and Macros
******************************************/
#define XXH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)

#ifdef __has_builtin
#  define XXH_HAS_BUILTIN(x) __has_builtin(x)
#else
#  define XXH_HAS_BUILTIN(x) 0
#endif


/*
 * C23 and future versions have standard "unreachable()".
 * Once it has been implemented reliably we can add it as an
 * additional case:
 *
 * ```
 * #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= XXH_C23_VN)
 * #  include <stddef.h>
 * #  ifdef unreachable
 * #    define XXH_UNREACHABLE() unreachable()
 * #  endif
 * #endif
 * ```
 *
 * Note C++23 also has std::unreachable() which can be detected
 * as follows:
 * ```
 * #if defined(__cpp_lib_unreachable) && (__cpp_lib_unreachable >= 202202L)
 * #  include <utility>
 * #  define XXH_UNREACHABLE() std::unreachable()
 * #endif
 * ```
 * NB: `__cpp_lib_unreachable` is defined in the `<version>` header.
 * We don't use that as including `<utility>` in `extern "C"` blocks
 * doesn't work on GCC12
 */

#if XXH_HAS_BUILTIN(__builtin_unreachable)
#  define XXH_UNREACHABLE() __builtin_unreachable()

#elif defined(_MSC_VER)
#  define XXH_UNREACHABLE() __assume(0)

#else
#  define XXH_UNREACHABLE()
#endif

#if XXH_HAS_BUILTIN(__builtin_assume)
#  define XXH_ASSUME(c) __builtin_assume(c)
#else
#  define XXH_ASSUME(c) if (!(c)) { XXH_UNREACHABLE(); }
#endif

/*!
 * @internal
 * @def XXH_rotl32(x,r)
 * @brief 32-bit rotate left.
 *
 * @param x The 32-bit integer to be rotated.
 * @param r The number of bits to rotate.
 * @pre
 *   @p r > 0 && @p r < 32
 * @note
 *   @p x and @p r may be evaluated multiple times.
 * @return The rotated result.
 */
#if !defined(NO_CLANG_BUILTIN) && XXH_HAS_BUILTIN(__builtin_rotateleft32) \
                               && XXH_HAS_BUILTIN(__builtin_rotateleft64)
#  define XXH_rotl32 __builtin_rotateleft32
#  define XXH_rotl64 __builtin_rotateleft64
#elif XXH_HAS_BUILTIN(__builtin_stdc_rotate_left)
#  define XXH_rotl32 __builtin_stdc_rotate_left
#  define XXH_rotl64 __builtin_stdc_rotate_left
/* Note: although _rotl exists for minGW (GCC under windows), performance seems poor */
#elif defined(_MSC_VER)
#  define XXH_rotl32(x,r) _rotl(x,r)
#  define XXH_rotl64(x,r) _rotl64(x,r)
#else
#  define XXH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r))))
#  define XXH_rotl64(x,r) (((x) << (r)) | ((x) >> (64 - (r))))
#endif

/*!
 * @internal
 * @fn xxh_u32 XXH_swap32(xxh_u32 x)
 * @brief A 32-bit byteswap.
 *
 * @param x The 32-bit integer to byteswap.
 * @return @p x, byteswapped.
 */
#if defined(_MSC_VER)     /* Visual Studio */
#  define XXH_swap32 _byteswap_ulong
#elif XXH_GCC_VERSION >= 403
#  define XXH_swap32 __builtin_bswap32
#else
static xxh_u32 XXH_swap32 (xxh_u32 x)
{
    return  ((x << 24) & 0xff000000 ) |
            ((x <<  8) & 0x00ff0000 ) |
            ((x >>  8) & 0x0000ff00 ) |
            ((x >> 24) & 0x000000ff );
}
#endif


/* ***************************
*  Memory reads
*****************************/

/*!
 * @internal
 * @brief Enum to indicate whether a pointer is aligned.
 */
typedef enum {
    XXH_aligned,  /*!< Aligned */
    XXH_unaligned /*!< Possibly unaligned */
} XXH_alignment;

/*
 * XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load.
 *
 * This is ideal for older compilers which don't inline memcpy.
 */
#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))

XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* memPtr)
{
    const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;
    return bytePtr[0]
         | ((xxh_u32)bytePtr[1] << 8)
         | ((xxh_u32)bytePtr[2] << 16)
         | ((xxh_u32)bytePtr[3] << 24);
}

XXH_FORCE_INLINE xxh_u32 XXH_readBE32(const void* memPtr)
{
    const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;
    return bytePtr[3]
         | ((xxh_u32)bytePtr[2] << 8)
         | ((xxh_u32)bytePtr[1] << 16)
         | ((xxh_u32)bytePtr[0] << 24);
}

#else
XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* ptr)
{
    return XXH_CPU_LITTLE_ENDIAN ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr));
}

static xxh_u32 XXH_readBE32(const void* ptr)
{
    return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr);
}
#endif

XXH_FORCE_INLINE xxh_u32
XXH_readLE32_align(const void* ptr, XXH_alignment align)
{
    if (align==XXH_unaligned) {
        return XXH_readLE32(ptr);
    } else {
        return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u32*)ptr : XXH_swap32(*(const xxh_u32*)ptr);
    }
}


/* *************************************
*  Misc
***************************************/
/*! @ingroup public */
XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; }


/* *******************************************************************
*  32-bit hash functions
*********************************************************************/
/*!
 * @}
 * @defgroup XXH32_impl XXH32 implementation
 * @ingroup impl
 *
 * Details on the XXH32 implementation.
 * @{
 */
 /* #define instead of static const, to be used as initializers */
#define XXH_PRIME32_1  0x9E3779B1U  /*!< 0b10011110001101110111100110110001 */
#define XXH_PRIME32_2  0x85EBCA77U  /*!< 0b10000101111010111100101001110111 */
#define XXH_PRIME32_3  0xC2B2AE3DU  /*!< 0b11000010101100101010111000111101 */
#define XXH_PRIME32_4  0x27D4EB2FU  /*!< 0b00100111110101001110101100101111 */
#define XXH_PRIME32_5  0x165667B1U  /*!< 0b00010110010101100110011110110001 */

#ifdef XXH_OLD_NAMES
#  define PRIME32_1 XXH_PRIME32_1
#  define PRIME32_2 XXH_PRIME32_2
#  define PRIME32_3 XXH_PRIME32_3
#  define PRIME32_4 XXH_PRIME32_4
#  define PRIME32_5 XXH_PRIME32_5
#endif

/*!
 * @internal
 * @brief Normal stripe processing routine.
 *
 * This shuffles the bits so that any bit from @p input impacts several bits in
 * @p acc.
 *
 * @param acc The accumulator lane.
 * @param input The stripe of input to mix.
 * @return The mixed accumulator lane.
 */
static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input)
{
    acc += input * XXH_PRIME32_2;
    acc  = XXH_rotl32(acc, 13);
    acc *= XXH_PRIME32_1;
#if (defined(__SSE4_1__) || defined(__aarch64__) || defined(__wasm_simd128__)) && !defined(XXH_ENABLE_AUTOVECTORIZE)
    /*
     * UGLY HACK:
     * A compiler fence is used to prevent GCC and Clang from
     * autovectorizing the XXH32 loop (pragmas and attributes don't work for some
     * reason) without globally disabling SSE4.1.
     *
     * The reason we want to avoid vectorization is because despite working on
     * 4 integers at a time, there are multiple factors slowing XXH32 down on
     * SSE4:
     * - There's a ridiculous amount of lag from pmulld (10 cycles of latency on
     *   newer chips!) making it slightly slower to multiply four integers at
     *   once compared to four integers independently. Even when pmulld was
     *   fastest, Sandy/Ivy Bridge, it is still not worth it to go into SSE
     *   just to multiply unless doing a long operation.
     *
     * - Four instructions are required to rotate,
     *      movqda tmp,  v // not required with VEX encoding
     *      pslld  tmp, 13 // tmp <<= 13
     *      psrld  v,   19 // x >>= 19
     *      por    v,  tmp // x |= tmp
     *   compared to one for scalar:
     *      roll   v, 13    // reliably fast across the board
     *      shldl  v, v, 13 // Sandy Bridge and later prefer this for some reason
     *
     * - Instruction level parallelism is actually more beneficial here because
     *   the SIMD actually serializes this operation: While v1 is rotating, v2
     *   can load data, while v3 can multiply. SSE forces them to operate
     *   together.
     *
     * This is also enabled on AArch64, as Clang is *very aggressive* in vectorizing
     * the loop. NEON is only faster on the A53, and with the newer cores, it is less
     * than half the speed.
     *
     * Additionally, this is used on WASM SIMD128 because it JITs to the same
     * SIMD instructions and has the same issue.
     */
    XXH_COMPILER_GUARD(acc);
#endif
    return acc;
}

/*!
 * @internal
 * @brief Mixes all bits to finalize the hash.
 *
 * The final mix ensures that all input bits have a chance to impact any bit in
 * the output digest, resulting in an unbiased distribution.
 *
 * @param hash The hash to avalanche.
 * @return The avalanched hash.
 */
static xxh_u32 XXH32_avalanche(xxh_u32 hash)
{
    hash ^= hash >> 15;
    hash *= XXH_PRIME32_2;
    hash ^= hash >> 13;
    hash *= XXH_PRIME32_3;
    hash ^= hash >> 16;
    return hash;
}

#define XXH_get32bits(p) XXH_readLE32_align(p, align)

/*!
 * @internal
 * @brief Sets up the initial accumulator state for XXH32().
 */
XXH_FORCE_INLINE void
XXH32_initAccs(xxh_u32 *acc, xxh_u32 seed)
{
    XXH_ASSERT(acc != NULL);
    acc[0] = seed + XXH_PRIME32_1 + XXH_PRIME32_2;
    acc[1] = seed + XXH_PRIME32_2;
    acc[2] = seed + 0;
    acc[3] = seed - XXH_PRIME32_1;
}

/*!
 * @internal
 * @brief Consumes a block of data for XXH32().
 *
 * @return the end input pointer.
 */
XXH_FORCE_INLINE const xxh_u8 *
XXH32_consumeLong(
    xxh_u32 *XXH_RESTRICT acc,
    xxh_u8 const *XXH_RESTRICT input,
    size_t len,
    XXH_alignment align
)
{
    const xxh_u8* const bEnd = input + len;
    const xxh_u8* const limit = bEnd - 15;
    XXH_ASSERT(acc != NULL);
    XXH_ASSERT(input != NULL);
    XXH_ASSERT(len >= 16);
    do {
        acc[0] = XXH32_round(acc[0], XXH_get32bits(input)); input += 4;
        acc[1] = XXH32_round(acc[1], XXH_get32bits(input)); input += 4;
        acc[2] = XXH32_round(acc[2], XXH_get32bits(input)); input += 4;
        acc[3] = XXH32_round(acc[3], XXH_get32bits(input)); input += 4;
    } while (input < limit);

    return input;
}

/*!
 * @internal
 * @brief Merges the accumulator lanes together for XXH32()
 */
XXH_FORCE_INLINE XXH_PUREF xxh_u32
XXH32_mergeAccs(const xxh_u32 *acc)
{
    XXH_ASSERT(acc != NULL);
    return XXH_rotl32(acc[0], 1)  + XXH_rotl32(acc[1], 7)
         + XXH_rotl32(acc[2], 12) + XXH_rotl32(acc[3], 18);
}

/*!
 * @internal
 * @brief Processes the last 0-15 bytes of @p ptr.
 *
 * There may be up to 15 bytes remaining to consume from the input.
 * This final stage will digest them to ensure that all input bytes are present
 * in the final mix.
 *
 * @param hash The hash to finalize.
 * @param ptr The pointer to the remaining input.
 * @param len The remaining length, modulo 16.
 * @param align Whether @p ptr is aligned.
 * @return The finalized hash.
 * @see XXH64_finalize().
 */
static XXH_PUREF xxh_u32
XXH32_finalize(xxh_u32 hash, const xxh_u8* ptr, size_t len, XXH_alignment align)
{
#define XXH_PROCESS1 do {                             \
    hash += (*ptr++) * XXH_PRIME32_5;                 \
    hash = XXH_rotl32(hash, 11) * XXH_PRIME32_1;      \
} while (0)

#define XXH_PROCESS4 do {                             \
    hash += XXH_get32bits(ptr) * XXH_PRIME32_3;       \
    ptr += 4;                                         \
    hash  = XXH_rotl32(hash, 17) * XXH_PRIME32_4;     \
} while (0)

    if (ptr==NULL) XXH_ASSERT(len == 0);

    /* Compact rerolled version; generally faster */
    if (!XXH32_ENDJMP) {
        len &= 15;
        while (len >= 4) {
            XXH_PROCESS4;
            len -= 4;
        }
        while (len > 0) {
            XXH_PROCESS1;
            --len;
        }
        return XXH32_avalanche(hash);
    } else {
         switch(len&15) /* or switch(bEnd - p) */ {
           case 12:      XXH_PROCESS4;
                         XXH_FALLTHROUGH;  /* fallthrough */
           case 8:       XXH_PROCESS4;
                         XXH_FALLTHROUGH;  /* fallthrough */
           case 4:       XXH_PROCESS4;
                         return XXH32_avalanche(hash);

           case 13:      XXH_PROCESS4;
                         XXH_FALLTHROUGH;  /* fallthrough */
           case 9:       XXH_PROCESS4;
                         XXH_FALLTHROUGH;  /* fallthrough */
           case 5:       XXH_PROCESS4;
                         XXH_PROCESS1;
                         return XXH32_avalanche(hash);

           case 14:      XXH_PROCESS4;
                         XXH_FALLTHROUGH;  /* fallthrough */
           case 10:      XXH_PROCESS4;
                         XXH_FALLTHROUGH;  /* fallthrough */
           case 6:       XXH_PROCESS4;
                         XXH_PROCESS1;
                         XXH_PROCESS1;
                         return XXH32_avalanche(hash);

           case 15:      XXH_PROCESS4;
                         XXH_FALLTHROUGH;  /* fallthrough */
           case 11:      XXH_PROCESS4;
                         XXH_FALLTHROUGH;  /* fallthrough */
           case 7:       XXH_PROCESS4;
                         XXH_FALLTHROUGH;  /* fallthrough */
           case 3:       XXH_PROCESS1;
                         XXH_FALLTHROUGH;  /* fallthrough */
           case 2:       XXH_PROCESS1;
                         XXH_FALLTHROUGH;  /* fallthrough */
           case 1:       XXH_PROCESS1;
                         XXH_FALLTHROUGH;  /* fallthrough */
           case 0:       return XXH32_avalanche(hash);
        }
        XXH_ASSERT(0);
        return hash;   /* reaching this point is deemed impossible */
    }
}

#ifdef XXH_OLD_NAMES
#  define PROCESS1 XXH_PROCESS1
#  define PROCESS4 XXH_PROCESS4
#else
#  undef XXH_PROCESS1
#  undef XXH_PROCESS4
#endif

/*!
 * @internal
 * @brief The implementation for @ref XXH32().
 *
 * @param input , len , seed Directly passed from @ref XXH32().
 * @param align Whether @p input is aligned.
 * @return The calculated hash.
 */
XXH_FORCE_INLINE XXH_PUREF xxh_u32
XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment align)
{
    xxh_u32 h32;

    if (input==NULL) XXH_ASSERT(len == 0);

    if (len>=16) {
        xxh_u32 acc[4];
        XXH32_initAccs(acc, seed);

        input = XXH32_consumeLong(acc, input, len, align);

        h32 = XXH32_mergeAccs(acc);
    } else {
        h32  = seed + XXH_PRIME32_5;
    }

    h32 += (xxh_u32)len;

    return XXH32_finalize(h32, input, len&15, align);
}

/*! @ingroup XXH32_family */
XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t len, XXH32_hash_t seed)
{
#if !defined(XXH_NO_STREAM) && XXH_SIZE_OPT >= 2
    /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
    XXH32_state_t state;
    XXH32_reset(&state, seed);
    XXH32_update(&state, (const xxh_u8*)input, len);
    return XXH32_digest(&state);
#else
    if (XXH_FORCE_ALIGN_CHECK) {
        if ((((size_t)input) & 3) == 0) {   /* Input is 4-bytes aligned, leverage the speed benefit */
            return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_aligned);
    }   }

    return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned);
#endif
}


/*******   Hash streaming   *******/
#ifndef XXH_NO_STREAM
/*! @ingroup XXH32_family */
XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void)
{
    return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t));
}
/*! @ingroup XXH32_family */
XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr)
{
    XXH_free(statePtr);
    return XXH_OK;
}

/*! @ingroup XXH32_family */
XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState, const XXH32_state_t* srcState)
{
    XXH_memcpy(dstState, srcState, sizeof(*dstState));
}

/*! @ingroup XXH32_family */
XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, XXH32_hash_t seed)
{
    XXH_ASSERT(statePtr != NULL);
    memset(statePtr, 0, sizeof(*statePtr));
    XXH32_initAccs(statePtr->acc, seed);
    return XXH_OK;
}


/*! @ingroup XXH32_family */
XXH_PUBLIC_API XXH_errorcode
XXH32_update(XXH32_state_t* state, const void* input, size_t len)
{
    if (input==NULL) {
        XXH_ASSERT(len == 0);
        return XXH_OK;
    }

    state->total_len_32 += (XXH32_hash_t)len;
    state->large_len |= (XXH32_hash_t)((len>=16) | (state->total_len_32>=16));

    XXH_ASSERT(state->bufferedSize < sizeof(state->buffer));
    if (len < sizeof(state->buffer) - state->bufferedSize)  {   /* fill in tmp buffer */
        XXH_memcpy(state->buffer + state->bufferedSize, input, len);
        state->bufferedSize += (XXH32_hash_t)len;
        return XXH_OK;
    }

    {   const xxh_u8* xinput = (const xxh_u8*)input;
        const xxh_u8* const bEnd = xinput + len;

        if (state->bufferedSize) {   /* non-empty buffer: complete first */
            XXH_memcpy(state->buffer + state->bufferedSize, xinput, sizeof(state->buffer) - state->bufferedSize);
            xinput += sizeof(state->buffer) - state->bufferedSize;
            /* then process one round */
            (void)XXH32_consumeLong(state->acc, state->buffer, sizeof(state->buffer), XXH_aligned);
            state->bufferedSize = 0;
        }

        XXH_ASSERT(xinput <= bEnd);
        if ((size_t)(bEnd - xinput) >= sizeof(state->buffer)) {
            /* Process the remaining data */
            xinput = XXH32_consumeLong(state->acc, xinput, (size_t)(bEnd - xinput), XXH_unaligned);
        }

        if (xinput < bEnd) {
            /* Copy the leftover to the tmp buffer */
            XXH_memcpy(state->buffer, xinput, (size_t)(bEnd-xinput));
            state->bufferedSize = (unsigned)(bEnd-xinput);
        }
    }

    return XXH_OK;
}


/*! @ingroup XXH32_family */
XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t* state)
{
    xxh_u32 h32;

    if (state->large_len) {
        h32 = XXH32_mergeAccs(state->acc);
    } else {
        h32 = state->acc[2] /* == seed */ + XXH_PRIME32_5;
    }

    h32 += state->total_len_32;

    return XXH32_finalize(h32, state->buffer, state->bufferedSize, XXH_aligned);
}
#endif /* !XXH_NO_STREAM */

/*******   Canonical representation   *******/

/*! @ingroup XXH32_family */
XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash)
{
    XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t));
    if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash);
    XXH_memcpy(dst, &hash, sizeof(*dst));
}
/*! @ingroup XXH32_family */
XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src)
{
    return XXH_readBE32(src);
}


#ifndef XXH_NO_LONG_LONG

/* *******************************************************************
*  64-bit hash functions
*********************************************************************/
/*!
 * @}
 * @ingroup impl
 * @{
 */
/*******   Memory access   *******/

typedef XXH64_hash_t xxh_u64;

#ifdef XXH_OLD_NAMES
#  define U64 xxh_u64
#endif

#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
/*
 * Manual byteshift. Best for old compilers which don't inline memcpy.
 * We actually directly use XXH_readLE64 and XXH_readBE64.
 */
#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))

/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */
static xxh_u64 XXH_read64(const void* memPtr)
{
    return *(const xxh_u64*) memPtr;
}

#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))

/*
 * __attribute__((aligned(1))) is supported by gcc and clang. Originally the
 * documentation claimed that it only increased the alignment, but actually it
 * can decrease it on gcc, clang, and icc:
 * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69502,
 * https://gcc.godbolt.org/z/xYez1j67Y.
 */
#ifdef XXH_OLD_NAMES
typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((__packed__)) unalign64;
#endif
static xxh_u64 XXH_read64(const void* ptr)
{
    typedef __attribute__((__aligned__(1))) xxh_u64 xxh_unalign64;
    return *((const xxh_unalign64*)ptr);
}

#else

/*
 * Portable and safe solution. Generally efficient.
 * see: https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html
 */
static xxh_u64 XXH_read64(const void* memPtr)
{
    xxh_u64 val;
    XXH_memcpy(&val, memPtr, sizeof(val));
    return val;
}

#endif   /* XXH_FORCE_DIRECT_MEMORY_ACCESS */

#if defined(_MSC_VER)     /* Visual Studio */
#  define XXH_swap64 _byteswap_uint64
#elif XXH_GCC_VERSION >= 403
#  define XXH_swap64 __builtin_bswap64
#else
static xxh_u64 XXH_swap64(xxh_u64 x)
{
    return  ((x << 56) & 0xff00000000000000ULL) |
            ((x << 40) & 0x00ff000000000000ULL) |
            ((x << 24) & 0x0000ff0000000000ULL) |
            ((x << 8)  & 0x000000ff00000000ULL) |
            ((x >> 8)  & 0x00000000ff000000ULL) |
            ((x >> 24) & 0x0000000000ff0000ULL) |
            ((x >> 40) & 0x000000000000ff00ULL) |
            ((x >> 56) & 0x00000000000000ffULL);
}
#endif


/* XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. */
#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))

XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* memPtr)
{
    const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;
    return bytePtr[0]
         | ((xxh_u64)bytePtr[1] << 8)
         | ((xxh_u64)bytePtr[2] << 16)
         | ((xxh_u64)bytePtr[3] << 24)
         | ((xxh_u64)bytePtr[4] << 32)
         | ((xxh_u64)bytePtr[5] << 40)
         | ((xxh_u64)bytePtr[6] << 48)
         | ((xxh_u64)bytePtr[7] << 56);
}

XXH_FORCE_INLINE xxh_u64 XXH_readBE64(const void* memPtr)
{
    const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;
    return bytePtr[7]
         | ((xxh_u64)bytePtr[6] << 8)
         | ((xxh_u64)bytePtr[5] << 16)
         | ((xxh_u64)bytePtr[4] << 24)
         | ((xxh_u64)bytePtr[3] << 32)
         | ((xxh_u64)bytePtr[2] << 40)
         | ((xxh_u64)bytePtr[1] << 48)
         | ((xxh_u64)bytePtr[0] << 56);
}

#else
XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* ptr)
{
    return XXH_CPU_LITTLE_ENDIAN ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr));
}

static xxh_u64 XXH_readBE64(const void* ptr)
{
    return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr);
}
#endif

XXH_FORCE_INLINE xxh_u64
XXH_readLE64_align(const void* ptr, XXH_alignment align)
{
    if (align==XXH_unaligned)
        return XXH_readLE64(ptr);
    else
        return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u64*)ptr : XXH_swap64(*(const xxh_u64*)ptr);
}


/*******   xxh64   *******/
/*!
 * @}
 * @defgroup XXH64_impl XXH64 implementation
 * @ingroup impl
 *
 * Details on the XXH64 implementation.
 * @{
 */
/* #define rather that static const, to be used as initializers */
#define XXH_PRIME64_1  0x9E3779B185EBCA87ULL  /*!< 0b1001111000110111011110011011000110000101111010111100101010000111 */
#define XXH_PRIME64_2  0xC2B2AE3D27D4EB4FULL  /*!< 0b1100001010110010101011100011110100100111110101001110101101001111 */
#define XXH_PRIME64_3  0x165667B19E3779F9ULL  /*!< 0b0001011001010110011001111011000110011110001101110111100111111001 */
#define XXH_PRIME64_4  0x85EBCA77C2B2AE63ULL  /*!< 0b1000010111101011110010100111011111000010101100101010111001100011 */
#define XXH_PRIME64_5  0x27D4EB2F165667C5ULL  /*!< 0b0010011111010100111010110010111100010110010101100110011111000101 */

#ifdef XXH_OLD_NAMES
#  define PRIME64_1 XXH_PRIME64_1
#  define PRIME64_2 XXH_PRIME64_2
#  define PRIME64_3 XXH_PRIME64_3
#  define PRIME64_4 XXH_PRIME64_4
#  define PRIME64_5 XXH_PRIME64_5
#endif

/*! @copydoc XXH32_round */
static xxh_u64 XXH64_round(xxh_u64 acc, xxh_u64 input)
{
    acc += input * XXH_PRIME64_2;
    acc  = XXH_rotl64(acc, 31);
    acc *= XXH_PRIME64_1;
#if (defined(__AVX512F__)) && !defined(XXH_ENABLE_AUTOVECTORIZE)
    /*
     * DISABLE AUTOVECTORIZATION:
     * A compiler fence is used to prevent GCC and Clang from
     * autovectorizing the XXH64 loop (pragmas and attributes don't work for some
     * reason) without globally disabling AVX512.
     *
     * Autovectorization of XXH64 tends to be detrimental,
     * though the exact outcome may change depending on exact cpu and compiler version.
     * For information, it has been reported as detrimental for Skylake-X,
     * but possibly beneficial for Zen4.
     *
     * The default is to disable auto-vectorization,
     * but you can select to enable it instead using `XXH_ENABLE_AUTOVECTORIZE` build variable.
     */
    XXH_COMPILER_GUARD(acc);
#endif
    return acc;
}

static xxh_u64 XXH64_mergeRound(xxh_u64 acc, xxh_u64 val)
{
    val  = XXH64_round(0, val);
    acc ^= val;
    acc  = acc * XXH_PRIME64_1 + XXH_PRIME64_4;
    return acc;
}

/*! @copydoc XXH32_avalanche */
static xxh_u64 XXH64_avalanche(xxh_u64 hash)
{
    hash ^= hash >> 33;
    hash *= XXH_PRIME64_2;
    hash ^= hash >> 29;
    hash *= XXH_PRIME64_3;
    hash ^= hash >> 32;
    return hash;
}


#define XXH_get64bits(p) XXH_readLE64_align(p, align)

/*!
 * @internal
 * @brief Sets up the initial accumulator state for XXH64().
 */
XXH_FORCE_INLINE void
XXH64_initAccs(xxh_u64 *acc, xxh_u64 seed)
{
    XXH_ASSERT(acc != NULL);
    acc[0] = seed + XXH_PRIME64_1 + XXH_PRIME64_2;
    acc[1] = seed + XXH_PRIME64_2;
    acc[2] = seed + 0;
    acc[3] = seed - XXH_PRIME64_1;
}

/*!
 * @internal
 * @brief Consumes a block of data for XXH64().
 *
 * @return the end input pointer.
 */
XXH_FORCE_INLINE const xxh_u8 *
XXH64_consumeLong(
    xxh_u64 *XXH_RESTRICT acc,
    xxh_u8 const *XXH_RESTRICT input,
    size_t len,
    XXH_alignment align
)
{
    const xxh_u8* const bEnd = input + len;
    const xxh_u8* const limit = bEnd - 31;
    XXH_ASSERT(acc != NULL);
    XXH_ASSERT(input != NULL);
    XXH_ASSERT(len >= 32);
    do {
        /* reroll on 32-bit */
        if (sizeof(void *) < sizeof(xxh_u64)) {
            size_t i;
            for (i = 0; i < 4; i++) {
                acc[i] = XXH64_round(acc[i], XXH_get64bits(input));
                input += 8;
            }
        } else {
            acc[0] = XXH64_round(acc[0], XXH_get64bits(input)); input += 8;
            acc[1] = XXH64_round(acc[1], XXH_get64bits(input)); input += 8;
            acc[2] = XXH64_round(acc[2], XXH_get64bits(input)); input += 8;
            acc[3] = XXH64_round(acc[3], XXH_get64bits(input)); input += 8;
        }
    } while (input < limit);

    return input;
}

/*!
 * @internal
 * @brief Merges the accumulator lanes together for XXH64()
 */
XXH_FORCE_INLINE XXH_PUREF xxh_u64
XXH64_mergeAccs(const xxh_u64 *acc)
{
    XXH_ASSERT(acc != NULL);
    {
        xxh_u64 h64 = XXH_rotl64(acc[0], 1) + XXH_rotl64(acc[1], 7)
                    + XXH_rotl64(acc[2], 12) + XXH_rotl64(acc[3], 18);
        /* reroll on 32-bit */
        if (sizeof(void *) < sizeof(xxh_u64)) {
            size_t i;
            for (i = 0; i < 4; i++) {
                h64 = XXH64_mergeRound(h64, acc[i]);
            }
        } else {
            h64 = XXH64_mergeRound(h64, acc[0]);
            h64 = XXH64_mergeRound(h64, acc[1]);
            h64 = XXH64_mergeRound(h64, acc[2]);
            h64 = XXH64_mergeRound(h64, acc[3]);
        }
        return h64;
    }
}

/*!
 * @internal
 * @brief Processes the last 0-31 bytes of @p ptr.
 *
 * There may be up to 31 bytes remaining to consume from the input.
 * This final stage will digest them to ensure that all input bytes are present
 * in the final mix.
 *
 * @param hash The hash to finalize.
 * @param ptr The pointer to the remaining input.
 * @param len The remaining length, modulo 32.
 * @param align Whether @p ptr is aligned.
 * @return The finalized hash
 * @see XXH32_finalize().
 */
XXH_STATIC XXH_PUREF xxh_u64
XXH64_finalize(xxh_u64 hash, const xxh_u8* ptr, size_t len, XXH_alignment align)
{
    if (ptr==NULL) XXH_ASSERT(len == 0);
    len &= 31;
    while (len >= 8) {
        xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr));
        ptr += 8;
        hash ^= k1;
        hash  = XXH_rotl64(hash,27) * XXH_PRIME64_1 + XXH_PRIME64_4;
        len -= 8;
    }
    if (len >= 4) {
        hash ^= (xxh_u64)(XXH_get32bits(ptr)) * XXH_PRIME64_1;
        ptr += 4;
        hash = XXH_rotl64(hash, 23) * XXH_PRIME64_2 + XXH_PRIME64_3;
        len -= 4;
    }
    while (len > 0) {
        hash ^= (*ptr++) * XXH_PRIME64_5;
        hash = XXH_rotl64(hash, 11) * XXH_PRIME64_1;
        --len;
    }
    return  XXH64_avalanche(hash);
}

#ifdef XXH_OLD_NAMES
#  define PROCESS1_64 XXH_PROCESS1_64
#  define PROCESS4_64 XXH_PROCESS4_64
#  define PROCESS8_64 XXH_PROCESS8_64
#else
#  undef XXH_PROCESS1_64
#  undef XXH_PROCESS4_64
#  undef XXH_PROCESS8_64
#endif

/*!
 * @internal
 * @brief The implementation for @ref XXH64().
 *
 * @param input , len , seed Directly passed from @ref XXH64().
 * @param align Whether @p input is aligned.
 * @return The calculated hash.
 */
XXH_FORCE_INLINE XXH_PUREF xxh_u64
XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment align)
{
    xxh_u64 h64;
    if (input==NULL) XXH_ASSERT(len == 0);

    if (len>=32) {  /* Process a large block of data */
        xxh_u64 acc[4];
        XXH64_initAccs(acc, seed);

        input = XXH64_consumeLong(acc, input, len, align);

        h64 = XXH64_mergeAccs(acc);
    } else {
        h64  = seed + XXH_PRIME64_5;
    }

    h64 += (xxh_u64) len;

    return XXH64_finalize(h64, input, len, align);
}


/*! @ingroup XXH64_family */
XXH_PUBLIC_API XXH64_hash_t XXH64 (XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed)
{
#if !defined(XXH_NO_STREAM) && XXH_SIZE_OPT >= 2
    /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
    XXH64_state_t state;
    XXH64_reset(&state, seed);
    XXH64_update(&state, (const xxh_u8*)input, len);
    return XXH64_digest(&state);
#else
    if (XXH_FORCE_ALIGN_CHECK) {
        if ((((size_t)input) & 7)==0) {  /* Input is aligned, let's leverage the speed advantage */
            return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_aligned);
    }   }

    return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned);

#endif
}

/*******   Hash Streaming   *******/
#ifndef XXH_NO_STREAM
/*! @ingroup XXH64_family*/
XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void)
{
    return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t));
}
/*! @ingroup XXH64_family */
XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr)
{
    XXH_free(statePtr);
    return XXH_OK;
}

/*! @ingroup XXH64_family */
XXH_PUBLIC_API void XXH64_copyState(XXH_NOESCAPE XXH64_state_t* dstState, const XXH64_state_t* srcState)
{
    XXH_memcpy(dstState, srcState, sizeof(*dstState));
}

/*! @ingroup XXH64_family */
XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH_NOESCAPE XXH64_state_t* statePtr, XXH64_hash_t seed)
{
    XXH_ASSERT(statePtr != NULL);
    memset(statePtr, 0, sizeof(*statePtr));
    XXH64_initAccs(statePtr->acc, seed);
    return XXH_OK;
}

/*! @ingroup XXH64_family */
XXH_PUBLIC_API XXH_errorcode
XXH64_update (XXH_NOESCAPE XXH64_state_t* state, XXH_NOESCAPE const void* input, size_t len)
{
    if (input==NULL) {
        XXH_ASSERT(len == 0);
        return XXH_OK;
    }

    state->total_len += len;

    XXH_ASSERT(state->bufferedSize <= sizeof(state->buffer));
    if (len < sizeof(state->buffer) - state->bufferedSize)  {   /* fill in tmp buffer */
        XXH_memcpy(state->buffer + state->bufferedSize, input, len);
        state->bufferedSize += (XXH32_hash_t)len;
        return XXH_OK;
    }

    {   const xxh_u8* xinput = (const xxh_u8*)input;
        const xxh_u8* const bEnd = xinput + len;

        if (state->bufferedSize) {   /* non-empty buffer => complete first */
            XXH_memcpy(state->buffer + state->bufferedSize, xinput, sizeof(state->buffer) - state->bufferedSize);
            xinput += sizeof(state->buffer) - state->bufferedSize;
            /* and process one round */
            (void)XXH64_consumeLong(state->acc, state->buffer, sizeof(state->buffer), XXH_aligned);
            state->bufferedSize = 0;
        }

        XXH_ASSERT(xinput <= bEnd);
        if ((size_t)(bEnd - xinput) >= sizeof(state->buffer)) {
            /* Process the remaining data */
            xinput = XXH64_consumeLong(state->acc, xinput, (size_t)(bEnd - xinput), XXH_unaligned);
        }

        if (xinput < bEnd) {
            /* Copy the leftover to the tmp buffer */
            XXH_memcpy(state->buffer, xinput, (size_t)(bEnd-xinput));
            state->bufferedSize = (unsigned)(bEnd-xinput);
        }
    }

    return XXH_OK;
}


/*! @ingroup XXH64_family */
XXH_PUBLIC_API XXH64_hash_t XXH64_digest(XXH_NOESCAPE const XXH64_state_t* state)
{
    xxh_u64 h64;

    if (state->total_len >= 32) {
        h64 = XXH64_mergeAccs(state->acc);
    } else {
        h64  = state->acc[2] /*seed*/ + XXH_PRIME64_5;
    }

    h64 += (xxh_u64) state->total_len;

    return XXH64_finalize(h64, state->buffer, (size_t)state->total_len, XXH_aligned);
}
#endif /* !XXH_NO_STREAM */

/******* Canonical representation   *******/

/*! @ingroup XXH64_family */
XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH_NOESCAPE XXH64_canonical_t* dst, XXH64_hash_t hash)
{
    XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t));
    if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash);
    XXH_memcpy(dst, &hash, sizeof(*dst));
}

/*! @ingroup XXH64_family */
XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_canonical_t* src)
{
    return XXH_readBE64(src);
}

#ifndef XXH_NO_XXH3

/* *********************************************************************
*  XXH3
*  New generation hash designed for speed on small keys and vectorization
************************************************************************ */
/*!
 * @}
 * @defgroup XXH3_impl XXH3 implementation
 * @ingroup impl
 * @{
 */

/* ===   Compiler specifics   === */


#if (defined(__GNUC__) && (__GNUC__ >= 3))  \
  || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) \
  || defined(__clang__)
#    define XXH_likely(x) __builtin_expect(x, 1)
#    define XXH_unlikely(x) __builtin_expect(x, 0)
#else
#    define XXH_likely(x) (x)
#    define XXH_unlikely(x) (x)
#endif

#ifndef XXH_HAS_INCLUDE
#  ifdef __has_include
/*
 * Not defined as XXH_HAS_INCLUDE(x) (function-like) because
 * this causes segfaults in Apple Clang 4.2 (on Mac OS X 10.7 Lion)
 */
#    define XXH_HAS_INCLUDE __has_include
#  else
#    define XXH_HAS_INCLUDE(x) 0
#  endif
#endif

#if defined(__GNUC__) || defined(__clang__)
#  if defined(__ARM_FEATURE_SVE)
#    include <arm_sve.h>
#  endif
#  if defined(__ARM_NEON__) || defined(__ARM_NEON) \
   || (defined(_M_ARM) && _M_ARM >= 7) \
   || defined(_M_ARM64) || defined(_M_ARM64EC) \
   || (defined(__wasm_simd128__) && XXH_HAS_INCLUDE(<arm_neon.h>)) /* WASM SIMD128 via SIMDe */
#    define inline __inline__  /* circumvent a clang bug */
#    include <arm_neon.h>
#    undef inline
#  elif defined(__AVX2__)
#    include <immintrin.h>
#  elif defined(__SSE2__)
#    include <emmintrin.h>
#  elif defined(__loongarch_sx)
#    include <lsxintrin.h>
#  endif
#endif

#if defined(_MSC_VER)
#  include <intrin.h>
#endif

/*
 * One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while
 * remaining a true 64-bit/128-bit hash function.
 *
 * This is done by prioritizing a subset of 64-bit operations that can be
 * emulated without too many steps on the average 32-bit machine.
 *
 * For example, these two lines seem similar, and run equally fast on 64-bit:
 *
 *   xxh_u64 x;
 *   x ^= (x >> 47); // good
 *   x ^= (x >> 13); // bad
 *
 * However, to a 32-bit machine, there is a major difference.
 *
 * x ^= (x >> 47) looks like this:
 *
 *   x.lo ^= (x.hi >> (47 - 32));
 *
 * while x ^= (x >> 13) looks like this:
 *
 *   // note: funnel shifts are not usually cheap.
 *   x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13));
 *   x.hi ^= (x.hi >> 13);
 *
 * The first one is significantly faster than the second, simply because the
 * shift is larger than 32. This means:
 *  - All the bits we need are in the upper 32 bits, so we can ignore the lower
 *    32 bits in the shift.
 *  - The shift result will always fit in the lower 32 bits, and therefore,
 *    we can ignore the upper 32 bits in the xor.
 *
 * Thanks to this optimization, XXH3 only requires these features to be efficient:
 *
 *  - Usable unaligned access
 *  - A 32-bit or 64-bit ALU
 *      - If 32-bit, a decent ADC instruction
 *  - A 32 or 64-bit multiply with a 64-bit result
 *  - For the 128-bit variant, a decent byteswap helps short inputs.
 *
 * The first two are already required by XXH32, and almost all 32-bit and 64-bit
 * platforms which can run XXH32 can run XXH3 efficiently.
 *
 * Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one
 * notable exception.
 *
 * First of all, Thumb-1 lacks support for the UMULL instruction which
 * performs the important long multiply. This means numerous __aeabi_lmul
 * calls.
 *
 * Second of all, the 8 functional registers are just not enough.
 * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need
 * Lo registers, and this shuffling results in thousands more MOVs than A32.
 *
 * A32 and T32 don't have this limitation. They can access all 14 registers,
 * do a 32->64 multiply with UMULL, and the flexible operand allowing free
 * shifts is helpful, too.
 *
 * Therefore, we do a quick sanity check.
 *
 * If compiling Thumb-1 for a target which supports ARM instructions, we will
 * emit a warning, as it is not a "sane" platform to compile for.
 *
 * Usually, if this happens, it is because of an accident and you probably need
 * to specify -march, as you likely meant to compile for a newer architecture.
 *
 * Credit: large sections of the vectorial and asm source code paths
 *         have been contributed by @easyaspi314
 */
#if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM)
#   warning "XXH3 is highly inefficient without ARM or Thumb-2."
#endif

/* ==========================================
 * Vectorization detection
 * ========================================== */

#ifdef XXH_DOXYGEN
/*!
 * @ingroup tuning
 * @brief Overrides the vectorization implementation chosen for XXH3.
 *
 * Can be defined to 0 to disable SIMD or any of the values mentioned in
 * @ref XXH_VECTOR_TYPE.
 *
 * If this is not defined, it uses predefined macros to determine the best
 * implementation.
 */
#  define XXH_VECTOR XXH_SCALAR
/*!
 * @ingroup tuning
 * @brief Selects the minimum alignment for XXH3's accumulators.
 *
 * When using SIMD, this should match the alignment required for said vector
 * type, so, for example, 32 for AVX2.
 *
 * Default: Auto detected.
 */
#  define XXH_ACC_ALIGN 8
#endif

/* Actual definition */
#ifndef XXH_DOXYGEN
#endif

#ifndef XXH_VECTOR    /* can be defined on command line */
#  if defined(__ARM_FEATURE_SVE)
#    define XXH_VECTOR XXH_SVE
#  elif ( \
        defined(__ARM_NEON__) || defined(__ARM_NEON) /* gcc */ \
     || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC) /* msvc */ \
     || (defined(__wasm_simd128__) && XXH_HAS_INCLUDE(<arm_neon.h>)) /* wasm simd128 via SIMDe */ \
   ) && ( \
        defined(_WIN32) || defined(__LITTLE_ENDIAN__) /* little endian only */ \
    || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) \
   )
#    define XXH_VECTOR XXH_NEON
#  elif defined(__AVX512F__)
#    define XXH_VECTOR XXH_AVX512
#  elif defined(__AVX2__)
#    define XXH_VECTOR XXH_AVX2
#  elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2))
#    define XXH_VECTOR XXH_SSE2
#  elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) \
     || (defined(__s390x__) && defined(__VEC__)) \
     && defined(__GNUC__) /* TODO: IBM XL */
#    define XXH_VECTOR XXH_VSX
#  elif defined(__loongarch_sx)
#    define XXH_VECTOR XXH_LSX
#  else
#    define XXH_VECTOR XXH_SCALAR
#  endif
#endif

/* __ARM_FEATURE_SVE is only supported by GCC & Clang. */
#if (XXH_VECTOR == XXH_SVE) && !defined(__ARM_FEATURE_SVE)
#  ifdef _MSC_VER
#    pragma warning(once : 4606)
#  else
#    warning "__ARM_FEATURE_SVE isn't supported. Use SCALAR instead."
#  endif
#  undef XXH_VECTOR
#  define XXH_VECTOR XXH_SCALAR
#endif

/*
 * Controls the alignment of the accumulator,
 * for compatibility with aligned vector loads, which are usually faster.
 */
#ifndef XXH_ACC_ALIGN
#  if defined(XXH_X86DISPATCH)
#     define XXH_ACC_ALIGN 64  /* for compatibility with avx512 */
#  elif XXH_VECTOR == XXH_SCALAR  /* scalar */
#     define XXH_ACC_ALIGN 8
#  elif XXH_VECTOR == XXH_SSE2  /* sse2 */
#     define XXH_ACC_ALIGN 16
#  elif XXH_VECTOR == XXH_AVX2  /* avx2 */
#     define XXH_ACC_ALIGN 32
#  elif XXH_VECTOR == XXH_NEON  /* neon */
#     define XXH_ACC_ALIGN 16
#  elif XXH_VECTOR == XXH_VSX   /* vsx */
#     define XXH_ACC_ALIGN 16
#  elif XXH_VECTOR == XXH_AVX512  /* avx512 */
#     define XXH_ACC_ALIGN 64
#  elif XXH_VECTOR == XXH_SVE   /* sve */
#     define XXH_ACC_ALIGN 64
#  elif XXH_VECTOR == XXH_LSX   /* lsx */
#     define XXH_ACC_ALIGN 64
#  endif
#endif

#if defined(XXH_X86DISPATCH) || XXH_VECTOR == XXH_SSE2 \
    || XXH_VECTOR == XXH_AVX2 || XXH_VECTOR == XXH_AVX512
#  define XXH_SEC_ALIGN XXH_ACC_ALIGN
#elif XXH_VECTOR == XXH_SVE
#  define XXH_SEC_ALIGN XXH_ACC_ALIGN
#else
#  define XXH_SEC_ALIGN 8
#endif

#if defined(__GNUC__) || defined(__clang__)
#  define XXH_ALIASING __attribute__((__may_alias__))
#else
#  define XXH_ALIASING /* nothing */
#endif

/*
 * UGLY HACK:
 * GCC usually generates the best code with -O3 for xxHash.
 *
 * However, when targeting AVX2, it is overzealous in its unrolling resulting
 * in code roughly 3/4 the speed of Clang.
 *
 * There are other issues, such as GCC splitting _mm256_loadu_si256 into
 * _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization which
 * only applies to Sandy and Ivy Bridge... which don't even support AVX2.
 *
 * That is why when compiling the AVX2 version, it is recommended to use either
 *   -O2 -mavx2 -march=haswell
 * or
 *   -O2 -mavx2 -mno-avx256-split-unaligned-load
 * for decent performance, or to use Clang instead.
 *
 * Fortunately, we can control the first one with a pragma that forces GCC into
 * -O2, but the other one we can't control without "failed to inline always
 * inline function due to target mismatch" warnings.
 */
#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \
  && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
  && defined(__OPTIMIZE__) && XXH_SIZE_OPT <= 0 /* respect -O0 and -Os */
#  pragma GCC push_options
#  pragma GCC optimize("-O2")
#endif

#if XXH_VECTOR == XXH_NEON

/*
 * UGLY HACK: While AArch64 GCC on Linux does not seem to care, on macOS, GCC -O3
 * optimizes out the entire hashLong loop because of the aliasing violation.
 *
 * However, GCC is also inefficient at load-store optimization with vld1q/vst1q,
 * so the only option is to mark it as aliasing.
 */
typedef uint64x2_t xxh_aliasing_uint64x2_t XXH_ALIASING;

/*!
 * @internal
 * @brief `vld1q_u64` but faster and alignment-safe.
 *
 * On AArch64, unaligned access is always safe, but on ARMv7-a, it is only
 * *conditionally* safe (`vld1` has an alignment bit like `movdq[ua]` in x86).
 *
 * GCC for AArch64 sees `vld1q_u8` as an intrinsic instead of a load, so it
 * prohibits load-store optimizations. Therefore, a direct dereference is used.
 *
 * Otherwise, `vld1q_u8` is used with `vreinterpretq_u8_u64` to do a safe
 * unaligned load.
 */
#if defined(__aarch64__) && defined(__GNUC__) && !defined(__clang__)
XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr) /* silence -Wcast-align */
{
    return *(xxh_aliasing_uint64x2_t const *)ptr;
}
#else
XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr)
{
    return vreinterpretq_u64_u8(vld1q_u8((uint8_t const*)ptr));
}
#endif

/*!
 * @internal
 * @brief `vmlal_u32` on low and high halves of a vector.
 *
 * This is a workaround for AArch64 GCC < 11 which implemented arm_neon.h with
 * inline assembly and were therefore incapable of merging the `vget_{low, high}_u32`
 * with `vmlal_u32`.
 */
#if defined(__aarch64__) && defined(__GNUC__) && !defined(__clang__) && __GNUC__ < 11
XXH_FORCE_INLINE uint64x2_t
XXH_vmlal_low_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs)
{
    /* Inline assembly is the only way */
    __asm__("umlal   %0.2d, %1.2s, %2.2s" : "+w" (acc) : "w" (lhs), "w" (rhs));
    return acc;
}
XXH_FORCE_INLINE uint64x2_t
XXH_vmlal_high_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs)
{
    /* This intrinsic works as expected */
    return vmlal_high_u32(acc, lhs, rhs);
}
#else
/* Portable intrinsic versions */
XXH_FORCE_INLINE uint64x2_t
XXH_vmlal_low_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs)
{
    return vmlal_u32(acc, vget_low_u32(lhs), vget_low_u32(rhs));
}
/*! @copydoc XXH_vmlal_low_u32
 * Assume the compiler converts this to vmlal_high_u32 on aarch64 */
XXH_FORCE_INLINE uint64x2_t
XXH_vmlal_high_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs)
{
    return vmlal_u32(acc, vget_high_u32(lhs), vget_high_u32(rhs));
}
#endif

/*!
 * @ingroup tuning
 * @brief Controls the NEON to scalar ratio for XXH3
 *
 * This can be set to 2, 4, 6, or 8.
 *
 * ARM Cortex CPUs are _very_ sensitive to how their pipelines are used.
 *
 * For example, the Cortex-A73 can dispatch 3 micro-ops per cycle, but only 2 of those
 * can be NEON. If you are only using NEON instructions, you are only using 2/3 of the CPU
 * bandwidth.
 *
 * This is even more noticeable on the more advanced cores like the Cortex-A76 which
 * can dispatch 8 micro-ops per cycle, but still only 2 NEON micro-ops at once.
 *
 * Therefore, to make the most out of the pipeline, it is beneficial to run 6 NEON lanes
 * and 2 scalar lanes, which is chosen by default.
 *
 * This does not apply to Apple processors or 32-bit processors, which run better with
 * full NEON. These will default to 8. Additionally, size-optimized builds run 8 lanes.
 *
 * This change benefits CPUs with large micro-op buffers without negatively affecting
 * most other CPUs:
 *
 *  | Chipset               | Dispatch type       | NEON only | 6:2 hybrid | Diff. |
 *  |:----------------------|:--------------------|----------:|-----------:|------:|
 *  | Snapdragon 730 (A76)  | 2 NEON/8 micro-ops  |  8.8 GB/s |  10.1 GB/s |  ~16% |
 *  | Snapdragon 835 (A73)  | 2 NEON/3 micro-ops  |  5.1 GB/s |   5.3 GB/s |   ~5% |
 *  | Marvell PXA1928 (A53) | In-order dual-issue |  1.9 GB/s |   1.9 GB/s |    0% |
 *  | Apple M1              | 4 NEON/8 micro-ops  | 37.3 GB/s |  36.1 GB/s |  ~-3% |
 *
 * It also seems to fix some bad codegen on GCC, making it almost as fast as clang.
 *
 * When using WASM SIMD128, if this is 2 or 6, SIMDe will scalarize 2 of the lanes meaning
 * it effectively becomes worse 4.
 *
 * @see XXH3_accumulate_512_neon()
 */
# ifndef XXH3_NEON_LANES
#  if (defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) \
   && !defined(__APPLE__) && XXH_SIZE_OPT <= 0
#   define XXH3_NEON_LANES 6
#  else
#   define XXH3_NEON_LANES XXH_ACC_NB
#  endif
# endif
#endif  /* XXH_VECTOR == XXH_NEON */

/*
 * VSX and Z Vector helpers.
 *
 * This is very messy, and any pull requests to clean this up are welcome.
 *
 * There are a lot of problems with supporting VSX and s390x, due to
 * inconsistent intrinsics, spotty coverage, and multiple endiannesses.
 */
#if XXH_VECTOR == XXH_VSX
/* Annoyingly, these headers _may_ define three macros: `bool`, `vector`,
 * and `pixel`. This is a problem for obvious reasons.
 *
 * These keywords are unnecessary; the spec literally says they are
 * equivalent to `__bool`, `__vector`, and `__pixel` and may be undef'd
 * after including the header.
 *
 * We use pragma push_macro/pop_macro to keep the namespace clean. */
#  pragma push_macro("bool")
#  pragma push_macro("vector")
#  pragma push_macro("pixel")
/* silence potential macro redefined warnings */
#  undef bool
#  undef vector
#  undef pixel

#  if defined(__s390x__)
#    include <s390intrin.h>
#  else
#    include <altivec.h>
#  endif

/* Restore the original macro values, if applicable. */
#  pragma pop_macro("pixel")
#  pragma pop_macro("vector")
#  pragma pop_macro("bool")

typedef __vector unsigned long long xxh_u64x2;
typedef __vector unsigned char xxh_u8x16;
typedef __vector unsigned xxh_u32x4;

/*
 * UGLY HACK: Similar to aarch64 macOS GCC, s390x GCC has the same aliasing issue.
 */
typedef xxh_u64x2 xxh_aliasing_u64x2 XXH_ALIASING;

# ifndef XXH_VSX_BE
#  if defined(__BIG_ENDIAN__) \
  || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
#    define XXH_VSX_BE 1
#  elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__
#    warning "-maltivec=be is not recommended. Please use native endianness."
#    define XXH_VSX_BE 1
#  else
#    define XXH_VSX_BE 0
#  endif
# endif /* !defined(XXH_VSX_BE) */

# if XXH_VSX_BE
#  if defined(__POWER9_VECTOR__) || (defined(__clang__) && defined(__s390x__))
#    define XXH_vec_revb vec_revb
#  else
/*!
 * A polyfill for POWER9's vec_revb().
 */
XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val)
{
    xxh_u8x16 const vByteSwap = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
                                  0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 };
    return vec_perm(val, val, vByteSwap);
}
#  endif
# endif /* XXH_VSX_BE */

/*!
 * Performs an unaligned vector load and byte swaps it on big endian.
 */
XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr)
{
    xxh_u64x2 ret;
    XXH_memcpy(&ret, ptr, sizeof(xxh_u64x2));
# if XXH_VSX_BE
    ret = XXH_vec_revb(ret);
# endif
    return ret;
}

/*
 * vec_mulo and vec_mule are very problematic intrinsics on PowerPC
 *
 * These intrinsics weren't added until GCC 8, despite existing for a while,
 * and they are endian dependent. Also, their meaning swap depending on version.
 * */
# if defined(__s390x__)
 /* s390x is always big endian, no issue on this platform */
#  define XXH_vec_mulo vec_mulo
#  define XXH_vec_mule vec_mule
# elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw) && !defined(__ibmxl__)
/* Clang has a better way to control this, we can just use the builtin which doesn't swap. */
 /* The IBM XL Compiler (which defined __clang__) only implements the vec_* operations */
#  define XXH_vec_mulo __builtin_altivec_vmulouw
#  define XXH_vec_mule __builtin_altivec_vmuleuw
# else
/* gcc needs inline assembly */
/* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */
XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mulo(xxh_u32x4 a, xxh_u32x4 b)
{
    xxh_u64x2 result;
    __asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
    return result;
}
XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b)
{
    xxh_u64x2 result;
    __asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
    return result;
}
# endif /* XXH_vec_mulo, XXH_vec_mule */
#endif /* XXH_VECTOR == XXH_VSX */

#if XXH_VECTOR == XXH_SVE
#define ACCRND(acc, offset) \
do { \
    svuint64_t input_vec = svld1_u64(mask, xinput + offset);         \
    svuint64_t secret_vec = svld1_u64(mask, xsecret + offset);       \
    svuint64_t mixed = sveor_u64_x(mask, secret_vec, input_vec);     \
    svuint64_t swapped = svtbl_u64(input_vec, kSwap);                \
    svuint64_t mixed_lo = svextw_u64_x(mask, mixed);                 \
    svuint64_t mixed_hi = svlsr_n_u64_x(mask, mixed, 32);            \
    svuint64_t mul = svmad_u64_x(mask, mixed_lo, mixed_hi, swapped); \
    acc = svadd_u64_x(mask, acc, mul);                               \
} while (0)
#endif /* XXH_VECTOR == XXH_SVE */

/* prefetch
 * can be disabled, by declaring XXH_NO_PREFETCH build macro */
#if defined(XXH_NO_PREFETCH)
#  define XXH_PREFETCH(ptr)  (void)(ptr)  /* disabled */
#else
#  if XXH_SIZE_OPT >= 1
#    define XXH_PREFETCH(ptr) (void)(ptr)
#  elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))  /* _mm_prefetch() not defined outside of x86/x64 */
#    include <mmintrin.h>   /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
#    define XXH_PREFETCH(ptr)  _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
#  elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) )
#    define XXH_PREFETCH(ptr)  __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
#  else
#    define XXH_PREFETCH(ptr) (void)(ptr)  /* disabled */
#  endif
#endif  /* XXH_NO_PREFETCH */


/* ==========================================
 * XXH3 default settings
 * ========================================== */

#define XXH_SECRET_DEFAULT_SIZE 192   /* minimum XXH3_SECRET_SIZE_MIN */

#if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN)
#  error "default keyset is not large enough"
#endif

/*! Pseudorandom secret taken directly from FARSH. */
XXH_ALIGN(64) static const xxh_u8 XXH3_kSecret[XXH_SECRET_DEFAULT_SIZE] = {
    0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c,
    0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f,
    0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21,
    0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c,
    0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3,
    0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8,
    0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d,
    0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64,
    0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb,
    0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e,
    0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce,
    0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e,
};

static const xxh_u64 PRIME_MX1 = 0x165667919E3779F9ULL;  /*!< 0b0001011001010110011001111001000110011110001101110111100111111001 */
static const xxh_u64 PRIME_MX2 = 0x9FB21C651E98DF25ULL;  /*!< 0b1001111110110010000111000110010100011110100110001101111100100101 */

#ifdef XXH_OLD_NAMES
#  define kSecret XXH3_kSecret
#endif

#ifdef XXH_DOXYGEN
/*!
 * @brief Calculates a 32-bit to 64-bit long multiply.
 *
 * Implemented as a macro.
 *
 * Wraps `__emulu` on MSVC x86 because it tends to call `__allmul` when it doesn't
 * need to (but it shouldn't need to anyways, it is about 7 instructions to do
 * a 64x64 multiply...). Since we know that this will _always_ emit `MULL`, we
 * use that instead of the normal method.
 *
 * If you are compiling for platforms like Thumb-1 and don't have a better option,
 * you may also want to write your own long multiply routine here.
 *
 * @param x, y Numbers to be multiplied
 * @return 64-bit product of the low 32 bits of @p x and @p y.
 */
XXH_FORCE_INLINE xxh_u64
XXH_mult32to64(xxh_u64 x, xxh_u64 y)
{
   return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF);
}
#elif defined(_MSC_VER) && defined(_M_IX86)
#    define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y))
#else
/*
 * Downcast + upcast is usually better than masking on older compilers like
 * GCC 4.2 (especially 32-bit ones), all without affecting newer compilers.
 *
 * The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both operands
 * and perform a full 64x64 multiply -- entirely redundant on 32-bit.
 */
#    define XXH_mult32to64(x, y) ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y))
#endif

/*!
 * @brief Calculates a 64->128-bit long multiply.
 *
 * Uses `__uint128_t` and `_umul128` if available, otherwise uses a scalar
 * version.
 *
 * @param lhs , rhs The 64-bit integers to be multiplied
 * @return The 128-bit result represented in an @ref XXH128_hash_t.
 */
static XXH128_hash_t
XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs)
{
    /*
     * GCC/Clang __uint128_t method.
     *
     * On most 64-bit targets, GCC and Clang define a __uint128_t type.
     * This is usually the best way as it usually uses a native long 64-bit
     * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64.
     *
     * Usually.
     *
     * Despite being a 32-bit platform, Clang (and emscripten) define this type
     * despite not having the arithmetic for it. This results in a laggy
     * compiler builtin call which calculates a full 128-bit multiply.
     * In that case it is best to use the portable one.
     * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677
     */
#if (defined(__GNUC__) || defined(__clang__)) && !defined(__wasm__) \
    && defined(__SIZEOF_INT128__) \
    || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)

    __uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs;
    XXH128_hash_t r128;
    r128.low64  = (xxh_u64)(product);
    r128.high64 = (xxh_u64)(product >> 64);
    return r128;

    /*
     * MSVC for x64's _umul128 method.
     *
     * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct);
     *
     * This compiles to single operand MUL on x64.
     */
#elif (defined(_M_X64) || defined(_M_IA64)) && !defined(_M_ARM64EC)

#ifndef _MSC_VER
#   pragma intrinsic(_umul128)
#endif
    xxh_u64 product_high;
    xxh_u64 const product_low = _umul128(lhs, rhs, &product_high);
    XXH128_hash_t r128;
    r128.low64  = product_low;
    r128.high64 = product_high;
    return r128;

    /*
     * MSVC for ARM64's __umulh method.
     *
     * This compiles to the same MUL + UMULH as GCC/Clang's __uint128_t method.
     */
#elif defined(_M_ARM64) || defined(_M_ARM64EC)

#ifndef _MSC_VER
#   pragma intrinsic(__umulh)
#endif
    XXH128_hash_t r128;
    r128.low64  = lhs * rhs;
    r128.high64 = __umulh(lhs, rhs);
    return r128;

#else
    /*
     * Portable scalar method. Optimized for 32-bit and 64-bit ALUs.
     *
     * This is a fast and simple grade school multiply, which is shown below
     * with base 10 arithmetic instead of base 0x100000000.
     *
     *           9 3 // D2 lhs = 93
     *         x 7 5 // D2 rhs = 75
     *     ----------
     *           1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 15
     *         4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) = 45
     *         2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) = 21
     *     + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) = 63
     *     ---------
     *         2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 = 27
     *     + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63 = 67
     *     ---------
     *       6 9 7 5 // D4 res = (27 * 10) + (15 % 10) + (67 * 100) = 6975
     *
     * The reasons for adding the products like this are:
     *  1. It avoids manual carry tracking. Just like how
     *     (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX.
     *     This avoids a lot of complexity.
     *
     *  2. It hints for, and on Clang, compiles to, the powerful UMAAL
     *     instruction available in ARM's Digital Signal Processing extension
     *     in 32-bit ARMv6 and later, which is shown below:
     *
     *         void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm)
     *         {
     *             xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm;
     *             *RdLo = (xxh_u32)(product & 0xFFFFFFFF);
     *             *RdHi = (xxh_u32)(product >> 32);
     *         }
     *
     *     This instruction was designed for efficient long multiplication, and
     *     allows this to be calculated in only 4 instructions at speeds
     *     comparable to some 64-bit ALUs.
     *
     *  3. It isn't terrible on other platforms. Usually this will be a couple
     *     of 32-bit ADD/ADCs.
     */

    /* First calculate all of the cross products. */
    xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF);
    xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32,        rhs & 0xFFFFFFFF);
    xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32);
    xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32,        rhs >> 32);

    /* Now add the products together. These will never overflow. */
    xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
    xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32)        + hi_hi;
    xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF);

    XXH128_hash_t r128;
    r128.low64  = lower;
    r128.high64 = upper;
    return r128;
#endif
}

/*!
 * @brief Calculates a 64-bit to 128-bit multiply, then XOR folds it.
 *
 * The reason for the separate function is to prevent passing too many structs
 * around by value. This will hopefully inline the multiply, but we don't force it.
 *
 * @param lhs , rhs The 64-bit integers to multiply
 * @return The low 64 bits of the product XOR'd by the high 64 bits.
 * @see XXH_mult64to128()
 */
static xxh_u64
XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs)
{
    XXH128_hash_t product = XXH_mult64to128(lhs, rhs);
    return product.low64 ^ product.high64;
}

/*! Seems to produce slightly better code on GCC for some reason. */
XXH_FORCE_INLINE XXH_CONSTF xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift)
{
    XXH_ASSERT(0 <= shift && shift < 64);
    return v64 ^ (v64 >> shift);
}

/*
 * This is a fast avalanche stage,
 * suitable when input bits are already partially mixed
 */
static XXH64_hash_t XXH3_avalanche(xxh_u64 h64)
{
    h64 = XXH_xorshift64(h64, 37);
    h64 *= PRIME_MX1;
    h64 = XXH_xorshift64(h64, 32);
    return h64;
}

/*
 * This is a stronger avalanche,
 * inspired by Pelle Evensen's rrmxmx
 * preferable when input has not been previously mixed
 */
static XXH64_hash_t XXH3_rrmxmx(xxh_u64 h64, xxh_u64 len)
{
    /* this mix is inspired by Pelle Evensen's rrmxmx */
    h64 ^= XXH_rotl64(h64, 49) ^ XXH_rotl64(h64, 24);
    h64 *= PRIME_MX2;
    h64 ^= (h64 >> 35) + len ;
    h64 *= PRIME_MX2;
    return XXH_xorshift64(h64, 28);
}


/* ==========================================
 * Short keys
 * ==========================================
 * One of the shortcomings of XXH32 and XXH64 was that their performance was
 * sub-optimal on short lengths. It used an iterative algorithm which strongly
 * favored lengths that were a multiple of 4 or 8.
 *
 * Instead of iterating over individual inputs, we use a set of single shot
 * functions which piece together a range of lengths and operate in constant time.
 *
 * Additionally, the number of multiplies has been significantly reduced. This
 * reduces latency, especially when emulating 64-bit multiplies on 32-bit.
 *
 * Depending on the platform, this may or may not be faster than XXH32, but it
 * is almost guaranteed to be faster than XXH64.
 */

/*
 * At very short lengths, there isn't enough input to fully hide secrets, or use
 * the entire secret.
 *
 * There is also only a limited amount of mixing we can do before significantly
 * impacting performance.
 *
 * Therefore, we use different sections of the secret and always mix two secret
 * samples with an XOR. This should have no effect on performance on the
 * seedless or withSeed variants because everything _should_ be constant folded
 * by modern compilers.
 *
 * The XOR mixing hides individual parts of the secret and increases entropy.
 *
 * This adds an extra layer of strength for custom secrets.
 */
XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t
XXH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
{
    XXH_ASSERT(input != NULL);
    XXH_ASSERT(1 <= len && len <= 3);
    XXH_ASSERT(secret != NULL);
    /*
     * len = 1: combined = { input[0], 0x01, input[0], input[0] }
     * len = 2: combined = { input[1], 0x02, input[0], input[1] }
     * len = 3: combined = { input[2], 0x03, input[0], input[1] }
     */
    {   xxh_u8  const c1 = input[0];
        xxh_u8  const c2 = input[len >> 1];
        xxh_u8  const c3 = input[len - 1];
        xxh_u32 const combined = ((xxh_u32)c1 << 16) | ((xxh_u32)c2  << 24)
                               | ((xxh_u32)c3 <<  0) | ((xxh_u32)len << 8);
        xxh_u64 const bitflip = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed;
        xxh_u64 const keyed = (xxh_u64)combined ^ bitflip;
        return XXH64_avalanche(keyed);
    }
}

XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t
XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
{
    XXH_ASSERT(input != NULL);
    XXH_ASSERT(secret != NULL);
    XXH_ASSERT(4 <= len && len <= 8);
    seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32;
    {   xxh_u32 const input1 = XXH_readLE32(input);
        xxh_u32 const input2 = XXH_readLE32(input + len - 4);
        xxh_u64 const bitflip = (XXH_readLE64(secret+8) ^ XXH_readLE64(secret+16)) - seed;
        xxh_u64 const input64 = input2 + (((xxh_u64)input1) << 32);
        xxh_u64 const keyed = input64 ^ bitflip;
        return XXH3_rrmxmx(keyed, len);
    }
}

XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t
XXH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
{
    XXH_ASSERT(input != NULL);
    XXH_ASSERT(secret != NULL);
    XXH_ASSERT(9 <= len && len <= 16);
    {   xxh_u64 const bitflip1 = (XXH_readLE64(secret+24) ^ XXH_readLE64(secret+32)) + seed;
        xxh_u64 const bitflip2 = (XXH_readLE64(secret+40) ^ XXH_readLE64(secret+48)) - seed;
        xxh_u64 const input_lo = XXH_readLE64(input)           ^ bitflip1;
        xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ bitflip2;
        xxh_u64 const acc = len
                          + XXH_swap64(input_lo) + input_hi
                          + XXH3_mul128_fold64(input_lo, input_hi);
        return XXH3_avalanche(acc);
    }
}

XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t
XXH3_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
{
    XXH_ASSERT(len <= 16);
    {   if (XXH_likely(len >  8)) return XXH3_len_9to16_64b(input, len, secret, seed);
        if (XXH_likely(len >= 4)) return XXH3_len_4to8_64b(input, len, secret, seed);
        if (len) return XXH3_len_1to3_64b(input, len, secret, seed);
        return XXH64_avalanche(seed ^ (XXH_readLE64(secret+56) ^ XXH_readLE64(secret+64)));
    }
}

/*
 * DISCLAIMER: There are known *seed-dependent* multicollisions here due to
 * multiplication by zero, affecting hashes of lengths 17 to 240.
 *
 * However, they are very unlikely.
 *
 * Keep this in mind when using the unseeded XXH3_64bits() variant: As with all
 * unseeded non-cryptographic hashes, it does not attempt to defend itself
 * against specially crafted inputs, only random inputs.
 *
 * Compared to classic UMAC where a 1 in 2^31 chance of 4 consecutive bytes
 * cancelling out the secret is taken an arbitrary number of times (addressed
 * in XXH3_accumulate_512), this collision is very unlikely with random inputs
 * and/or proper seeding:
 *
 * This only has a 1 in 2^63 chance of 8 consecutive bytes cancelling out, in a
 * function that is only called up to 16 times per hash with up to 240 bytes of
 * input.
 *
 * This is not too bad for a non-cryptographic hash function, especially with
 * only 64 bit outputs.
 *
 * The 128-bit variant (which trades some speed for strength) is NOT affected
 * by this, although it is always a good idea to use a proper seed if you care
 * about strength.
 */
XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8* XXH_RESTRICT input,
                                     const xxh_u8* XXH_RESTRICT secret, xxh_u64 seed64)
{
#if defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
  && defined(__i386__) && defined(__SSE2__)  /* x86 + SSE2 */ \
  && !defined(XXH_ENABLE_AUTOVECTORIZE)      /* Define to disable like XXH32 hack */
    /*
     * UGLY HACK:
     * GCC for x86 tends to autovectorize the 128-bit multiply, resulting in
     * slower code.
     *
     * By forcing seed64 into a register, we disrupt the cost model and
     * cause it to scalarize. See `XXH32_round()`
     *
     * FIXME: Clang's output is still _much_ faster -- On an AMD Ryzen 3600,
     * XXH3_64bits @ len=240 runs at 4.6 GB/s with Clang 9, but 3.3 GB/s on
     * GCC 9.2, despite both emitting scalar code.
     *
     * GCC generates much better scalar code than Clang for the rest of XXH3,
     * which is why finding a more optimal codepath is an interest.
     */
    XXH_COMPILER_GUARD(seed64);
#endif
    {   xxh_u64 const input_lo = XXH_readLE64(input);
        xxh_u64 const input_hi = XXH_readLE64(input+8);
        return XXH3_mul128_fold64(
            input_lo ^ (XXH_readLE64(secret)   + seed64),
            input_hi ^ (XXH_readLE64(secret+8) - seed64)
        );
    }
}

/* For mid range keys, XXH3 uses a Mum-hash variant. */
XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t
XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
                     const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
                     XXH64_hash_t seed)
{
    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
    XXH_ASSERT(16 < len && len <= 128);

    {   xxh_u64 acc = len * XXH_PRIME64_1;
#if XXH_SIZE_OPT >= 1
        /* Smaller and cleaner, but slightly slower. */
        unsigned int i = (unsigned int)(len - 1) / 32;
        do {
            acc += XXH3_mix16B(input+16 * i, secret+32*i, seed);
            acc += XXH3_mix16B(input+len-16*(i+1), secret+32*i+16, seed);
        } while (i-- != 0);
#else
        if (len > 32) {
            if (len > 64) {
                if (len > 96) {
                    acc += XXH3_mix16B(input+48, secret+96, seed);
                    acc += XXH3_mix16B(input+len-64, secret+112, seed);
                }
                acc += XXH3_mix16B(input+32, secret+64, seed);
                acc += XXH3_mix16B(input+len-48, secret+80, seed);
            }
            acc += XXH3_mix16B(input+16, secret+32, seed);
            acc += XXH3_mix16B(input+len-32, secret+48, seed);
        }
        acc += XXH3_mix16B(input+0, secret+0, seed);
        acc += XXH3_mix16B(input+len-16, secret+16, seed);
#endif
        return XXH3_avalanche(acc);
    }
}

XXH_NO_INLINE XXH_PUREF XXH64_hash_t
XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
                      const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
                      XXH64_hash_t seed)
{
    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
    XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);

    #define XXH3_MIDSIZE_STARTOFFSET 3
    #define XXH3_MIDSIZE_LASTOFFSET  17

    {   xxh_u64 acc = len * XXH_PRIME64_1;
        xxh_u64 acc_end;
        unsigned int const nbRounds = (unsigned int)len / 16;
        unsigned int i;
        XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
        for (i=0; i<8; i++) {
            acc += XXH3_mix16B(input+(16*i), secret+(16*i), seed);
        }
        /* last bytes */
        acc_end = XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed);
        XXH_ASSERT(nbRounds >= 8);
        acc = XXH3_avalanche(acc);
#if defined(__clang__)                                /* Clang */ \
    && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \
    && !defined(XXH_ENABLE_AUTOVECTORIZE)             /* Define to disable */
        /*
         * UGLY HACK:
         * Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86.
         * In everywhere else, it uses scalar code.
         *
         * For 64->128-bit multiplies, even if the NEON was 100% optimal, it
         * would still be slower than UMAAL (see XXH_mult64to128).
         *
         * Unfortunately, Clang doesn't handle the long multiplies properly and
         * converts them to the nonexistent "vmulq_u64" intrinsic, which is then
         * scalarized into an ugly mess of VMOV.32 instructions.
         *
         * This mess is difficult to avoid without turning autovectorization
         * off completely, but they are usually relatively minor and/or not
         * worth it to fix.
         *
         * This loop is the easiest to fix, as unlike XXH32, this pragma
         * _actually works_ because it is a loop vectorization instead of an
         * SLP vectorization.
         */
        #pragma clang loop vectorize(disable)
#endif
        for (i=8 ; i < nbRounds; i++) {
            /*
             * Prevents clang for unrolling the acc loop and interleaving with this one.
             */
            XXH_COMPILER_GUARD(acc);
            acc_end += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed);
        }
        return XXH3_avalanche(acc + acc_end);
    }
}


/* =======     Long Keys     ======= */

#define XXH_STRIPE_LEN 64
#define XXH_SECRET_CONSUME_RATE 8   /* nb of secret bytes consumed at each accumulation */
#define XXH_ACC_NB (XXH_STRIPE_LEN / sizeof(xxh_u64))

#ifdef XXH_OLD_NAMES
#  define STRIPE_LEN XXH_STRIPE_LEN
#  define ACC_NB XXH_ACC_NB
#endif

#ifndef XXH_PREFETCH_DIST
#  ifdef __clang__
#    define XXH_PREFETCH_DIST 320
#  else
#    if (XXH_VECTOR == XXH_AVX512)
#      define XXH_PREFETCH_DIST 512
#    else
#      define XXH_PREFETCH_DIST 384
#    endif
#  endif  /* __clang__ */
#endif  /* XXH_PREFETCH_DIST */

/*
 * These macros are to generate an XXH3_accumulate() function.
 * The two arguments select the name suffix and target attribute.
 *
 * The name of this symbol is XXH3_accumulate_<name>() and it calls
 * XXH3_accumulate_512_<name>().
 *
 * It may be useful to hand implement this function if the compiler fails to
 * optimize the inline function.
 */
#define XXH3_ACCUMULATE_TEMPLATE(name)                      \
void                                                        \
XXH3_accumulate_##name(xxh_u64* XXH_RESTRICT acc,           \
                       const xxh_u8* XXH_RESTRICT input,    \
                       const xxh_u8* XXH_RESTRICT secret,   \
                       size_t nbStripes)                    \
{                                                           \
    size_t n;                                               \
    for (n = 0; n < nbStripes; n++ ) {                      \
        const xxh_u8* const in = input + n*XXH_STRIPE_LEN;  \
        XXH_PREFETCH(in + XXH_PREFETCH_DIST);               \
        XXH3_accumulate_512_##name(                         \
                 acc,                                       \
                 in,                                        \
                 secret + n*XXH_SECRET_CONSUME_RATE);       \
    }                                                       \
}


XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64)
{
    if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64);
    XXH_memcpy(dst, &v64, sizeof(v64));
}

/* Several intrinsic functions below are supposed to accept __int64 as argument,
 * as documented in https://software.intel.com/sites/landingpage/IntrinsicsGuide/ .
 * However, several environments do not define __int64 type,
 * requiring a workaround.
 */
#if !defined (__VMS) \
  && (defined (__cplusplus) \
  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
    typedef int64_t xxh_i64;
#else
    /* the following type must have a width of 64-bit */
    typedef long long xxh_i64;
#endif


/*
 * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the most optimized.
 *
 * It is a hardened version of UMAC, based off of FARSH's implementation.
 *
 * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD
 * implementations, and it is ridiculously fast.
 *
 * We harden it by mixing the original input to the accumulators as well as the product.
 *
 * This means that in the (relatively likely) case of a multiply by zero, the
 * original input is preserved.
 *
 * On 128-bit inputs, we swap 64-bit pairs when we add the input to improve
 * cross-pollination, as otherwise the upper and lower halves would be
 * essentially independent.
 *
 * This doesn't matter on 64-bit hashes since they all get merged together in
 * the end, so we skip the extra step.
 *
 * Both XXH3_64bits and XXH3_128bits use this subroutine.
 */

#if (XXH_VECTOR == XXH_AVX512) \
     || (defined(XXH_DISPATCH_AVX512) && XXH_DISPATCH_AVX512 != 0)

#ifndef XXH_TARGET_AVX512
# define XXH_TARGET_AVX512  /* disable attribute target */
#endif

XXH_FORCE_INLINE XXH_TARGET_AVX512 void
XXH3_accumulate_512_avx512(void* XXH_RESTRICT acc,
                     const void* XXH_RESTRICT input,
                     const void* XXH_RESTRICT secret)
{
    __m512i* const xacc = (__m512i *) acc;
    XXH_ASSERT((((size_t)acc) & 63) == 0);
    XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i));

    {
        /* data_vec    = input[0]; */
        __m512i const data_vec    = _mm512_loadu_si512   (input);
        /* key_vec     = secret[0]; */
        __m512i const key_vec     = _mm512_loadu_si512   (secret);
        /* data_key    = data_vec ^ key_vec; */
        __m512i const data_key    = _mm512_xor_si512     (data_vec, key_vec);
        /* data_key_lo = data_key >> 32; */
        __m512i const data_key_lo = _mm512_srli_epi64 (data_key, 32);
        /* product     = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
        __m512i const product     = _mm512_mul_epu32     (data_key, data_key_lo);
        /* xacc[0] += swap(data_vec); */
        __m512i const data_swap = _mm512_shuffle_epi32(data_vec, (_MM_PERM_ENUM)_MM_SHUFFLE(1, 0, 3, 2));
        __m512i const sum       = _mm512_add_epi64(*xacc, data_swap);
        /* xacc[0] += product; */
        *xacc = _mm512_add_epi64(product, sum);
    }
}
XXH_FORCE_INLINE XXH_TARGET_AVX512 XXH3_ACCUMULATE_TEMPLATE(avx512)

/*
 * XXH3_scrambleAcc: Scrambles the accumulators to improve mixing.
 *
 * Multiplication isn't perfect, as explained by Google in HighwayHash:
 *
 *  // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to
 *  // varying degrees. In descending order of goodness, bytes
 *  // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32.
 *  // As expected, the upper and lower bytes are much worse.
 *
 * Source: https://github.com/google/highwayhash/blob/0aaf66b/highwayhash/hh_avx2.h#L291
 *
 * Since our algorithm uses a pseudorandom secret to add some variance into the
 * mix, we don't need to (or want to) mix as often or as much as HighwayHash does.
 *
 * This isn't as tight as XXH3_accumulate, but still written in SIMD to avoid
 * extraction.
 *
 * Both XXH3_64bits and XXH3_128bits use this subroutine.
 */

XXH_FORCE_INLINE XXH_TARGET_AVX512 void
XXH3_scrambleAcc_avx512(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
{
    XXH_ASSERT((((size_t)acc) & 63) == 0);
    XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i));
    {   __m512i* const xacc = (__m512i*) acc;
        const __m512i prime32 = _mm512_set1_epi32((int)XXH_PRIME32_1);

        /* xacc[0] ^= (xacc[0] >> 47) */
        __m512i const acc_vec     = *xacc;
        __m512i const shifted     = _mm512_srli_epi64    (acc_vec, 47);
        /* xacc[0] ^= secret; */
        __m512i const key_vec     = _mm512_loadu_si512   (secret);
        __m512i const data_key    = _mm512_ternarylogic_epi32(key_vec, acc_vec, shifted, 0x96 /* key_vec ^ acc_vec ^ shifted */);

        /* xacc[0] *= XXH_PRIME32_1; */
        __m512i const data_key_hi = _mm512_srli_epi64 (data_key, 32);
        __m512i const prod_lo     = _mm512_mul_epu32     (data_key, prime32);
        __m512i const prod_hi     = _mm512_mul_epu32     (data_key_hi, prime32);
        *xacc = _mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32));
    }
}

XXH_FORCE_INLINE XXH_TARGET_AVX512 void
XXH3_initCustomSecret_avx512(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
{
    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 63) == 0);
    XXH_STATIC_ASSERT(XXH_SEC_ALIGN == 64);
    XXH_ASSERT(((size_t)customSecret & 63) == 0);
    (void)(&XXH_writeLE64);
    {   int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m512i);
        __m512i const seed_pos = _mm512_set1_epi64((xxh_i64)seed64);
        __m512i const seed     = _mm512_mask_sub_epi64(seed_pos, 0xAA, _mm512_set1_epi8(0), seed_pos);

        const __m512i* const src  = (const __m512i*) ((const void*) XXH3_kSecret);
              __m512i* const dest = (      __m512i*) customSecret;
        int i;
        XXH_ASSERT(((size_t)src & 63) == 0); /* control alignment */
        XXH_ASSERT(((size_t)dest & 63) == 0);
        for (i=0; i < nbRounds; ++i) {
            dest[i] = _mm512_add_epi64(_mm512_load_si512(src + i), seed);
    }   }
}

#endif

#if (XXH_VECTOR == XXH_AVX2) \
    || (defined(XXH_DISPATCH_AVX2) && XXH_DISPATCH_AVX2 != 0)

#ifndef XXH_TARGET_AVX2
# define XXH_TARGET_AVX2  /* disable attribute target */
#endif

XXH_FORCE_INLINE XXH_TARGET_AVX2 void
XXH3_accumulate_512_avx2( void* XXH_RESTRICT acc,
                    const void* XXH_RESTRICT input,
                    const void* XXH_RESTRICT secret)
{
    XXH_ASSERT((((size_t)acc) & 31) == 0);
    {   __m256i* const xacc    =       (__m256i *) acc;
        /* Unaligned. This is mainly for pointer arithmetic, and because
         * _mm256_loadu_si256 requires  a const __m256i * pointer for some reason. */
        const         __m256i* const xinput  = (const __m256i *) input;
        /* Unaligned. This is mainly for pointer arithmetic, and because
         * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
        const         __m256i* const xsecret = (const __m256i *) secret;

        size_t i;
        for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) {
            /* data_vec    = xinput[i]; */
            __m256i const data_vec    = _mm256_loadu_si256    (xinput+i);
            /* key_vec     = xsecret[i]; */
            __m256i const key_vec     = _mm256_loadu_si256   (xsecret+i);
            /* data_key    = data_vec ^ key_vec; */
            __m256i const data_key    = _mm256_xor_si256     (data_vec, key_vec);
            /* data_key_lo = data_key >> 32; */
            __m256i const data_key_lo = _mm256_srli_epi64 (data_key, 32);
            /* product     = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
            __m256i const product     = _mm256_mul_epu32     (data_key, data_key_lo);
            /* xacc[i] += swap(data_vec); */
            __m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2));
            __m256i const sum       = _mm256_add_epi64(xacc[i], data_swap);
            /* xacc[i] += product; */
            xacc[i] = _mm256_add_epi64(product, sum);
    }   }
}
XXH_FORCE_INLINE XXH_TARGET_AVX2 XXH3_ACCUMULATE_TEMPLATE(avx2)

XXH_FORCE_INLINE XXH_TARGET_AVX2 void
XXH3_scrambleAcc_avx2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
{
    XXH_ASSERT((((size_t)acc) & 31) == 0);
    {   __m256i* const xacc = (__m256i*) acc;
        /* Unaligned. This is mainly for pointer arithmetic, and because
         * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
        const         __m256i* const xsecret = (const __m256i *) secret;
        const __m256i prime32 = _mm256_set1_epi32((int)XXH_PRIME32_1);

        size_t i;
        for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) {
            /* xacc[i] ^= (xacc[i] >> 47) */
            __m256i const acc_vec     = xacc[i];
            __m256i const shifted     = _mm256_srli_epi64    (acc_vec, 47);
            __m256i const data_vec    = _mm256_xor_si256     (acc_vec, shifted);
            /* xacc[i] ^= xsecret; */
            __m256i const key_vec     = _mm256_loadu_si256   (xsecret+i);
            __m256i const data_key    = _mm256_xor_si256     (data_vec, key_vec);

            /* xacc[i] *= XXH_PRIME32_1; */
            __m256i const data_key_hi = _mm256_srli_epi64 (data_key, 32);
            __m256i const prod_lo     = _mm256_mul_epu32     (data_key, prime32);
            __m256i const prod_hi     = _mm256_mul_epu32     (data_key_hi, prime32);
            xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32));
        }
    }
}

XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
{
    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 31) == 0);
    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE / sizeof(__m256i)) == 6);
    XXH_STATIC_ASSERT(XXH_SEC_ALIGN <= 64);
    (void)(&XXH_writeLE64);
    XXH_PREFETCH(customSecret);
    {   __m256i const seed = _mm256_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64, (xxh_i64)(0U - seed64), (xxh_i64)seed64);

        const __m256i* const src  = (const __m256i*) ((const void*) XXH3_kSecret);
              __m256i*       dest = (      __m256i*) customSecret;

#       if defined(__GNUC__) || defined(__clang__)
        /*
         * On GCC & Clang, marking 'dest' as modified will cause the compiler:
         *   - do not extract the secret from sse registers in the internal loop
         *   - use less common registers, and avoid pushing these reg into stack
         */
        XXH_COMPILER_GUARD(dest);
#       endif
        XXH_ASSERT(((size_t)src & 31) == 0); /* control alignment */
        XXH_ASSERT(((size_t)dest & 31) == 0);

        /* GCC -O2 need unroll loop manually */
        dest[0] = _mm256_add_epi64(_mm256_load_si256(src+0), seed);
        dest[1] = _mm256_add_epi64(_mm256_load_si256(src+1), seed);
        dest[2] = _mm256_add_epi64(_mm256_load_si256(src+2), seed);
        dest[3] = _mm256_add_epi64(_mm256_load_si256(src+3), seed);
        dest[4] = _mm256_add_epi64(_mm256_load_si256(src+4), seed);
        dest[5] = _mm256_add_epi64(_mm256_load_si256(src+5), seed);
    }
}

#endif

/* x86dispatch always generates SSE2 */
#if (XXH_VECTOR == XXH_SSE2) || defined(XXH_X86DISPATCH)

#ifndef XXH_TARGET_SSE2
# define XXH_TARGET_SSE2  /* disable attribute target */
#endif

XXH_FORCE_INLINE XXH_TARGET_SSE2 void
XXH3_accumulate_512_sse2( void* XXH_RESTRICT acc,
                    const void* XXH_RESTRICT input,
                    const void* XXH_RESTRICT secret)
{
    /* SSE2 is just a half-scale version of the AVX2 version. */
    XXH_ASSERT((((size_t)acc) & 15) == 0);
    {   __m128i* const xacc    =       (__m128i *) acc;
        /* Unaligned. This is mainly for pointer arithmetic, and because
         * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
        const         __m128i* const xinput  = (const __m128i *) input;
        /* Unaligned. This is mainly for pointer arithmetic, and because
         * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
        const         __m128i* const xsecret = (const __m128i *) secret;

        size_t i;
        for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) {
            /* data_vec    = xinput[i]; */
            __m128i const data_vec    = _mm_loadu_si128   (xinput+i);
            /* key_vec     = xsecret[i]; */
            __m128i const key_vec     = _mm_loadu_si128   (xsecret+i);
            /* data_key    = data_vec ^ key_vec; */
            __m128i const data_key    = _mm_xor_si128     (data_vec, key_vec);
            /* data_key_lo = data_key >> 32; */
            __m128i const data_key_lo = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));
            /* product     = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
            __m128i const product     = _mm_mul_epu32     (data_key, data_key_lo);
            /* xacc[i] += swap(data_vec); */
            __m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2));
            __m128i const sum       = _mm_add_epi64(xacc[i], data_swap);
            /* xacc[i] += product; */
            xacc[i] = _mm_add_epi64(product, sum);
    }   }
}
XXH_FORCE_INLINE XXH_TARGET_SSE2 XXH3_ACCUMULATE_TEMPLATE(sse2)

XXH_FORCE_INLINE XXH_TARGET_SSE2 void
XXH3_scrambleAcc_sse2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
{
    XXH_ASSERT((((size_t)acc) & 15) == 0);
    {   __m128i* const xacc = (__m128i*) acc;
        /* Unaligned. This is mainly for pointer arithmetic, and because
         * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
        const         __m128i* const xsecret = (const __m128i *) secret;
        const __m128i prime32 = _mm_set1_epi32((int)XXH_PRIME32_1);

        size_t i;
        for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) {
            /* xacc[i] ^= (xacc[i] >> 47) */
            __m128i const acc_vec     = xacc[i];
            __m128i const shifted     = _mm_srli_epi64    (acc_vec, 47);
            __m128i const data_vec    = _mm_xor_si128     (acc_vec, shifted);
            /* xacc[i] ^= xsecret[i]; */
            __m128i const key_vec     = _mm_loadu_si128   (xsecret+i);
            __m128i const data_key    = _mm_xor_si128     (data_vec, key_vec);

            /* xacc[i] *= XXH_PRIME32_1; */
            __m128i const data_key_hi = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));
            __m128i const prod_lo     = _mm_mul_epu32     (data_key, prime32);
            __m128i const prod_hi     = _mm_mul_epu32     (data_key_hi, prime32);
            xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32));
        }
    }
}

XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_initCustomSecret_sse2(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
{
    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);
    (void)(&XXH_writeLE64);
    {   int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m128i);

#       if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900
        /* MSVC 32bit mode does not support _mm_set_epi64x before 2015 */
        XXH_ALIGN(16) const xxh_i64 seed64x2[2] = { (xxh_i64)seed64, (xxh_i64)(0U - seed64) };
        __m128i const seed = _mm_load_si128((__m128i const*)seed64x2);
#       else
        __m128i const seed = _mm_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64);
#       endif
        int i;

        const void* const src16 = XXH3_kSecret;
        __m128i* dst16 = (__m128i*) customSecret;
#       if defined(__GNUC__) || defined(__clang__)
        /*
         * On GCC & Clang, marking 'dest' as modified will cause the compiler:
         *   - do not extract the secret from sse registers in the internal loop
         *   - use less common registers, and avoid pushing these reg into stack
         */
        XXH_COMPILER_GUARD(dst16);
#       endif
        XXH_ASSERT(((size_t)src16 & 15) == 0); /* control alignment */
        XXH_ASSERT(((size_t)dst16 & 15) == 0);

        for (i=0; i < nbRounds; ++i) {
            dst16[i] = _mm_add_epi64(_mm_load_si128((const __m128i *)src16+i), seed);
    }   }
}

#endif

#if (XXH_VECTOR == XXH_NEON)

/* forward declarations for the scalar routines */
XXH_FORCE_INLINE void
XXH3_scalarRound(void* XXH_RESTRICT acc, void const* XXH_RESTRICT input,
                 void const* XXH_RESTRICT secret, size_t lane);

XXH_FORCE_INLINE void
XXH3_scalarScrambleRound(void* XXH_RESTRICT acc,
                         void const* XXH_RESTRICT secret, size_t lane);

/*!
 * @internal
 * @brief The bulk processing loop for NEON and WASM SIMD128.
 *
 * The NEON code path is actually partially scalar when running on AArch64. This
 * is to optimize the pipelining and can have up to 15% speedup depending on the
 * CPU, and it also mitigates some GCC codegen issues.
 *
 * @see XXH3_NEON_LANES for configuring this and details about this optimization.
 *
 * NEON's 32-bit to 64-bit long multiply takes a half vector of 32-bit
 * integers instead of the other platforms which mask full 64-bit vectors,
 * so the setup is more complicated than just shifting right.
 *
 * Additionally, there is an optimization for 4 lanes at once noted below.
 *
 * Since, as stated, the most optimal amount of lanes for Cortexes is 6,
 * there needs to be *three* versions of the accumulate operation used
 * for the remaining 2 lanes.
 *
 * WASM's SIMD128 uses SIMDe's arm_neon.h polyfill because the intrinsics overlap
 * nearly perfectly.
 */

XXH_FORCE_INLINE void
XXH3_accumulate_512_neon( void* XXH_RESTRICT acc,
                    const void* XXH_RESTRICT input,
                    const void* XXH_RESTRICT secret)
{
    XXH_ASSERT((((size_t)acc) & 15) == 0);
    XXH_STATIC_ASSERT(XXH3_NEON_LANES > 0 && XXH3_NEON_LANES <= XXH_ACC_NB && XXH3_NEON_LANES % 2 == 0);
    {   /* GCC for darwin arm64 does not like aliasing here */
        xxh_aliasing_uint64x2_t* const xacc = (xxh_aliasing_uint64x2_t*) acc;
        /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. */
        uint8_t const* xinput = (const uint8_t *) input;
        uint8_t const* xsecret  = (const uint8_t *) secret;

        size_t i;
#ifdef __wasm_simd128__
        /*
         * On WASM SIMD128, Clang emits direct address loads when XXH3_kSecret
         * is constant propagated, which results in it converting it to this
         * inside the loop:
         *
         *    a = v128.load(XXH3_kSecret +  0 + $secret_offset, offset = 0)
         *    b = v128.load(XXH3_kSecret + 16 + $secret_offset, offset = 0)
         *    ...
         *
         * This requires a full 32-bit address immediate (and therefore a 6 byte
         * instruction) as well as an add for each offset.
         *
         * Putting an asm guard prevents it from folding (at the cost of losing
         * the alignment hint), and uses the free offset in `v128.load` instead
         * of adding secret_offset each time which overall reduces code size by
         * about a kilobyte and improves performance.
         */
        XXH_COMPILER_GUARD(xsecret);
#endif
        /* Scalar lanes use the normal scalarRound routine */
        for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) {
            XXH3_scalarRound(acc, input, secret, i);
        }
        i = 0;
        /* 4 NEON lanes at a time. */
        for (; i+1 < XXH3_NEON_LANES / 2; i+=2) {
            /* data_vec = xinput[i]; */
            uint64x2_t data_vec_1 = XXH_vld1q_u64(xinput  + (i * 16));
            uint64x2_t data_vec_2 = XXH_vld1q_u64(xinput  + ((i+1) * 16));
            /* key_vec  = xsecret[i];  */
            uint64x2_t key_vec_1  = XXH_vld1q_u64(xsecret + (i * 16));
            uint64x2_t key_vec_2  = XXH_vld1q_u64(xsecret + ((i+1) * 16));
            /* data_swap = swap(data_vec) */
            uint64x2_t data_swap_1 = vextq_u64(data_vec_1, data_vec_1, 1);
            uint64x2_t data_swap_2 = vextq_u64(data_vec_2, data_vec_2, 1);
            /* data_key = data_vec ^ key_vec; */
            uint64x2_t data_key_1 = veorq_u64(data_vec_1, key_vec_1);
            uint64x2_t data_key_2 = veorq_u64(data_vec_2, key_vec_2);

            /*
             * If we reinterpret the 64x2 vectors as 32x4 vectors, we can use a
             * de-interleave operation for 4 lanes in 1 step with `vuzpq_u32` to
             * get one vector with the low 32 bits of each lane, and one vector
             * with the high 32 bits of each lane.
             *
             * The intrinsic returns a double vector because the original ARMv7-a
             * instruction modified both arguments in place. AArch64 and SIMD128 emit
             * two instructions from this intrinsic.
             *
             *  [ dk11L | dk11H | dk12L | dk12H ] -> [ dk11L | dk12L | dk21L | dk22L ]
             *  [ dk21L | dk21H | dk22L | dk22H ] -> [ dk11H | dk12H | dk21H | dk22H ]
             */
            uint32x4x2_t unzipped = vuzpq_u32(
                vreinterpretq_u32_u64(data_key_1),
                vreinterpretq_u32_u64(data_key_2)
            );
            /* data_key_lo = data_key & 0xFFFFFFFF */
            uint32x4_t data_key_lo = unzipped.val[0];
            /* data_key_hi = data_key >> 32 */
            uint32x4_t data_key_hi = unzipped.val[1];
            /*
             * Then, we can split the vectors horizontally and multiply which, as for most
             * widening intrinsics, have a variant that works on both high half vectors
             * for free on AArch64. A similar instruction is available on SIMD128.
             *
             * sum = data_swap + (u64x2) data_key_lo * (u64x2) data_key_hi
             */
            uint64x2_t sum_1 = XXH_vmlal_low_u32(data_swap_1, data_key_lo, data_key_hi);
            uint64x2_t sum_2 = XXH_vmlal_high_u32(data_swap_2, data_key_lo, data_key_hi);
            /*
             * Clang reorders
             *    a += b * c;     // umlal   swap.2d, dkl.2s, dkh.2s
             *    c += a;         // add     acc.2d, acc.2d, swap.2d
             * to
             *    c += a;         // add     acc.2d, acc.2d, swap.2d
             *    c += b * c;     // umlal   acc.2d, dkl.2s, dkh.2s
             *
             * While it would make sense in theory since the addition is faster,
             * for reasons likely related to umlal being limited to certain NEON
             * pipelines, this is worse. A compiler guard fixes this.
             */
            XXH_COMPILER_GUARD_CLANG_NEON(sum_1);
            XXH_COMPILER_GUARD_CLANG_NEON(sum_2);
            /* xacc[i] = acc_vec + sum; */
            xacc[i]   = vaddq_u64(xacc[i], sum_1);
            xacc[i+1] = vaddq_u64(xacc[i+1], sum_2);
        }
        /* Operate on the remaining NEON lanes 2 at a time. */
        for (; i < XXH3_NEON_LANES / 2; i++) {
            /* data_vec = xinput[i]; */
            uint64x2_t data_vec = XXH_vld1q_u64(xinput  + (i * 16));
            /* key_vec  = xsecret[i];  */
            uint64x2_t key_vec  = XXH_vld1q_u64(xsecret + (i * 16));
            /* acc_vec_2 = swap(data_vec) */
            uint64x2_t data_swap = vextq_u64(data_vec, data_vec, 1);
            /* data_key = data_vec ^ key_vec; */
            uint64x2_t data_key = veorq_u64(data_vec, key_vec);
            /* For two lanes, just use VMOVN and VSHRN. */
            /* data_key_lo = data_key & 0xFFFFFFFF; */
            uint32x2_t data_key_lo = vmovn_u64(data_key);
            /* data_key_hi = data_key >> 32; */
            uint32x2_t data_key_hi = vshrn_n_u64(data_key, 32);
            /* sum = data_swap + (u64x2) data_key_lo * (u64x2) data_key_hi; */
            uint64x2_t sum = vmlal_u32(data_swap, data_key_lo, data_key_hi);
            /* Same Clang workaround as before */
            XXH_COMPILER_GUARD_CLANG_NEON(sum);
            /* xacc[i] = acc_vec + sum; */
            xacc[i] = vaddq_u64 (xacc[i], sum);
        }
    }
}
XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(neon)

XXH_FORCE_INLINE void
XXH3_scrambleAcc_neon(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
{
    XXH_ASSERT((((size_t)acc) & 15) == 0);

    {   xxh_aliasing_uint64x2_t* xacc       = (xxh_aliasing_uint64x2_t*) acc;
        uint8_t const* xsecret = (uint8_t const*) secret;

        size_t i;
        /* WASM uses operator overloads and doesn't need these. */
#ifndef __wasm_simd128__
        /* { prime32_1, prime32_1 } */
        uint32x2_t const kPrimeLo = vdup_n_u32(XXH_PRIME32_1);
        /* { 0, prime32_1, 0, prime32_1 } */
        uint32x4_t const kPrimeHi = vreinterpretq_u32_u64(vdupq_n_u64((xxh_u64)XXH_PRIME32_1 << 32));
#endif

        /* AArch64 uses both scalar and neon at the same time */
        for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) {
            XXH3_scalarScrambleRound(acc, secret, i);
        }
        for (i=0; i < XXH3_NEON_LANES / 2; i++) {
            /* xacc[i] ^= (xacc[i] >> 47); */
            uint64x2_t acc_vec  = xacc[i];
            uint64x2_t shifted  = vshrq_n_u64(acc_vec, 47);
            uint64x2_t data_vec = veorq_u64(acc_vec, shifted);

            /* xacc[i] ^= xsecret[i]; */
            uint64x2_t key_vec  = XXH_vld1q_u64(xsecret + (i * 16));
            uint64x2_t data_key = veorq_u64(data_vec, key_vec);
            /* xacc[i] *= XXH_PRIME32_1 */
#ifdef __wasm_simd128__
            /* SIMD128 has multiply by u64x2, use it instead of expanding and scalarizing */
            xacc[i] = data_key * XXH_PRIME32_1;
#else
            /*
             * Expanded version with portable NEON intrinsics
             *
             *    lo(x) * lo(y) + (hi(x) * lo(y) << 32)
             *
             * prod_hi = hi(data_key) * lo(prime) << 32
             *
             * Since we only need 32 bits of this multiply a trick can be used, reinterpreting the vector
             * as a uint32x4_t and multiplying by { 0, prime, 0, prime } to cancel out the unwanted bits
             * and avoid the shift.
             */
            uint32x4_t prod_hi = vmulq_u32 (vreinterpretq_u32_u64(data_key), kPrimeHi);
            /* Extract low bits for vmlal_u32  */
            uint32x2_t data_key_lo = vmovn_u64(data_key);
            /* xacc[i] = prod_hi + lo(data_key) * XXH_PRIME32_1; */
            xacc[i] = vmlal_u32(vreinterpretq_u64_u32(prod_hi), data_key_lo, kPrimeLo);
#endif
        }
    }
}
#endif

#if (XXH_VECTOR == XXH_VSX)

XXH_FORCE_INLINE void
XXH3_accumulate_512_vsx(  void* XXH_RESTRICT acc,
                    const void* XXH_RESTRICT input,
                    const void* XXH_RESTRICT secret)
{
    /* presumed aligned */
    xxh_aliasing_u64x2* const xacc = (xxh_aliasing_u64x2*) acc;
    xxh_u8 const* const xinput   = (xxh_u8 const*) input;   /* no alignment restriction */
    xxh_u8 const* const xsecret  = (xxh_u8 const*) secret;    /* no alignment restriction */
    xxh_u64x2 const v32 = { 32, 32 };
    size_t i;
    for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) {
        /* data_vec = xinput[i]; */
        xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + 16*i);
        /* key_vec = xsecret[i]; */
        xxh_u64x2 const key_vec  = XXH_vec_loadu(xsecret + 16*i);
        xxh_u64x2 const data_key = data_vec ^ key_vec;
        /* shuffled = (data_key << 32) | (data_key >> 32); */
        xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32);
        /* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */
        xxh_u64x2 const product  = XXH_vec_mulo((xxh_u32x4)data_key, shuffled);
        /* acc_vec = xacc[i]; */
        xxh_u64x2 acc_vec        = xacc[i];
        acc_vec += product;

        /* swap high and low halves */
#ifdef __s390x__
        acc_vec += vec_permi(data_vec, data_vec, 2);
#else
        acc_vec += vec_xxpermdi(data_vec, data_vec, 2);
#endif
        xacc[i] = acc_vec;
    }
}
XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(vsx)

XXH_FORCE_INLINE void
XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
{
    XXH_ASSERT((((size_t)acc) & 15) == 0);

    {   xxh_aliasing_u64x2* const xacc = (xxh_aliasing_u64x2*) acc;
        const xxh_u8* const xsecret = (const xxh_u8*) secret;
        /* constants */
        xxh_u64x2 const v32  = { 32, 32 };
        xxh_u64x2 const v47 = { 47, 47 };
        xxh_u32x4 const prime = { XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1 };
        size_t i;
        for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) {
            /* xacc[i] ^= (xacc[i] >> 47); */
            xxh_u64x2 const acc_vec  = xacc[i];
            xxh_u64x2 const data_vec = acc_vec ^ (acc_vec >> v47);

            /* xacc[i] ^= xsecret[i]; */
            xxh_u64x2 const key_vec  = XXH_vec_loadu(xsecret + 16*i);
            xxh_u64x2 const data_key = data_vec ^ key_vec;

            /* xacc[i] *= XXH_PRIME32_1 */
            /* prod_lo = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)prime & 0xFFFFFFFF);  */
            xxh_u64x2 const prod_even  = XXH_vec_mule((xxh_u32x4)data_key, prime);
            /* prod_hi = ((xxh_u64x2)data_key >> 32) * ((xxh_u64x2)prime >> 32);  */
            xxh_u64x2 const prod_odd  = XXH_vec_mulo((xxh_u32x4)data_key, prime);
            xacc[i] = prod_odd + (prod_even << v32);
    }   }
}

#endif

#if (XXH_VECTOR == XXH_SVE)

XXH_FORCE_INLINE void
XXH3_accumulate_512_sve( void* XXH_RESTRICT acc,
                   const void* XXH_RESTRICT input,
                   const void* XXH_RESTRICT secret)
{
    uint64_t *xacc = (uint64_t *)acc;
    const uint64_t *xinput = (const uint64_t *)(const void *)input;
    const uint64_t *xsecret = (const uint64_t *)(const void *)secret;
    svuint64_t kSwap = sveor_n_u64_z(svptrue_b64(), svindex_u64(0, 1), 1);
    uint64_t element_count = svcntd();
    if (element_count >= 8) {
        svbool_t mask = svptrue_pat_b64(SV_VL8);
        svuint64_t vacc = svld1_u64(mask, xacc);
        ACCRND(vacc, 0);
        svst1_u64(mask, xacc, vacc);
    } else if (element_count == 2) {   /* sve128 */
        svbool_t mask = svptrue_pat_b64(SV_VL2);
        svuint64_t acc0 = svld1_u64(mask, xacc + 0);
        svuint64_t acc1 = svld1_u64(mask, xacc + 2);
        svuint64_t acc2 = svld1_u64(mask, xacc + 4);
        svuint64_t acc3 = svld1_u64(mask, xacc + 6);
        ACCRND(acc0, 0);
        ACCRND(acc1, 2);
        ACCRND(acc2, 4);
        ACCRND(acc3, 6);
        svst1_u64(mask, xacc + 0, acc0);
        svst1_u64(mask, xacc + 2, acc1);
        svst1_u64(mask, xacc + 4, acc2);
        svst1_u64(mask, xacc + 6, acc3);
    } else {
        svbool_t mask = svptrue_pat_b64(SV_VL4);
        svuint64_t acc0 = svld1_u64(mask, xacc + 0);
        svuint64_t acc1 = svld1_u64(mask, xacc + 4);
        ACCRND(acc0, 0);
        ACCRND(acc1, 4);
        svst1_u64(mask, xacc + 0, acc0);
        svst1_u64(mask, xacc + 4, acc1);
    }
}

XXH_FORCE_INLINE void
XXH3_accumulate_sve(xxh_u64* XXH_RESTRICT acc,
               const xxh_u8* XXH_RESTRICT input,
               const xxh_u8* XXH_RESTRICT secret,
               size_t nbStripes)
{
    if (nbStripes != 0) {
        uint64_t *xacc = (uint64_t *)acc;
        const uint64_t *xinput = (const uint64_t *)(const void *)input;
        const uint64_t *xsecret = (const uint64_t *)(const void *)secret;
        svuint64_t kSwap = sveor_n_u64_z(svptrue_b64(), svindex_u64(0, 1), 1);
        uint64_t element_count = svcntd();
        if (element_count >= 8) {
            svbool_t mask = svptrue_pat_b64(SV_VL8);
            svuint64_t vacc = svld1_u64(mask, xacc + 0);
            do {
                /* svprfd(svbool_t, void *, enum svfprop); */
                svprfd(mask, xinput + 128, SV_PLDL1STRM);
                ACCRND(vacc, 0);
                xinput += 8;
                xsecret += 1;
                nbStripes--;
           } while (nbStripes != 0);

           svst1_u64(mask, xacc + 0, vacc);
        } else if (element_count == 2) { /* sve128 */
            svbool_t mask = svptrue_pat_b64(SV_VL2);
            svuint64_t acc0 = svld1_u64(mask, xacc + 0);
            svuint64_t acc1 = svld1_u64(mask, xacc + 2);
            svuint64_t acc2 = svld1_u64(mask, xacc + 4);
            svuint64_t acc3 = svld1_u64(mask, xacc + 6);
            do {
                svprfd(mask, xinput + 128, SV_PLDL1STRM);
                ACCRND(acc0, 0);
                ACCRND(acc1, 2);
                ACCRND(acc2, 4);
                ACCRND(acc3, 6);
                xinput += 8;
                xsecret += 1;
                nbStripes--;
           } while (nbStripes != 0);

           svst1_u64(mask, xacc + 0, acc0);
           svst1_u64(mask, xacc + 2, acc1);
           svst1_u64(mask, xacc + 4, acc2);
           svst1_u64(mask, xacc + 6, acc3);
        } else {
            svbool_t mask = svptrue_pat_b64(SV_VL4);
            svuint64_t acc0 = svld1_u64(mask, xacc + 0);
            svuint64_t acc1 = svld1_u64(mask, xacc + 4);
            do {
                svprfd(mask, xinput + 128, SV_PLDL1STRM);
                ACCRND(acc0, 0);
                ACCRND(acc1, 4);
                xinput += 8;
                xsecret += 1;
                nbStripes--;
           } while (nbStripes != 0);

           svst1_u64(mask, xacc + 0, acc0);
           svst1_u64(mask, xacc + 4, acc1);
       }
    }
}

#endif

#if (XXH_VECTOR == XXH_LSX)
#define _LSX_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))

XXH_FORCE_INLINE void
XXH3_accumulate_512_lsx( void* XXH_RESTRICT acc,
                    const void* XXH_RESTRICT input,
                    const void* XXH_RESTRICT secret)
{
    XXH_ASSERT((((size_t)acc) & 15) == 0);
    {
        __m128i* const xacc    =       (__m128i *) acc;
        const __m128i* const xinput  = (const __m128i *) input;
        const __m128i* const xsecret = (const __m128i *) secret;

        for (size_t i = 0; i < XXH_STRIPE_LEN / sizeof(__m128i); i++) {
            /* data_vec = xinput[i]; */
            __m128i const data_vec = __lsx_vld(xinput + i, 0);
            /* key_vec = xsecret[i]; */
            __m128i const key_vec = __lsx_vld(xsecret + i, 0);
            /* data_key = data_vec ^ key_vec; */
            __m128i const data_key = __lsx_vxor_v(data_vec, key_vec);
            /* data_key_lo = data_key >> 32; */
            __m128i const data_key_lo = __lsx_vsrli_d(data_key, 32);
            // __m128i const data_key_lo = __lsx_vsrli_d(data_key, 32);
            /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
            __m128i const product = __lsx_vmulwev_d_wu(data_key, data_key_lo);
            /* xacc[i] += swap(data_vec); */
            __m128i const data_swap = __lsx_vshuf4i_w(data_vec, _LSX_SHUFFLE(1, 0, 3, 2));
            __m128i const sum = __lsx_vadd_d(xacc[i], data_swap);
            /* xacc[i] += product; */
            xacc[i] = __lsx_vadd_d(product, sum);
        }
    }
}
XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(lsx)

XXH_FORCE_INLINE void
XXH3_scrambleAcc_lsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
{
    XXH_ASSERT((((size_t)acc) & 15) == 0);
    {
        __m128i* const xacc = (__m128i*) acc;
        const __m128i* const xsecret = (const __m128i *) secret;
        const __m128i prime32 = __lsx_vreplgr2vr_w((int)XXH_PRIME32_1);

        for (size_t i = 0; i < XXH_STRIPE_LEN / sizeof(__m128i); i++) {
            /* xacc[i] ^= (xacc[i] >> 47) */
            __m128i const acc_vec = xacc[i];
            __m128i const shifted = __lsx_vsrli_d(acc_vec, 47);
            __m128i const data_vec = __lsx_vxor_v(acc_vec, shifted);
            /* xacc[i] ^= xsecret[i]; */
            __m128i const key_vec = __lsx_vld(xsecret + i, 0);
            __m128i const data_key = __lsx_vxor_v(data_vec, key_vec);

            /* xacc[i] *= XXH_PRIME32_1; */
            __m128i const data_key_hi = __lsx_vsrli_d(data_key, 32);
            __m128i const prod_lo = __lsx_vmulwev_d_wu(data_key, prime32);
            __m128i const prod_hi = __lsx_vmulwev_d_wu(data_key_hi, prime32);
            xacc[i] = __lsx_vadd_d(prod_lo, __lsx_vslli_d(prod_hi, 32));
        }
    }
}

#endif

/* scalar variants - universal */

#if defined(__aarch64__) && (defined(__GNUC__) || defined(__clang__))
/*
 * In XXH3_scalarRound(), GCC and Clang have a similar codegen issue, where they
 * emit an excess mask and a full 64-bit multiply-add (MADD X-form).
 *
 * While this might not seem like much, as AArch64 is a 64-bit architecture, only
 * big Cortex designs have a full 64-bit multiplier.
 *
 * On the little cores, the smaller 32-bit multiplier is used, and full 64-bit
 * multiplies expand to 2-3 multiplies in microcode. This has a major penalty
 * of up to 4 latency cycles and 2 stall cycles in the multiply pipeline.
 *
 * Thankfully, AArch64 still provides the 32-bit long multiply-add (UMADDL) which does
 * not have this penalty and does the mask automatically.
 */
XXH_FORCE_INLINE xxh_u64
XXH_mult32to64_add64(xxh_u64 lhs, xxh_u64 rhs, xxh_u64 acc)
{
    xxh_u64 ret;
    /* note: %x = 64-bit register, %w = 32-bit register */
    __asm__("umaddl %x0, %w1, %w2, %x3" : "=r" (ret) : "r" (lhs), "r" (rhs), "r" (acc));
    return ret;
}
#else
XXH_FORCE_INLINE xxh_u64
XXH_mult32to64_add64(xxh_u64 lhs, xxh_u64 rhs, xxh_u64 acc)
{
    return XXH_mult32to64((xxh_u32)lhs, (xxh_u32)rhs) + acc;
}
#endif

/*!
 * @internal
 * @brief Scalar round for @ref XXH3_accumulate_512_scalar().
 *
 * This is extracted to its own function because the NEON path uses a combination
 * of NEON and scalar.
 */
XXH_FORCE_INLINE void
XXH3_scalarRound(void* XXH_RESTRICT acc,
                 void const* XXH_RESTRICT input,
                 void const* XXH_RESTRICT secret,
                 size_t lane)
{
    xxh_u64* xacc = (xxh_u64*) acc;
    xxh_u8 const* xinput  = (xxh_u8 const*) input;
    xxh_u8 const* xsecret = (xxh_u8 const*) secret;
    XXH_ASSERT(lane < XXH_ACC_NB);
    XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0);
    {
        xxh_u64 const data_val = XXH_readLE64(xinput + lane * 8);
        xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + lane * 8);
        xacc[lane ^ 1] += data_val; /* swap adjacent lanes */
        xacc[lane] = XXH_mult32to64_add64(data_key /* & 0xFFFFFFFF */, data_key >> 32, xacc[lane]);
    }
}

/*!
 * @internal
 * @brief Processes a 64 byte block of data using the scalar path.
 */
XXH_FORCE_INLINE void
XXH3_accumulate_512_scalar(void* XXH_RESTRICT acc,
                     const void* XXH_RESTRICT input,
                     const void* XXH_RESTRICT secret)
{
    size_t i;
    /* ARM GCC refuses to unroll this loop, resulting in a 24% slowdown on ARMv6. */
#if defined(__GNUC__) && !defined(__clang__) \
  && (defined(__arm__) || defined(__thumb2__)) \
  && defined(__ARM_FEATURE_UNALIGNED) /* no unaligned access just wastes bytes */ \
  && XXH_SIZE_OPT <= 0
#  pragma GCC unroll 8
#endif
    for (i=0; i < XXH_ACC_NB; i++) {
        XXH3_scalarRound(acc, input, secret, i);
    }
}
XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(scalar)

/*!
 * @internal
 * @brief Scalar scramble step for @ref XXH3_scrambleAcc_scalar().
 *
 * This is extracted to its own function because the NEON path uses a combination
 * of NEON and scalar.
 */
XXH_FORCE_INLINE void
XXH3_scalarScrambleRound(void* XXH_RESTRICT acc,
                         void const* XXH_RESTRICT secret,
                         size_t lane)
{
    xxh_u64* const xacc = (xxh_u64*) acc;   /* presumed aligned */
    const xxh_u8* const xsecret = (const xxh_u8*) secret;   /* no alignment restriction */
    XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0);
    XXH_ASSERT(lane < XXH_ACC_NB);
    {
        xxh_u64 const key64 = XXH_readLE64(xsecret + lane * 8);
        xxh_u64 acc64 = xacc[lane];
        acc64 = XXH_xorshift64(acc64, 47);
        acc64 ^= key64;
        acc64 *= XXH_PRIME32_1;
        xacc[lane] = acc64;
    }
}

/*!
 * @internal
 * @brief Scrambles the accumulators after a large chunk has been read
 */
XXH_FORCE_INLINE void
XXH3_scrambleAcc_scalar(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
{
    size_t i;
    for (i=0; i < XXH_ACC_NB; i++) {
        XXH3_scalarScrambleRound(acc, secret, i);
    }
}

XXH_FORCE_INLINE void
XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
{
    /*
     * We need a separate pointer for the hack below,
     * which requires a non-const pointer.
     * Any decent compiler will optimize this out otherwise.
     */
    const xxh_u8* kSecretPtr = XXH3_kSecret;
    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);

#if defined(__GNUC__) && defined(__aarch64__)
    /*
     * UGLY HACK:
     * GCC and Clang generate a bunch of MOV/MOVK pairs for aarch64, and they are
     * placed sequentially, in order, at the top of the unrolled loop.
     *
     * While MOVK is great for generating constants (2 cycles for a 64-bit
     * constant compared to 4 cycles for LDR), it fights for bandwidth with
     * the arithmetic instructions.
     *
     *   I   L   S
     * MOVK
     * MOVK
     * MOVK
     * MOVK
     * ADD
     * SUB      STR
     *          STR
     * By forcing loads from memory (as the asm line causes the compiler to assume
     * that XXH3_kSecretPtr has been changed), the pipelines are used more
     * efficiently:
     *   I   L   S
     *      LDR
     *  ADD LDR
     *  SUB     STR
     *          STR
     *
     * See XXH3_NEON_LANES for details on the pipsline.
     *
     * XXH3_64bits_withSeed, len == 256, Snapdragon 835
     *   without hack: 2654.4 MB/s
     *   with hack:    3202.9 MB/s
     */
    XXH_COMPILER_GUARD(kSecretPtr);
#endif
    {   int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16;
        int i;
        for (i=0; i < nbRounds; i++) {
            /*
             * The asm hack causes the compiler to assume that kSecretPtr aliases with
             * customSecret, and on aarch64, this prevented LDP from merging two
             * loads together for free. Putting the loads together before the stores
             * properly generates LDP.
             */
            xxh_u64 lo = XXH_readLE64(kSecretPtr + 16*i)     + seed64;
            xxh_u64 hi = XXH_readLE64(kSecretPtr + 16*i + 8) - seed64;
            XXH_writeLE64((xxh_u8*)customSecret + 16*i,     lo);
            XXH_writeLE64((xxh_u8*)customSecret + 16*i + 8, hi);
    }   }
}


typedef void (*XXH3_f_accumulate)(xxh_u64* XXH_RESTRICT, const xxh_u8* XXH_RESTRICT, const xxh_u8* XXH_RESTRICT, size_t);
typedef void (*XXH3_f_scrambleAcc)(void* XXH_RESTRICT, const void*);
typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64);


#if (XXH_VECTOR == XXH_AVX512)

#define XXH3_accumulate_512 XXH3_accumulate_512_avx512
#define XXH3_accumulate     XXH3_accumulate_avx512
#define XXH3_scrambleAcc    XXH3_scrambleAcc_avx512
#define XXH3_initCustomSecret XXH3_initCustomSecret_avx512

#elif (XXH_VECTOR == XXH_AVX2)

#define XXH3_accumulate_512 XXH3_accumulate_512_avx2
#define XXH3_accumulate     XXH3_accumulate_avx2
#define XXH3_scrambleAcc    XXH3_scrambleAcc_avx2
#define XXH3_initCustomSecret XXH3_initCustomSecret_avx2

#elif (XXH_VECTOR == XXH_SSE2)

#define XXH3_accumulate_512 XXH3_accumulate_512_sse2
#define XXH3_accumulate     XXH3_accumulate_sse2
#define XXH3_scrambleAcc    XXH3_scrambleAcc_sse2
#define XXH3_initCustomSecret XXH3_initCustomSecret_sse2

#elif (XXH_VECTOR == XXH_NEON)

#define XXH3_accumulate_512 XXH3_accumulate_512_neon
#define XXH3_accumulate     XXH3_accumulate_neon
#define XXH3_scrambleAcc    XXH3_scrambleAcc_neon
#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar

#elif (XXH_VECTOR == XXH_VSX)

#define XXH3_accumulate_512 XXH3_accumulate_512_vsx
#define XXH3_accumulate     XXH3_accumulate_vsx
#define XXH3_scrambleAcc    XXH3_scrambleAcc_vsx
#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar

#elif (XXH_VECTOR == XXH_SVE)
#define XXH3_accumulate_512 XXH3_accumulate_512_sve
#define XXH3_accumulate     XXH3_accumulate_sve
#define XXH3_scrambleAcc    XXH3_scrambleAcc_scalar
#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar

#elif (XXH_VECTOR == XXH_LSX)
#define XXH3_accumulate_512 XXH3_accumulate_512_lsx
#define XXH3_accumulate     XXH3_accumulate_lsx
#define XXH3_scrambleAcc    XXH3_scrambleAcc_lsx
#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar

#else /* scalar */

#define XXH3_accumulate_512 XXH3_accumulate_512_scalar
#define XXH3_accumulate     XXH3_accumulate_scalar
#define XXH3_scrambleAcc    XXH3_scrambleAcc_scalar
#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar

#endif

#if XXH_SIZE_OPT >= 1 /* don't do SIMD for initialization */
#  undef XXH3_initCustomSecret
#  define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
#endif

XXH_FORCE_INLINE void
XXH3_hashLong_internal_loop(xxh_u64* XXH_RESTRICT acc,
                      const xxh_u8* XXH_RESTRICT input, size_t len,
                      const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
                            XXH3_f_accumulate f_acc,
                            XXH3_f_scrambleAcc f_scramble)
{
    size_t const nbStripesPerBlock = (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE;
    size_t const block_len = XXH_STRIPE_LEN * nbStripesPerBlock;
    size_t const nb_blocks = (len - 1) / block_len;

    size_t n;

    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);

    for (n = 0; n < nb_blocks; n++) {
        f_acc(acc, input + n*block_len, secret, nbStripesPerBlock);
        f_scramble(acc, secret + secretSize - XXH_STRIPE_LEN);
    }

    /* last partial block */
    XXH_ASSERT(len > XXH_STRIPE_LEN);
    {   size_t const nbStripes = ((len - 1) - (block_len * nb_blocks)) / XXH_STRIPE_LEN;
        XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE));
        f_acc(acc, input + nb_blocks*block_len, secret, nbStripes);

        /* last stripe */
        {   const xxh_u8* const p = input + len - XXH_STRIPE_LEN;
#define XXH_SECRET_LASTACC_START 7  /* not aligned on 8, last secret is different from acc & scrambler */
            XXH3_accumulate_512(acc, p, secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START);
    }   }
}

XXH_FORCE_INLINE xxh_u64
XXH3_mix2Accs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret)
{
    return XXH3_mul128_fold64(
               acc[0] ^ XXH_readLE64(secret),
               acc[1] ^ XXH_readLE64(secret+8) );
}

static XXH_PUREF XXH64_hash_t
XXH3_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 start)
{
    xxh_u64 result64 = start;
    size_t i = 0;

    for (i = 0; i < 4; i++) {
        result64 += XXH3_mix2Accs(acc+2*i, secret + 16*i);
#if defined(__clang__)                                /* Clang */ \
    && (defined(__arm__) || defined(__thumb__))       /* ARMv7 */ \
    && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */  \
    && !defined(XXH_ENABLE_AUTOVECTORIZE)             /* Define to disable */
        /*
         * UGLY HACK:
         * Prevent autovectorization on Clang ARMv7-a. Exact same problem as
         * the one in XXH3_len_129to240_64b. Speeds up shorter keys > 240b.
         * XXH3_64bits, len == 256, Snapdragon 835:
         *   without hack: 2063.7 MB/s
         *   with hack:    2560.7 MB/s
         */
        XXH_COMPILER_GUARD(result64);
#endif
    }

    return XXH3_avalanche(result64);
}

/* do not align on 8, so that the secret is different from the accumulator */
#define XXH_SECRET_MERGEACCS_START 11

static XXH_PUREF XXH64_hash_t
XXH3_finalizeLong_64b(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 len)
{
    return XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START, len * XXH_PRIME64_1);
}

#define XXH3_INIT_ACC { XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3, \
                        XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1 }

XXH_FORCE_INLINE XXH64_hash_t
XXH3_hashLong_64b_internal(const void* XXH_RESTRICT input, size_t len,
                           const void* XXH_RESTRICT secret, size_t secretSize,
                           XXH3_f_accumulate f_acc,
                           XXH3_f_scrambleAcc f_scramble)
{
    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;

    XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, f_acc, f_scramble);

    /* converge into final hash */
    XXH_STATIC_ASSERT(sizeof(acc) == 64);
    XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
    return XXH3_finalizeLong_64b(acc, (const xxh_u8*)secret, (xxh_u64)len);
}

/*
 * It's important for performance to transmit secret's size (when it's static)
 * so that the compiler can properly optimize the vectorized loop.
 * This makes a big performance difference for "medium" keys (<1 KB) when using AVX instruction set.
 * When the secret size is unknown, or on GCC 12 where the mix of NO_INLINE and FORCE_INLINE
 * breaks -Og, this is XXH_NO_INLINE.
 */
XXH3_WITH_SECRET_INLINE XXH64_hash_t
XXH3_hashLong_64b_withSecret(const void* XXH_RESTRICT input, size_t len,
                             XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
{
    (void)seed64;
    return XXH3_hashLong_64b_internal(input, len, secret, secretLen, XXH3_accumulate, XXH3_scrambleAcc);
}

/*
 * It's preferable for performance that XXH3_hashLong is not inlined,
 * as it results in a smaller function for small data, easier to the instruction cache.
 * Note that inside this no_inline function, we do inline the internal loop,
 * and provide a statically defined secret size to allow optimization of vector loop.
 */
XXH_NO_INLINE XXH_PUREF XXH64_hash_t
XXH3_hashLong_64b_default(const void* XXH_RESTRICT input, size_t len,
                          XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
{
    (void)seed64; (void)secret; (void)secretLen;
    return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_accumulate, XXH3_scrambleAcc);
}

/*
 * XXH3_hashLong_64b_withSeed():
 * Generate a custom key based on alteration of default XXH3_kSecret with the seed,
 * and then use this key for long mode hashing.
 *
 * This operation is decently fast but nonetheless costs a little bit of time.
 * Try to avoid it whenever possible (typically when seed==0).
 *
 * It's important for performance that XXH3_hashLong is not inlined. Not sure
 * why (uop cache maybe?), but the difference is large and easily measurable.
 */
XXH_FORCE_INLINE XXH64_hash_t
XXH3_hashLong_64b_withSeed_internal(const void* input, size_t len,
                                    XXH64_hash_t seed,
                                    XXH3_f_accumulate f_acc,
                                    XXH3_f_scrambleAcc f_scramble,
                                    XXH3_f_initCustomSecret f_initSec)
{
#if XXH_SIZE_OPT <= 0
    if (seed == 0)
        return XXH3_hashLong_64b_internal(input, len,
                                          XXH3_kSecret, sizeof(XXH3_kSecret),
                                          f_acc, f_scramble);
#endif
    {   XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
        f_initSec(secret, seed);
        return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret),
                                          f_acc, f_scramble);
    }
}

/*
 * It's important for performance that XXH3_hashLong is not inlined.
 */
XXH_NO_INLINE XXH64_hash_t
XXH3_hashLong_64b_withSeed(const void* XXH_RESTRICT input, size_t len,
                           XXH64_hash_t seed, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
{
    (void)secret; (void)secretLen;
    return XXH3_hashLong_64b_withSeed_internal(input, len, seed,
                XXH3_accumulate, XXH3_scrambleAcc, XXH3_initCustomSecret);
}


typedef XXH64_hash_t (*XXH3_hashLong64_f)(const void* XXH_RESTRICT, size_t,
                                          XXH64_hash_t, const xxh_u8* XXH_RESTRICT, size_t);

XXH_FORCE_INLINE XXH64_hash_t
XXH3_64bits_internal(const void* XXH_RESTRICT input, size_t len,
                     XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen,
                     XXH3_hashLong64_f f_hashLong)
{
    XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN);
    /*
     * If an action is to be taken if `secretLen` condition is not respected,
     * it should be done here.
     * For now, it's a contract pre-condition.
     * Adding a check and a branch here would cost performance at every hash.
     * Also, note that function signature doesn't offer room to return an error.
     */
    if (len <= 16)
        return XXH3_len_0to16_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64);
    if (len <= 128)
        return XXH3_len_17to128_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);
    if (len <= XXH3_MIDSIZE_MAX)
        return XXH3_len_129to240_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);
    return f_hashLong(input, len, seed64, (const xxh_u8*)secret, secretLen);
}


/* ===   Public entry point   === */

/*! @ingroup XXH3_family */
XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(XXH_NOESCAPE const void* input, size_t length)
{
    return XXH3_64bits_internal(input, length, 0, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_default);
}

/*! @ingroup XXH3_family */
XXH_PUBLIC_API XXH64_hash_t
XXH3_64bits_withSecret(XXH_NOESCAPE const void* input, size_t length, XXH_NOESCAPE const void* secret, size_t secretSize)
{
    return XXH3_64bits_internal(input, length, 0, secret, secretSize, XXH3_hashLong_64b_withSecret);
}

/*! @ingroup XXH3_family */
XXH_PUBLIC_API XXH64_hash_t
XXH3_64bits_withSeed(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed)
{
    return XXH3_64bits_internal(input, length, seed, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_withSeed);
}

XXH_PUBLIC_API XXH64_hash_t
XXH3_64bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t length, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed)
{
    if (length <= XXH3_MIDSIZE_MAX)
        return XXH3_64bits_internal(input, length, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL);
    return XXH3_hashLong_64b_withSecret(input, length, seed, (const xxh_u8*)secret, secretSize);
}


/* ===   XXH3 streaming   === */
#ifndef XXH_NO_STREAM
/*
 * Malloc's a pointer that is always aligned to @align.
 *
 * This must be freed with `XXH_alignedFree()`.
 *
 * malloc typically guarantees 16 byte alignment on 64-bit systems and 8 byte
 * alignment on 32-bit. This isn't enough for the 32 byte aligned loads in AVX2
 * or on 32-bit, the 16 byte aligned loads in SSE2 and NEON.
 *
 * This underalignment previously caused a rather obvious crash which went
 * completely unnoticed due to XXH3_createState() not actually being tested.
 * Credit to RedSpah for noticing this bug.
 *
 * The alignment is done manually: Functions like posix_memalign or _mm_malloc
 * are avoided: To maintain portability, we would have to write a fallback
 * like this anyways, and besides, testing for the existence of library
 * functions without relying on external build tools is impossible.
 *
 * The method is simple: Overallocate, manually align, and store the offset
 * to the original behind the returned pointer.
 *
 * Align must be a power of 2 and 8 <= align <= 128.
 */
static XXH_MALLOCF void* XXH_alignedMalloc(size_t s, size_t align)
{
    XXH_ASSERT(align <= 128 && align >= 8); /* range check */
    XXH_ASSERT((align & (align-1)) == 0);   /* power of 2 */
    XXH_ASSERT(s != 0 && s < (s + align));  /* empty/overflow */
    {   /* Overallocate to make room for manual realignment and an offset byte */
        xxh_u8* base = (xxh_u8*)XXH_malloc(s + align);
        if (base != NULL) {
            /*
             * Get the offset needed to align this pointer.
             *
             * Even if the returned pointer is aligned, there will always be
             * at least one byte to store the offset to the original pointer.
             */
            size_t offset = align - ((size_t)base & (align - 1)); /* base % align */
            /* Add the offset for the now-aligned pointer */
            xxh_u8* ptr = base + offset;

            XXH_ASSERT((size_t)ptr % align == 0);

            /* Store the offset immediately before the returned pointer. */
            ptr[-1] = (xxh_u8)offset;
            return ptr;
        }
        return NULL;
    }
}
/*
 * Frees an aligned pointer allocated by XXH_alignedMalloc(). Don't pass
 * normal malloc'd pointers, XXH_alignedMalloc has a specific data layout.
 */
static void XXH_alignedFree(void* p)
{
    if (p != NULL) {
        xxh_u8* ptr = (xxh_u8*)p;
        /* Get the offset byte we added in XXH_malloc. */
        xxh_u8 offset = ptr[-1];
        /* Free the original malloc'd pointer */
        xxh_u8* base = ptr - offset;
        XXH_free(base);
    }
}
/*! @ingroup XXH3_family */
/*!
 * @brief Allocate an @ref XXH3_state_t.
 *
 * @return An allocated pointer of @ref XXH3_state_t on success.
 * @return `NULL` on failure.
 *
 * @note Must be freed with XXH3_freeState().
 *
 * @see @ref streaming_example "Streaming Example"
 */
XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void)
{
    XXH3_state_t* const state = (XXH3_state_t*)XXH_alignedMalloc(sizeof(XXH3_state_t), 64);
    if (state==NULL) return NULL;
    XXH3_INITSTATE(state);
    return state;
}

/*! @ingroup XXH3_family */
/*!
 * @brief Frees an @ref XXH3_state_t.
 *
 * @param statePtr A pointer to an @ref XXH3_state_t allocated with @ref XXH3_createState().
 *
 * @return @ref XXH_OK.
 *
 * @note Must be allocated with XXH3_createState().
 *
 * @see @ref streaming_example "Streaming Example"
 */
XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr)
{
    XXH_alignedFree(statePtr);
    return XXH_OK;
}

/*! @ingroup XXH3_family */
XXH_PUBLIC_API void
XXH3_copyState(XXH_NOESCAPE XXH3_state_t* dst_state, XXH_NOESCAPE const XXH3_state_t* src_state)
{
    XXH_memcpy(dst_state, src_state, sizeof(*dst_state));
}

static void
XXH3_reset_internal(XXH3_state_t* statePtr,
                    XXH64_hash_t seed,
                    const void* secret, size_t secretSize)
{
    size_t const initStart = offsetof(XXH3_state_t, bufferedSize);
    size_t const initLength = offsetof(XXH3_state_t, nbStripesPerBlock) - initStart;
    XXH_ASSERT(offsetof(XXH3_state_t, nbStripesPerBlock) > initStart);
    XXH_ASSERT(statePtr != NULL);
    /* set members from bufferedSize to nbStripesPerBlock (excluded) to 0 */
    memset((char*)statePtr + initStart, 0, initLength);
    statePtr->acc[0] = XXH_PRIME32_3;
    statePtr->acc[1] = XXH_PRIME64_1;
    statePtr->acc[2] = XXH_PRIME64_2;
    statePtr->acc[3] = XXH_PRIME64_3;
    statePtr->acc[4] = XXH_PRIME64_4;
    statePtr->acc[5] = XXH_PRIME32_2;
    statePtr->acc[6] = XXH_PRIME64_5;
    statePtr->acc[7] = XXH_PRIME32_1;
    statePtr->seed = seed;
    statePtr->useSeed = (seed != 0);
    statePtr->extSecret = (const unsigned char*)secret;
    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
    statePtr->secretLimit = secretSize - XXH_STRIPE_LEN;
    statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE;
}

/*! @ingroup XXH3_family */
XXH_PUBLIC_API XXH_errorcode
XXH3_64bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr)
{
    if (statePtr == NULL) return XXH_ERROR;
    XXH3_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
    return XXH_OK;
}

/*! @ingroup XXH3_family */
XXH_PUBLIC_API XXH_errorcode
XXH3_64bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize)
{
    if (statePtr == NULL) return XXH_ERROR;
    XXH3_reset_internal(statePtr, 0, secret, secretSize);
    if (secret == NULL) return XXH_ERROR;
    if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
    return XXH_OK;
}

/*! @ingroup XXH3_family */
XXH_PUBLIC_API XXH_errorcode
XXH3_64bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed)
{
    if (statePtr == NULL) return XXH_ERROR;
    if (seed==0) return XXH3_64bits_reset(statePtr);
    if ((seed != statePtr->seed) || (statePtr->extSecret != NULL))
        XXH3_initCustomSecret(statePtr->customSecret, seed);
    XXH3_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE);
    return XXH_OK;
}

/*! @ingroup XXH3_family */
XXH_PUBLIC_API XXH_errorcode
XXH3_64bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed64)
{
    if (statePtr == NULL) return XXH_ERROR;
    if (secret == NULL) return XXH_ERROR;
    if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
    XXH3_reset_internal(statePtr, seed64, secret, secretSize);
    statePtr->useSeed = 1; /* always, even if seed64==0 */
    return XXH_OK;
}

/*!
 * @internal
 * @brief Processes a large input for XXH3_update() and XXH3_digest_long().
 *
 * Unlike XXH3_hashLong_internal_loop(), this can process data that overlaps a block.
 *
 * @param acc                Pointer to the 8 accumulator lanes
 * @param nbStripesSoFarPtr  In/out pointer to the number of leftover stripes in the block*
 * @param nbStripesPerBlock  Number of stripes in a block
 * @param input              Input pointer
 * @param nbStripes          Number of stripes to process
 * @param secret             Secret pointer
 * @param secretLimit        Offset of the last block in @p secret
 * @param f_acc              Pointer to an XXH3_accumulate implementation
 * @param f_scramble         Pointer to an XXH3_scrambleAcc implementation
 * @return                   Pointer past the end of @p input after processing
 */
XXH_FORCE_INLINE const xxh_u8 *
XXH3_consumeStripes(xxh_u64* XXH_RESTRICT acc,
                    size_t* XXH_RESTRICT nbStripesSoFarPtr, size_t nbStripesPerBlock,
                    const xxh_u8* XXH_RESTRICT input, size_t nbStripes,
                    const xxh_u8* XXH_RESTRICT secret, size_t secretLimit,
                    XXH3_f_accumulate f_acc,
                    XXH3_f_scrambleAcc f_scramble)
{
    const xxh_u8* initialSecret = secret + *nbStripesSoFarPtr * XXH_SECRET_CONSUME_RATE;
    /* Process full blocks */
    if (nbStripes >= (nbStripesPerBlock - *nbStripesSoFarPtr)) {
        /* Process the initial partial block... */
        size_t nbStripesThisIter = nbStripesPerBlock - *nbStripesSoFarPtr;

        do {
            /* Accumulate and scramble */
            f_acc(acc, input, initialSecret, nbStripesThisIter);
            f_scramble(acc, secret + secretLimit);
            input += nbStripesThisIter * XXH_STRIPE_LEN;
            nbStripes -= nbStripesThisIter;
            /* Then continue the loop with the full block size */
            nbStripesThisIter = nbStripesPerBlock;
            initialSecret = secret;
        } while (nbStripes >= nbStripesPerBlock);
        *nbStripesSoFarPtr = 0;
    }
    /* Process a partial block */
    if (nbStripes > 0) {
        f_acc(acc, input, initialSecret, nbStripes);
        input += nbStripes * XXH_STRIPE_LEN;
        *nbStripesSoFarPtr += nbStripes;
    }
    /* Return end pointer */
    return input;
}

#ifndef XXH3_STREAM_USE_STACK
# if XXH_SIZE_OPT <= 0 && !defined(__clang__) /* clang doesn't need additional stack space */
#   define XXH3_STREAM_USE_STACK 1
# endif
#endif
/*
 * Both XXH3_64bits_update and XXH3_128bits_update use this routine.
 */
XXH_FORCE_INLINE XXH_errorcode
XXH3_update(XXH3_state_t* XXH_RESTRICT const state,
            const xxh_u8* XXH_RESTRICT input, size_t len,
            XXH3_f_accumulate f_acc,
            XXH3_f_scrambleAcc f_scramble)
{
    if (input==NULL) {
        XXH_ASSERT(len == 0);
        return XXH_OK;
    }

    XXH_ASSERT(state != NULL);
    {   const xxh_u8* const bEnd = input + len;
        const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
#if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1
        /* For some reason, gcc and MSVC seem to suffer greatly
         * when operating accumulators directly into state.
         * Operating into stack space seems to enable proper optimization.
         * clang, on the other hand, doesn't seem to need this trick */
        XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[8];
        XXH_memcpy(acc, state->acc, sizeof(acc));
#else
        xxh_u64* XXH_RESTRICT const acc = state->acc;
#endif
        state->totalLen += len;
        XXH_ASSERT(state->bufferedSize <= XXH3_INTERNALBUFFER_SIZE);

        /* small input : just fill in tmp buffer */
        if (len <= XXH3_INTERNALBUFFER_SIZE - state->bufferedSize) {
            XXH_memcpy(state->buffer + state->bufferedSize, input, len);
            state->bufferedSize += (XXH32_hash_t)len;
            return XXH_OK;
        }

        /* total input is now > XXH3_INTERNALBUFFER_SIZE */
        #define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / XXH_STRIPE_LEN)
        XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % XXH_STRIPE_LEN == 0);   /* clean multiple */

        /*
         * Internal buffer is partially filled (always, except at beginning)
         * Complete it, then consume it.
         */
        if (state->bufferedSize) {
            size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize;
            XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize);
            input += loadSize;
            XXH3_consumeStripes(acc,
                               &state->nbStripesSoFar, state->nbStripesPerBlock,
                                state->buffer, XXH3_INTERNALBUFFER_STRIPES,
                                secret, state->secretLimit,
                                f_acc, f_scramble);
            state->bufferedSize = 0;
        }
        XXH_ASSERT(input < bEnd);
        if (bEnd - input > XXH3_INTERNALBUFFER_SIZE) {
            size_t nbStripes = (size_t)(bEnd - 1 - input) / XXH_STRIPE_LEN;
            input = XXH3_consumeStripes(acc,
                                       &state->nbStripesSoFar, state->nbStripesPerBlock,
                                       input, nbStripes,
                                       secret, state->secretLimit,
                                       f_acc, f_scramble);
            XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);

        }
        /* Some remaining input (always) : buffer it */
        XXH_ASSERT(input < bEnd);
        XXH_ASSERT(bEnd - input <= XXH3_INTERNALBUFFER_SIZE);
        XXH_ASSERT(state->bufferedSize == 0);
        XXH_memcpy(state->buffer, input, (size_t)(bEnd-input));
        state->bufferedSize = (XXH32_hash_t)(bEnd-input);
#if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1
        /* save stack accumulators into state */
        XXH_memcpy(state->acc, acc, sizeof(acc));
#endif
    }

    return XXH_OK;
}

/*! @ingroup XXH3_family */
XXH_PUBLIC_API XXH_errorcode
XXH3_64bits_update(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len)
{
    return XXH3_update(state, (const xxh_u8*)input, len,
                       XXH3_accumulate, XXH3_scrambleAcc);
}


XXH_FORCE_INLINE void
XXH3_digest_long (XXH64_hash_t* acc,
                  const XXH3_state_t* state,
                  const unsigned char* secret)
{
    xxh_u8 lastStripe[XXH_STRIPE_LEN];
    const xxh_u8* lastStripePtr;

    /*
     * Digest on a local copy. This way, the state remains unaltered, and it can
     * continue ingesting more input afterwards.
     */
    XXH_memcpy(acc, state->acc, sizeof(state->acc));
    if (state->bufferedSize >= XXH_STRIPE_LEN) {
        /* Consume remaining stripes then point to remaining data in buffer */
        size_t const nbStripes = (state->bufferedSize - 1) / XXH_STRIPE_LEN;
        size_t nbStripesSoFar = state->nbStripesSoFar;
        XXH3_consumeStripes(acc,
                           &nbStripesSoFar, state->nbStripesPerBlock,
                            state->buffer, nbStripes,
                            secret, state->secretLimit,
                            XXH3_accumulate, XXH3_scrambleAcc);
        lastStripePtr = state->buffer + state->bufferedSize - XXH_STRIPE_LEN;
    } else {  /* bufferedSize < XXH_STRIPE_LEN */
        /* Copy to temp buffer */
        size_t const catchupSize = XXH_STRIPE_LEN - state->bufferedSize;
        XXH_ASSERT(state->bufferedSize > 0);  /* there is always some input buffered */
        XXH_memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize);
        XXH_memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize);
        lastStripePtr = lastStripe;
    }
    /* Last stripe */
    XXH3_accumulate_512(acc,
                        lastStripePtr,
                        secret + state->secretLimit - XXH_SECRET_LASTACC_START);
}

/*! @ingroup XXH3_family */
XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (XXH_NOESCAPE const XXH3_state_t* state)
{
    const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
    if (state->totalLen > XXH3_MIDSIZE_MAX) {
        XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB];
        XXH3_digest_long(acc, state, secret);
        return XXH3_finalizeLong_64b(acc, secret, (xxh_u64)state->totalLen);
    }
    /* totalLen <= XXH3_MIDSIZE_MAX: digesting a short input */
    if (state->useSeed)
        return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
    return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen),
                                  secret, state->secretLimit + XXH_STRIPE_LEN);
}
#endif /* !XXH_NO_STREAM */


/* ==========================================
 * XXH3 128 bits (a.k.a XXH128)
 * ==========================================
 * XXH3's 128-bit variant has better mixing and strength than the 64-bit variant,
 * even without counting the significantly larger output size.
 *
 * For example, extra steps are taken to avoid the seed-dependent collisions
 * in 17-240 byte inputs (See XXH3_mix16B and XXH128_mix32B).
 *
 * This strength naturally comes at the cost of some speed, especially on short
 * lengths. Note that longer hashes are about as fast as the 64-bit version
 * due to it using only a slight modification of the 64-bit loop.
 *
 * XXH128 is also more oriented towards 64-bit machines. It is still extremely
 * fast for a _128-bit_ hash on 32-bit (it usually clears XXH64).
 */

XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
XXH3_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
{
    /* A doubled version of 1to3_64b with different constants. */
    XXH_ASSERT(input != NULL);
    XXH_ASSERT(1 <= len && len <= 3);
    XXH_ASSERT(secret != NULL);
    /*
     * len = 1: combinedl = { input[0], 0x01, input[0], input[0] }
     * len = 2: combinedl = { input[1], 0x02, input[0], input[1] }
     * len = 3: combinedl = { input[2], 0x03, input[0], input[1] }
     */
    {   xxh_u8 const c1 = input[0];
        xxh_u8 const c2 = input[len >> 1];
        xxh_u8 const c3 = input[len - 1];
        xxh_u32 const combinedl = ((xxh_u32)c1 <<16) | ((xxh_u32)c2 << 24)
                                | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8);
        xxh_u32 const combinedh = XXH_rotl32(XXH_swap32(combinedl), 13);
        xxh_u64 const bitflipl = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed;
        xxh_u64 const bitfliph = (XXH_readLE32(secret+8) ^ XXH_readLE32(secret+12)) - seed;
        xxh_u64 const keyed_lo = (xxh_u64)combinedl ^ bitflipl;
        xxh_u64 const keyed_hi = (xxh_u64)combinedh ^ bitfliph;
        XXH128_hash_t h128;
        h128.low64  = XXH64_avalanche(keyed_lo);
        h128.high64 = XXH64_avalanche(keyed_hi);
        return h128;
    }
}

XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
{
    XXH_ASSERT(input != NULL);
    XXH_ASSERT(secret != NULL);
    XXH_ASSERT(4 <= len && len <= 8);
    seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32;
    {   xxh_u32 const input_lo = XXH_readLE32(input);
        xxh_u32 const input_hi = XXH_readLE32(input + len - 4);
        xxh_u64 const input_64 = input_lo + ((xxh_u64)input_hi << 32);
        xxh_u64 const bitflip = (XXH_readLE64(secret+16) ^ XXH_readLE64(secret+24)) + seed;
        xxh_u64 const keyed = input_64 ^ bitflip;

        /* Shift len to the left to ensure it is even, this avoids even multiplies. */
        XXH128_hash_t m128 = XXH_mult64to128(keyed, XXH_PRIME64_1 + (len << 2));

        m128.high64 += (m128.low64 << 1);
        m128.low64  ^= (m128.high64 >> 3);

        m128.low64   = XXH_xorshift64(m128.low64, 35);
        m128.low64  *= PRIME_MX2;
        m128.low64   = XXH_xorshift64(m128.low64, 28);
        m128.high64  = XXH3_avalanche(m128.high64);
        return m128;
    }
}

XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
{
    XXH_ASSERT(input != NULL);
    XXH_ASSERT(secret != NULL);
    XXH_ASSERT(9 <= len && len <= 16);
    {   xxh_u64 const bitflipl = (XXH_readLE64(secret+32) ^ XXH_readLE64(secret+40)) - seed;
        xxh_u64 const bitfliph = (XXH_readLE64(secret+48) ^ XXH_readLE64(secret+56)) + seed;
        xxh_u64 const input_lo = XXH_readLE64(input);
        xxh_u64       input_hi = XXH_readLE64(input + len - 8);
        XXH128_hash_t m128 = XXH_mult64to128(input_lo ^ input_hi ^ bitflipl, XXH_PRIME64_1);
        /*
         * Put len in the middle of m128 to ensure that the length gets mixed to
         * both the low and high bits in the 128x64 multiply below.
         */
        m128.low64 += (xxh_u64)(len - 1) << 54;
        input_hi   ^= bitfliph;
        /*
         * Add the high 32 bits of input_hi to the high 32 bits of m128, then
         * add the long product of the low 32 bits of input_hi and XXH_PRIME32_2 to
         * the high 64 bits of m128.
         *
         * The best approach to this operation is different on 32-bit and 64-bit.
         */
        if (sizeof(void *) < sizeof(xxh_u64)) { /* 32-bit */
            /*
             * 32-bit optimized version, which is more readable.
             *
             * On 32-bit, it removes an ADC and delays a dependency between the two
             * halves of m128.high64, but it generates an extra mask on 64-bit.
             */
            m128.high64 += (input_hi & 0xFFFFFFFF00000000ULL) + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2);
        } else {
            /*
             * 64-bit optimized (albeit more confusing) version.
             *
             * Uses some properties of addition and multiplication to remove the mask:
             *
             * Let:
             *    a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF)
             *    b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000)
             *    c = XXH_PRIME32_2
             *
             *    a + (b * c)
             * Inverse Property: x + y - x == y
             *    a + (b * (1 + c - 1))
             * Distributive Property: x * (y + z) == (x * y) + (x * z)
             *    a + (b * 1) + (b * (c - 1))
             * Identity Property: x * 1 == x
             *    a + b + (b * (c - 1))
             *
             * Substitute a, b, and c:
             *    input_hi.hi + input_hi.lo + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1))
             *
             * Since input_hi.hi + input_hi.lo == input_hi, we get this:
             *    input_hi + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1))
             */
            m128.high64 += input_hi + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2 - 1);
        }
        /* m128 ^= XXH_swap64(m128 >> 64); */
        m128.low64  ^= XXH_swap64(m128.high64);

        {   /* 128x64 multiply: h128 = m128 * XXH_PRIME64_2; */
            XXH128_hash_t h128 = XXH_mult64to128(m128.low64, XXH_PRIME64_2);
            h128.high64 += m128.high64 * XXH_PRIME64_2;

            h128.low64   = XXH3_avalanche(h128.low64);
            h128.high64  = XXH3_avalanche(h128.high64);
            return h128;
    }   }
}

/*
 * Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN
 */
XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
XXH3_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
{
    XXH_ASSERT(len <= 16);
    {   if (len > 8) return XXH3_len_9to16_128b(input, len, secret, seed);
        if (len >= 4) return XXH3_len_4to8_128b(input, len, secret, seed);
        if (len) return XXH3_len_1to3_128b(input, len, secret, seed);
        {   XXH128_hash_t h128;
            xxh_u64 const bitflipl = XXH_readLE64(secret+64) ^ XXH_readLE64(secret+72);
            xxh_u64 const bitfliph = XXH_readLE64(secret+80) ^ XXH_readLE64(secret+88);
            h128.low64 = XXH64_avalanche(seed ^ bitflipl);
            h128.high64 = XXH64_avalanche( seed ^ bitfliph);
            return h128;
    }   }
}

/*
 * A bit slower than XXH3_mix16B, but handles multiply by zero better.
 */
XXH_FORCE_INLINE XXH128_hash_t
XXH128_mix32B(XXH128_hash_t acc, const xxh_u8* input_1, const xxh_u8* input_2,
              const xxh_u8* secret, XXH64_hash_t seed)
{
    acc.low64  += XXH3_mix16B (input_1, secret+0, seed);
    acc.low64  ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8);
    acc.high64 += XXH3_mix16B (input_2, secret+16, seed);
    acc.high64 ^= XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8);
    return acc;
}


XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
                      const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
                      XXH64_hash_t seed)
{
    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
    XXH_ASSERT(16 < len && len <= 128);

    {   XXH128_hash_t acc;
        acc.low64 = len * XXH_PRIME64_1;
        acc.high64 = 0;

#if XXH_SIZE_OPT >= 1
        {
            /* Smaller, but slightly slower. */
            unsigned int i = (unsigned int)(len - 1) / 32;
            do {
                acc = XXH128_mix32B(acc, input+16*i, input+len-16*(i+1), secret+32*i, seed);
            } while (i-- != 0);
        }
#else
        if (len > 32) {
            if (len > 64) {
                if (len > 96) {
                    acc = XXH128_mix32B(acc, input+48, input+len-64, secret+96, seed);
                }
                acc = XXH128_mix32B(acc, input+32, input+len-48, secret+64, seed);
            }
            acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed);
        }
        acc = XXH128_mix32B(acc, input, input+len-16, secret, seed);
#endif
        {   XXH128_hash_t h128;
            h128.low64  = acc.low64 + acc.high64;
            h128.high64 = (acc.low64    * XXH_PRIME64_1)
                        + (acc.high64   * XXH_PRIME64_4)
                        + ((len - seed) * XXH_PRIME64_2);
            h128.low64  = XXH3_avalanche(h128.low64);
            h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64);
            return h128;
        }
    }
}

XXH_NO_INLINE XXH_PUREF XXH128_hash_t
XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
                       const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
                       XXH64_hash_t seed)
{
    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
    XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);

    {   XXH128_hash_t acc;
        unsigned i;
        acc.low64 = len * XXH_PRIME64_1;
        acc.high64 = 0;
        /*
         *  We set as `i` as offset + 32. We do this so that unchanged
         * `len` can be used as upper bound. This reaches a sweet spot
         * where both x86 and aarch64 get simple agen and good codegen
         * for the loop.
         */
        for (i = 32; i < 160; i += 32) {
            acc = XXH128_mix32B(acc,
                                input  + i - 32,
                                input  + i - 16,
                                secret + i - 32,
                                seed);
        }
        acc.low64 = XXH3_avalanche(acc.low64);
        acc.high64 = XXH3_avalanche(acc.high64);
        /*
         * NB: `i <= len` will duplicate the last 32-bytes if
         * len % 32 was zero. This is an unfortunate necessity to keep
         * the hash result stable.
         */
        for (i=160; i <= len; i += 32) {
            acc = XXH128_mix32B(acc,
                                input + i - 32,
                                input + i - 16,
                                secret + XXH3_MIDSIZE_STARTOFFSET + i - 160,
                                seed);
        }
        /* last bytes */
        acc = XXH128_mix32B(acc,
                            input + len - 16,
                            input + len - 32,
                            secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16,
                            (XXH64_hash_t)0 - seed);

        {   XXH128_hash_t h128;
            h128.low64  = acc.low64 + acc.high64;
            h128.high64 = (acc.low64    * XXH_PRIME64_1)
                        + (acc.high64   * XXH_PRIME64_4)
                        + ((len - seed) * XXH_PRIME64_2);
            h128.low64  = XXH3_avalanche(h128.low64);
            h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64);
            return h128;
        }
    }
}

static XXH_PUREF XXH128_hash_t
XXH3_finalizeLong_128b(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, size_t secretSize, xxh_u64 len)
{
    XXH128_hash_t h128;
    h128.low64 = XXH3_finalizeLong_64b(acc, secret, len);
    h128.high64 = XXH3_mergeAccs(acc, secret + secretSize
                                             - XXH_STRIPE_LEN - XXH_SECRET_MERGEACCS_START,
                                             ~(len * XXH_PRIME64_2));
    return h128;
}

XXH_FORCE_INLINE XXH128_hash_t
XXH3_hashLong_128b_internal(const void* XXH_RESTRICT input, size_t len,
                            const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
                            XXH3_f_accumulate f_acc,
                            XXH3_f_scrambleAcc f_scramble)
{
    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;

    XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, secret, secretSize, f_acc, f_scramble);

    /* converge into final hash */
    XXH_STATIC_ASSERT(sizeof(acc) == 64);
    XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
    return XXH3_finalizeLong_128b(acc, secret, secretSize, (xxh_u64)len);
}

/*
 * It's important for performance that XXH3_hashLong() is not inlined.
 */
XXH_NO_INLINE XXH_PUREF XXH128_hash_t
XXH3_hashLong_128b_default(const void* XXH_RESTRICT input, size_t len,
                           XXH64_hash_t seed64,
                           const void* XXH_RESTRICT secret, size_t secretLen)
{
    (void)seed64; (void)secret; (void)secretLen;
    return XXH3_hashLong_128b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret),
                                       XXH3_accumulate, XXH3_scrambleAcc);
}

/*
 * It's important for performance to pass @p secretLen (when it's static)
 * to the compiler, so that it can properly optimize the vectorized loop.
 *
 * When the secret size is unknown, or on GCC 12 where the mix of NO_INLINE and FORCE_INLINE
 * breaks -Og, this is XXH_NO_INLINE.
 */
XXH3_WITH_SECRET_INLINE XXH128_hash_t
XXH3_hashLong_128b_withSecret(const void* XXH_RESTRICT input, size_t len,
                              XXH64_hash_t seed64,
                              const void* XXH_RESTRICT secret, size_t secretLen)
{
    (void)seed64;
    return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, secretLen,
                                       XXH3_accumulate, XXH3_scrambleAcc);
}

XXH_FORCE_INLINE XXH128_hash_t
XXH3_hashLong_128b_withSeed_internal(const void* XXH_RESTRICT input, size_t len,
                                XXH64_hash_t seed64,
                                XXH3_f_accumulate f_acc,
                                XXH3_f_scrambleAcc f_scramble,
                                XXH3_f_initCustomSecret f_initSec)
{
    if (seed64 == 0)
        return XXH3_hashLong_128b_internal(input, len,
                                           XXH3_kSecret, sizeof(XXH3_kSecret),
                                           f_acc, f_scramble);
    {   XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
        f_initSec(secret, seed64);
        return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, sizeof(secret),
                                           f_acc, f_scramble);
    }
}

/*
 * It's important for performance that XXH3_hashLong is not inlined.
 */
XXH_NO_INLINE XXH128_hash_t
XXH3_hashLong_128b_withSeed(const void* input, size_t len,
                            XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen)
{
    (void)secret; (void)secretLen;
    return XXH3_hashLong_128b_withSeed_internal(input, len, seed64,
                XXH3_accumulate, XXH3_scrambleAcc, XXH3_initCustomSecret);
}

typedef XXH128_hash_t (*XXH3_hashLong128_f)(const void* XXH_RESTRICT, size_t,
                                            XXH64_hash_t, const void* XXH_RESTRICT, size_t);

XXH_FORCE_INLINE XXH128_hash_t
XXH3_128bits_internal(const void* input, size_t len,
                      XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen,
                      XXH3_hashLong128_f f_hl128)
{
    XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN);
    /*
     * If an action is to be taken if `secret` conditions are not respected,
     * it should be done here.
     * For now, it's a contract pre-condition.
     * Adding a check and a branch here would cost performance at every hash.
     */
    if (len <= 16)
        return XXH3_len_0to16_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64);
    if (len <= 128)
        return XXH3_len_17to128_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);
    if (len <= XXH3_MIDSIZE_MAX)
        return XXH3_len_129to240_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);
    return f_hl128(input, len, seed64, secret, secretLen);
}


/* ===   Public XXH128 API   === */

/*! @ingroup XXH3_family */
XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(XXH_NOESCAPE const void* input, size_t len)
{
    return XXH3_128bits_internal(input, len, 0,
                                 XXH3_kSecret, sizeof(XXH3_kSecret),
                                 XXH3_hashLong_128b_default);
}

/*! @ingroup XXH3_family */
XXH_PUBLIC_API XXH128_hash_t
XXH3_128bits_withSecret(XXH_NOESCAPE const void* input, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize)
{
    return XXH3_128bits_internal(input, len, 0,
                                 (const xxh_u8*)secret, secretSize,
                                 XXH3_hashLong_128b_withSecret);
}

/*! @ingroup XXH3_family */
XXH_PUBLIC_API XXH128_hash_t
XXH3_128bits_withSeed(XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed)
{
    return XXH3_128bits_internal(input, len, seed,
                                 XXH3_kSecret, sizeof(XXH3_kSecret),
                                 XXH3_hashLong_128b_withSeed);
}

/*! @ingroup XXH3_family */
XXH_PUBLIC_API XXH128_hash_t
XXH3_128bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed)
{
    if (len <= XXH3_MIDSIZE_MAX)
        return XXH3_128bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL);
    return XXH3_hashLong_128b_withSecret(input, len, seed, secret, secretSize);
}

/*! @ingroup XXH3_family */
XXH_PUBLIC_API XXH128_hash_t
XXH128(XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed)
{
    return XXH3_128bits_withSeed(input, len, seed);
}


/* ===   XXH3 128-bit streaming   === */
#ifndef XXH_NO_STREAM
/*
 * All initialization and update functions are identical to 64-bit streaming variant.
 * The only difference is the finalization routine.
 */

/*! @ingroup XXH3_family */
XXH_PUBLIC_API XXH_errorcode
XXH3_128bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr)
{
    return XXH3_64bits_reset(statePtr);
}

/*! @ingroup XXH3_family */
XXH_PUBLIC_API XXH_errorcode
XXH3_128bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize)
{
    return XXH3_64bits_reset_withSecret(statePtr, secret, secretSize);
}

/*! @ingroup XXH3_family */
XXH_PUBLIC_API XXH_errorcode
XXH3_128bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed)
{
    return XXH3_64bits_reset_withSeed(statePtr, seed);
}

/*! @ingroup XXH3_family */
XXH_PUBLIC_API XXH_errorcode
XXH3_128bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed)
{
    return XXH3_64bits_reset_withSecretandSeed(statePtr, secret, secretSize, seed);
}

/*! @ingroup XXH3_family */
XXH_PUBLIC_API XXH_errorcode
XXH3_128bits_update(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len)
{
    return XXH3_64bits_update(state, input, len);
}

/*! @ingroup XXH3_family */
XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (XXH_NOESCAPE const XXH3_state_t* state)
{
    const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
    if (state->totalLen > XXH3_MIDSIZE_MAX) {
        XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB];
        XXH3_digest_long(acc, state, secret);
        XXH_ASSERT(state->secretLimit + XXH_STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
        return XXH3_finalizeLong_128b(acc, secret, state->secretLimit + XXH_STRIPE_LEN,  (xxh_u64)state->totalLen);
    }
    /* len <= XXH3_MIDSIZE_MAX : short code */
    if (state->useSeed)
        return XXH3_128bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
    return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen),
                                   secret, state->secretLimit + XXH_STRIPE_LEN);
}
#endif /* !XXH_NO_STREAM */
/* 128-bit utility functions */

#include <string.h>   /* memcmp, memcpy */

/* return : 1 is equal, 0 if different */
/*! @ingroup XXH3_family */
XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2)
{
    /* note : XXH128_hash_t is compact, it has no padding byte */
    return !(memcmp(&h1, &h2, sizeof(h1)));
}

/* This prototype is compatible with stdlib's qsort().
 * @return : >0 if *h128_1  > *h128_2
 *           <0 if *h128_1  < *h128_2
 *           =0 if *h128_1 == *h128_2  */
/*! @ingroup XXH3_family */
XXH_PUBLIC_API int XXH128_cmp(XXH_NOESCAPE const void* h128_1, XXH_NOESCAPE const void* h128_2)
{
    XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1;
    XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2;
    int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64);
    /* note : bets that, in most cases, hash values are different */
    if (hcmp) return hcmp;
    return (h1.low64 > h2.low64) - (h2.low64 > h1.low64);
}


/*======   Canonical representation   ======*/
/*! @ingroup XXH3_family */
XXH_PUBLIC_API void
XXH128_canonicalFromHash(XXH_NOESCAPE XXH128_canonical_t* dst, XXH128_hash_t hash)
{
    XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t));
    if (XXH_CPU_LITTLE_ENDIAN) {
        hash.high64 = XXH_swap64(hash.high64);
        hash.low64  = XXH_swap64(hash.low64);
    }
    XXH_memcpy(dst, &hash.high64, sizeof(hash.high64));
    XXH_memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64));
}

/*! @ingroup XXH3_family */
XXH_PUBLIC_API XXH128_hash_t
XXH128_hashFromCanonical(XXH_NOESCAPE const XXH128_canonical_t* src)
{
    XXH128_hash_t h;
    h.high64 = XXH_readBE64(src);
    h.low64  = XXH_readBE64(src->digest + 8);
    return h;
}


/* ==========================================
 * Secret generators
 * ==========================================
 */
#define XXH_MIN(x, y) (((x) > (y)) ? (y) : (x))

XXH_FORCE_INLINE void XXH3_combine16(void* dst, XXH128_hash_t h128)
{
    XXH_writeLE64( dst, XXH_readLE64(dst) ^ h128.low64 );
    XXH_writeLE64( (char*)dst+8, XXH_readLE64((char*)dst+8) ^ h128.high64 );
}

/*! @ingroup XXH3_family */
XXH_PUBLIC_API XXH_errorcode
XXH3_generateSecret(XXH_NOESCAPE void* secretBuffer, size_t secretSize, XXH_NOESCAPE const void* customSeed, size_t customSeedSize)
{
#if (XXH_DEBUGLEVEL >= 1)
    XXH_ASSERT(secretBuffer != NULL);
    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
#else
    /* production mode, assert() are disabled */
    if (secretBuffer == NULL) return XXH_ERROR;
    if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
#endif

    if (customSeedSize == 0) {
        customSeed = XXH3_kSecret;
        customSeedSize = XXH_SECRET_DEFAULT_SIZE;
    }
#if (XXH_DEBUGLEVEL >= 1)
    XXH_ASSERT(customSeed != NULL);
#else
    if (customSeed == NULL) return XXH_ERROR;
#endif

    /* Fill secretBuffer with a copy of customSeed - repeat as needed */
    {   size_t pos = 0;
        while (pos < secretSize) {
            size_t const toCopy = XXH_MIN((secretSize - pos), customSeedSize);
            memcpy((char*)secretBuffer + pos, customSeed, toCopy);
            pos += toCopy;
    }   }

    {   size_t const nbSeg16 = secretSize / 16;
        size_t n;
        XXH128_canonical_t scrambler;
        XXH128_canonicalFromHash(&scrambler, XXH128(customSeed, customSeedSize, 0));
        for (n=0; n<nbSeg16; n++) {
            XXH128_hash_t const h128 = XXH128(&scrambler, sizeof(scrambler), n);
            XXH3_combine16((char*)secretBuffer + n*16, h128);
        }
        /* last segment */
        XXH3_combine16((char*)secretBuffer + secretSize - 16, XXH128_hashFromCanonical(&scrambler));
    }
    return XXH_OK;
}

/*! @ingroup XXH3_family */
XXH_PUBLIC_API void
XXH3_generateSecret_fromSeed(XXH_NOESCAPE void* secretBuffer, XXH64_hash_t seed)
{
    XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
    XXH3_initCustomSecret(secret, seed);
    XXH_ASSERT(secretBuffer != NULL);
    memcpy(secretBuffer, secret, XXH_SECRET_DEFAULT_SIZE);
}


/* Pop our optimization override from above */
#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \
  && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
  && defined(__OPTIMIZE__) && XXH_SIZE_OPT <= 0 /* respect -O0 and -Os */
#  pragma GCC pop_options
#endif

#endif  /* XXH_NO_LONG_LONG */

#endif  /* XXH_NO_XXH3 */

/*!
 * @}
 */
#endif  /* XXH_IMPLEMENTATION */


#if defined (__cplusplus)
} /* extern "C" */
#endif


================================================
FILE: mum-prng.h
================================================
/* Copyright (c) 2016, 2017, 2018
   Vladimir Makarov <vmakarov@gcc.gnu.org>

   Permission is hereby granted, free of charge, to any person
   obtaining a copy of this software and associated documentation
   files (the "Software"), to deal in the Software without
   restriction, including without limitation the rights to use, copy,
   modify, merge, publish, distribute, sublicense, and/or sell copies
   of the Software, and to permit persons to whom the Software is
   furnished to do so, subject to the following conditions:

   The above copyright notice and this permission notice shall be
   included in all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
   SOFTWARE.
*/

/* Pseudo Random Number Generator (PRNG) based on MUM hash function.
   It is not a crypto level PRNG.

   To use a generator call `init_mum_prng` first, then call
   `get_mum_prn` as much as you want to get a new PRN.  At the end of
   the PRNG use, call `finish_mum_prng`.  You can change the default
   seed by calling set_mum_seed.

   The PRNG passes NIST Statistical Test Suite for Random and
   Pseudorandom Number Generators for Cryptographic Applications
   (version 2.2.1) with 1000 bitstreams each containing 1M bits.

   The generation of a new number takes about 3.5 CPU cycles on x86_64
   (Intel 4.2GHz i7-4790K), or speed of the generation is about 1120M
   numbers per sec.  So it is very fast.  */

#ifndef __MUM_PRNG__
#define __MUM_PRNG__

#include "mum.h"

#ifndef MUM_PRNG_UNROLL
#define MUM_PRNG_UNROLL 16
#endif

#if MUM_PRNG_UNROLL < 1 || MUM_PRNG_UNROLL > 16
#error "wrong MUM_PRNG_UNROLL value"
#endif

#ifdef __GNUC__
#define EXPECT(cond, v) __builtin_expect((cond), (v))
#else
#define EXPECT(cond, v) (cond)
#endif

#if defined(__GNUC__) && ((__GNUC__ == 4) &&  (__GNUC_MINOR__ >= 9) || (__GNUC__ > 4))
#define _MUM_PRNG_FRESH_GCC
#endif

static struct {
  int count; 
  void (*update_func) (void);
  /* MUM PRNG state */
  uint64_t state[MUM_PRNG_UNROLL]; 
} _mum_prng_state;

#if defined(__x86_64__) && defined(_MUM_PRNG_FRESH_GCC)
/* This code specialized for Haswell generates MULX insns. */
static inline uint64_t _MUM_TARGET("arch=haswell")
_mum_avx2 (uint64_t v, uint64_t p) {
  uint64_t hi, lo;
  __uint128_t r = (__uint128_t) v * (__uint128_t) p;
  hi = (uint64_t) (r >> 64);
  lo = (uint64_t) r;
  return hi + lo;
}

static void _MUM_TARGET("arch=haswell")
_mum_prng_update_avx2 (void) {
  int i;

  _mum_prng_state.count = 0;
  for (i = 0; i < MUM_PRNG_UNROLL - 1; i++)
    _mum_prng_state.state[i] ^= _mum_avx2 (_mum_prng_state.state[i + 1], _mum_primes[i]);
  _mum_prng_state.state[MUM_PRNG_UNROLL - 1] ^= _mum_avx2 (_mum_prng_state.state[0], _mum_primes[MUM_PRNG_UNROLL - 1]);
}
#endif

static void __attribute__ ((noinline))
_mum_prng_update (void) {
  int i;

  _mum_prng_state.count = 0;
  for (i = 0; i < MUM_PRNG_UNROLL - 1; i++)
    _mum_prng_state.state[i] ^= _mum (_mum_prng_state.state[i + 1], _mum_primes[i]);
  _mum_prng_state.state[MUM_PRNG_UNROLL - 1] ^= _mum (_mum_prng_state.state[0], _mum_primes[MUM_PRNG_UNROLL - 1]);
}

#if defined(__x86_64__) && defined(_MUM_PRNG_FRESH_GCC)
static inline void
_mum_prng_setup_avx2 (void) {
  __builtin_cpu_init ();
  if (__builtin_cpu_supports ("avx2"))
    _mum_prng_state.update_func = _mum_prng_update_avx2;
  else
    _mum_prng_state.update_func = _mum_prng_update;

}
#endif

static inline void
_start_mum_prng (uint32_t seed) {
  int i;

  _mum_prng_state.count = MUM_PRNG_UNROLL;
  for (i = 0; i < MUM_PRNG_UNROLL; i++)
    _mum_prng_state.state[i] = seed + 1;
#if defined(__x86_64__) && defined(_MUM_PRNG_FRESH_GCC)
  _mum_prng_setup_avx2 ();
#else
  _mum_prng_state.update_func = _mum_prng_update;
#endif
}

static inline void
init_mum_prng (void) {
  _start_mum_prng (0);
}

static inline void
set_mum_prng_seed (uint32_t seed) {
  _start_mum_prng (seed);
}

static inline uint64_t
get_mum_prn (void) {
  if (EXPECT (_mum_prng_state.count == MUM_PRNG_UNROLL, 0)) {
    _mum_prng_state.update_func ();
    _mum_prng_state.count = 1;
    return _mum_prng_state.state[0];
  }
  return _mum_prng_state.state[_mum_prng_state.count++];
}

static inline void
finish_mum_prng (void) {
}

#endif


================================================
FILE: mum.h
================================================
/* Copyright (c) 2016-2025
   Vladimir Makarov <vmakarov@gcc.gnu.org>

   Permission is hereby granted, free of charge, to any person
   obtaining a copy of this software and associated documentation
   files (the "Software"), to deal in the Software without
   restriction, including without limitation the rights to use, copy,
   modify, merge, publish, distribute, sublicense, and/or sell copies
   of the Software, and to permit persons to whom the Software is
   furnished to do so, subject to the following conditions:

   The above copyright notice and this permission notice shall be
   included in all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
   SOFTWARE.
*/

/* This file implements MUM (MUltiply and Mix) hashing. We randomize input data by 64x64-bit
   multiplication and mixing hi- and low-parts of the multiplication result by using an addition and
   then mix it into the current state. We use prime numbers randomly generated with the equal
   probability of their bit values for the multiplication. When all primes are used once, the state
   is randomized and the same prime numbers are used again for data randomization.

   The MUM hashing passes all SMHasher tests. Pseudo Random Number Generator based on MUM also
   passes NIST Statistical Test Suite for Random and Pseudorandom Number Generators for
   Cryptographic Applications (version 2.2.1) with 1000 bitstreams each containing 1M bits. MUM
   hashing is also faster Spooky64 and City64 on small strings (at least upto 512-bit) on Haswell
   and Power7. The MUM bulk speed (speed on very long data) is bigger than Spooky and City on
   Power7. On Haswell the bulk speed is bigger than Spooky one and close to City speed. */

#ifndef __MUM_HASH__
#define __MUM_HASH__

#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>

#ifdef _MSC_VER
typedef unsigned __int16 uint16_t;
typedef unsigned __int32 uint32_t;
typedef unsigned __int64 uint64_t;
#else
#include <stdint.h>
#endif

#ifdef __GNUC__
#define _MUM_ATTRIBUTE_UNUSED __attribute__ ((unused))
#define _MUM_INLINE inline __attribute__ ((always_inline))
#else
#define _MUM_ATTRIBUTE_UNUSED
#define _MUM_INLINE inline
#endif

#if defined(MUM_QUALITY) && !defined(MUM_TARGET_INDEPENDENT_HASH)
#define MUM_TARGET_INDEPENDENT_HASH
#endif

/* Macro saying to use 128-bit integers implemented by GCC for some targets. */
#ifndef _MUM_USE_INT128
/* In GCC uint128_t is defined if HOST_BITS_PER_WIDE_INT >= 64. HOST_WIDE_INT is long if
   HOST_BITS_PER_LONG > HOST_BITS_PER_INT, otherwise int. */
#if defined(__GNUC__) && UINT_MAX != ULONG_MAX
#define _MUM_USE_INT128 1
#else
#define _MUM_USE_INT128 0
#endif
#endif

/* Here are different primes randomly generated with the equal probability of their bit values. They
   are used to randomize input values. */
static uint64_t _mum_hash_step_prime = 0x2e0bb864e9ea7df5ULL;
static uint64_t _mum_key_step_prime = 0xcdb32970830fcaa1ULL;
static uint64_t _mum_block_start_prime = 0xc42b5e2e6480b23bULL;
static uint64_t _mum_unroll_prime = 0x7b51ec3d22f7096fULL;
static uint64_t _mum_tail_prime = 0xaf47d47c99b1461bULL;
static uint64_t _mum_finish_prime1 = 0xa9a7ae7ceff79f3fULL;
static uint64_t _mum_finish_prime2 = 0xaf47d47c99b1461bULL;

static uint64_t _mum_primes[] = {
  0X9ebdcae10d981691, 0X32b9b9b97a27ac7d, 0X29b5584d83d35bbd, 0X4b04e0e61401255f,
  0X25e8f7b1f1c9d027, 0X80d4c8c000f3e881, 0Xbd1255431904b9dd, 0X8a3bd4485eee6d81,
  0X3bc721b2aad05197, 0X71b1a19b907d6e33, 0X525e6c1084a8534b, 0X9e4c2cd340c1299f,
  0Xde3add92e94caa37, 0X7e14eadb1f65311d, 0X3f5aa40f89812853, 0X33b15a3b587d15c9,
};

/* Multiply 64-bit V and P and return sum of high and low parts of the result. */
static _MUM_INLINE uint64_t _mum (uint64_t v, uint64_t p) {
  uint64_t hi, lo;
#if _MUM_USE_INT128
  __uint128_t r = (__uint128_t) v * (__uint128_t) p;
  hi = (uint64_t) (r >> 64);
  lo = (uint64_t) r;
#else
  /* Implementation of 64x64->128-bit multiplication by four 32x32->64 bit multiplication. */
  uint64_t hv = v >> 32, hp = p >> 32;
  uint64_t lv = (uint32_t) v, lp = (uint32_t) p;
  uint64_t rh = hv * hp;
  uint64_t rm_0 = hv * lp;
  uint64_t rm_1 = hp * lv;
  uint64_t rl = lv * lp;
  uint64_t t, carry = 0;

  /* We could ignore a carry bit here if we did not care about the same hash for 32-bit and 64-bit
     targets. */
  t = rl + (rm_0 << 32);
#ifdef MUM_TARGET_INDEPENDENT_HASH
  carry = t < rl;
#endif
  lo = t + (rm_1 << 32);
#ifdef MUM_TARGET_INDEPENDENT_HASH
  carry += lo < t;
#endif
  hi = rh + (rm_0 >> 32) + (rm_1 >> 32) + carry;
#endif
  /* We could use XOR here too but, for some reasons, on Haswell and Power7 using an addition
     improves hashing performance by 10% for small strings. */
  return hi + lo;
}

#if defined(_MSC_VER)
#define _mum_bswap32(x) _byteswap_uint32_t (x)
#define _mum_bswap64(x) _byteswap_uint64_t (x)
#elif defined(__APPLE__)
#include <libkern/OSByteOrder.h>
#define _mum_bswap32(x) OSSwapInt32 (x)
#define _mum_bswap64(x) OSSwapInt64 (x)
#elif defined(__GNUC__)
#define _mum_bswap32(x) __builtin_bswap32 (x)
#define _mum_bswap64(x) __builtin_bswap64 (x)
#else
#include <byteswap.h>
#define _mum_bswap32(x) bswap32 (x)
#define _mum_bswap64(x) bswap64 (x)
#endif

static _MUM_INLINE uint64_t _mum_le (uint64_t v) {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || !defined(MUM_TARGET_INDEPENDENT_HASH)
  return v;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  return _mum_bswap64 (v);
#else
#error "Unknown endianess"
#endif
}

static _MUM_INLINE uint32_t _mum_le32 (uint32_t v) {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || !defined(MUM_TARGET_INDEPENDENT_HASH)
  return v;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  return _mum_bswap32 (v);
#else
#error "Unknown endianess"
#endif
}

static _MUM_INLINE uint64_t _mum_le16 (uint16_t v) {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || !defined(MUM_TARGET_INDEPENDENT_HASH)
  return v;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  return (v >> 8) | ((v & 0xff) << 8);
#else
#error "Unknown endianess"
#endif
}

/* Macro defining how many times the most nested loop in _mum_hash_aligned will be unrolled by the
   compiler (although it can make an own decision:). Use only a constant here to help a compiler to
   unroll a major loop.

   The macro value affects the result hash for strings > 128 bit. The unroll factor greatly affects
   the hashing speed. We prefer the speed. */
#ifndef _MUM_UNROLL_FACTOR_POWER
#if defined(__PPC64__) && !defined(MUM_TARGET_INDEPENDENT_HASH)
#define _MUM_UNROLL_FACTOR_POWER 3
#elif defined(__aarch64__) && !defined(MUM_TARGET_INDEPENDENT_HASH)
#define _MUM_UNROLL_FACTOR_POWER 4
#elif defined(MUM_V1) || defined(MUM_V2)
#define _MUM_UNROLL_FACTOR_POWER 2
#else
#define _MUM_UNROLL_FACTOR_POWER 3
#endif
#endif

#if _MUM_UNROLL_FACTOR_POWER < 1
#error "too small unroll factor"
#elif _MUM_UNROLL_FACTOR_POWER > 4
#error "We have not enough primes for such unroll factor"
#endif

#define _MUM_UNROLL_FACTOR (1 << _MUM_UNROLL_FACTOR_POWER)

/* Rotate V left by SH. */
static _MUM_INLINE uint64_t _mum_rotl (uint64_t v, int sh) { return v << sh | v >> (64 - sh); }

static _MUM_INLINE uint64_t _mum_xor (uint64_t a, uint64_t b) {
#ifdef MUM_V3
  return a ^ b;
#else
  return (a ^ b) != 0 ? a ^ b : b;
#endif
}

#if defined(MUM_V1) || defined(MUM_V2) || !defined(MUM_QUALITY)
#define _MUM_TAIL_START(v) 0
#else
#define _MUM_TAIL_START(v) v
#endif
static _MUM_INLINE uint64_t
#if defined(__GNUC__) && !defined(__clang__)
  __attribute__ ((__optimize__ ("unroll-loops")))
#endif
  _mum_hash_aligned (uint64_t start, const void *key, size_t len) {
  uint64_t result = start;
  const unsigned char *str = (const unsigned char *) key;
  uint64_t u64;
  size_t i;
  size_t n;

#ifndef MUM_V2
  result = _mum (result, _mum_block_start_prime);
#endif
  while (len > _MUM_UNROLL_FACTOR * sizeof (uint64_t)) {
    /* This loop could be vectorized when we have vector insns for 64x64->128-bit multiplication.
       AVX2 currently only have vector insns for 4 32x32->64-bit multiplication and for 1
       64x64->128-bit multiplication (pclmulqdq). */
#if defined(MUM_V1) || defined(MUM_V2)
    for (i = 0; i < _MUM_UNROLL_FACTOR; i++)
      result ^= _mum (_mum_le (((uint64_t *) str)[i]), _mum_primes[i]);
#else
    for (i = 0; i < _MUM_UNROLL_FACTOR; i += 2)
      result ^= _mum (_mum_xor (_mum_le (((uint64_t *) str)[i]), _mum_primes[i]),
                      _mum_xor (_mum_le (((uint64_t *) str)[i + 1]), _mum_primes[i + 1]));
#endif
    len -= _MUM_UNROLL_FACTOR * sizeof (uint64_t);
    str += _MUM_UNROLL_FACTOR * sizeof (uint64_t);
    /* We will use the same prime numbers on the next iterations -- randomize the state. */
    result = _mum (result, _mum_unroll_prime);
  }
  n = len / sizeof (uint64_t);
#if defined(MUM_V1) || defined(MUM_V2) || !defined(MUM_QUALITY)
  for (i = 0; i < n; i++) result ^= _mum (_mum_le (((uint64_t *) str)[i]), _mum_primes[i]);
#else
  for (i = 0; i < n; i++)
    result ^= _mum (_mum_le (((uint64_t *) str)[i]) + _mum_primes[i], _mum_primes[i]);
#endif
  len -= n * sizeof (uint64_t);
  str += n * sizeof (uint64_t);
  switch (len) {
  case 7:
    u64 = _MUM_TAIL_START (_mum_primes[0]) + _mum_le32 (*(uint32_t *) str);
    u64 += _mum_le16 (*(uint16_t *) (str + 4)) << 32;
    u64 += (uint64_t) str[6] << 48;
    return result ^ _mum (u64, _mum_tail_prime);
  case 6:
    u64 = _MUM_TAIL_START (_mum_primes[1]) + _mum_le32 (*(uint32_t *) str);
    u64 += _mum_le16 (*(uint16_t *) (str + 4)) << 32;
    return result ^ _mum (u64, _mum_tail_prime);
  case 5:
    u64 = _MUM_TAIL_START (_mum_primes[2]) + _mum_le32 (*(uint32_t *) str);
    u64 += (uint64_t) str[4] << 32;
    return result ^ _mum (u64, _mum_tail_prime);
  case 4:
    u64 = _MUM_TAIL_START (_mum_primes[3]) + _mum_le32 (*(uint32_t *) str);
    return result ^ _mum (u64, _mum_tail_prime);
  case 3:
    u64 = _MUM_TAIL_START (_mum_primes[4]) + _mum_le16 (*(uint16_t *) str);
    u64 += (uint64_t) str[2] << 16;
    return result ^ _mum (u64, _mum_tail_prime);
  case 2:
    u64 = _MUM_TAIL_START (_mum_primes[5]) + _mum_le16 (*(uint16_t *) str);
    return result ^ _mum (u64, _mum_tail_prime);
  case 1:
    u64 = _MUM_TAIL_START (_mum_primes[6]) + str[0];
    return result ^ _mum (u64, _mum_tail_prime);
  }
  return result;
}

/* Final randomization of H. */
static _MUM_INLINE uint64_t _mum_final (uint64_t h) {
#if defined(MUM_V1)
  h ^= _mum (h, _mum_finish_prime1);
  h ^= _mum (h, _mum_finish_prime2);
#elif defined(MUM_V2)
  h ^= _mum_rotl (h, 33);
  h ^= _mum (h, _mum_finish_prime1);
#else
  h = _mum (h, h);
#endif
  return h;
}

#ifndef _MUM_UNALIGNED_ACCESS
#if defined(__x86_64__) || defined(__i386__) || defined(__PPC64__) || defined(__s390__) \
  || defined(__m32c__) || defined(cris) || defined(__CR16__) || defined(__vax__)        \
  || defined(__m68k__) || defined(__aarch64__) || defined(_M_AMD64) || defined(_M_IX86)
#define _MUM_UNALIGNED_ACCESS 1
#else
#define _MUM_UNALIGNED_ACCESS 0
#endif
#endif

/* When we need an aligned access to data being hashed we move part of the unaligned data to an
   aligned block of given size and then process it, repeating processing the data by the block. */
#ifndef _MUM_BLOCK_LEN
#define _MUM_BLOCK_LEN 1024
#endif

#if _MUM_BLOCK_LEN < 8
#error "too small block length"
#endif

static _MUM_INLINE uint64_t
#if defined(__x86_64__) && defined(__GNUC__) && !defined(__clang__)
  __attribute__ ((__target__ ("inline-all-stringops")))
#endif
  _mum_hash_default (const void *key, size_t len, uint64_t seed) {
  uint64_t result;
  const unsigned char *str = (const unsigned char *) key;
  size_t block_len;
  uint64_t buf[_MUM_BLOCK_LEN / sizeof (uint64_t)];

  result = seed + len;
  if (((size_t) str & 0x7) == 0)
    result = _mum_hash_aligned (result, key, len);
  else {
    while (len != 0) {
      block_len = len < _MUM_BLOCK_LEN ? len : _MUM_BLOCK_LEN;
      memcpy (buf, str, block_len);
      result = _mum_hash_aligned (result, buf, block_len);
      len -= block_len;
      str += block_len;
    }
  }
  return _mum_final (result);
}

static _MUM_INLINE uint64_t _mum_next_factor (void) {
  uint64_t start = 0;
  int i;

  for (i = 0; i < 8; i++) start = (start << 8) | rand () % 256;
  return start;
}

/* ++++++++++++++++++++++++++ Interface functions: +++++++++++++++++++  */

/* Set random multiplicators depending on SEED. */
static _MUM_INLINE void mum_hash_randomize (uint64_t seed) {
  size_t i;

  srand (seed);
  _mum_hash_step_prime = _mum_next_factor ();
  _mum_key_step_prime = _mum_next_factor ();
  _mum_finish_prime1 = _mum_next_factor ();
  _mum_finish_prime2 = _mum_next_factor ();
  _mum_block_start_prime = _mum_next_factor ();
  _mum_unroll_prime = _mum_next_factor ();
  _mum_tail_prime = _mum_next_factor ();
  for (i = 0; i < sizeof (_mum_primes) / sizeof (uint64_t); i++)
    _mum_primes[i] = _mum_next_factor ();
}

/* Start hashing data with SEED. Return the state. */
static _MUM_INLINE uint64_t mum_hash_init (uint64_t seed) { return seed; }

/* Process data KEY with the state H and return the updated state. */
static _MUM_INLINE uint64_t mum_hash_step (uint64_t h, uint64_t key) {
  return _mum (h, _mum_hash_step_prime) ^ _mum (key, _mum_key_step_prime);
}

/* Return the result of hashing using the current state H. */
static _MUM_INLINE uint64_t mum_hash_finish (uint64_t h) { return _mum_final (h); }

/* Fast hashing of KEY with SEED. The hash is always the same for the same key on any target. */
static _MUM_INLINE size_t mum_hash64 (uint64_t key, uint64_t seed) {
  return mum_hash_finish (mum_hash_step (mum_hash_init (seed), key));
}

/* Hash data KEY of length LEN and SEED. The hash depends on the target endianess and the unroll
   factor. */
static _MUM_INLINE uint64_t mum_hash (const void *key, size_t len, uint64_t seed) {
#if _MUM_UNALIGNED_ACCESS
  return _mum_final (_mum_hash_aligned (seed + len, key, len));
#else
  return _mum_hash_default (key, len, seed);
#endif
}

#endif


================================================
FILE: mum512.h
================================================
/* Copyright (c) 2016, 2017, 2018
   Vladimir Makarov <vmakarov@gcc.gnu.org>

   Permission is hereby granted, free of charge, to any person
   obtaining a copy of this software and associated documentation
   files (the "Software"), to deal in the Software without
   restriction, including without limitation the rights to use, copy,
   modify, merge, publish, distribute, sublicense, and/or sell copies
   of the Software, and to permit persons to whom the Software is
   furnished to do so, subject to the following conditions:

   The above copyright notice and this permission notice shall be
   included in all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
   SOFTWARE.
*/

/* This file implements MUM512 (MUltiply and Mix) hash function.  It
   is a candidate for a crypto-level hash function and keyed hash
   function.

   The key size is 256-bit.  The size of the state, block, and output
   (digest) is 512-bit.

   The input data by 128x128-bit multiplication and mixing hi- and
   low-parts of the multiplication result by using an addition and
   then mix it into the current state.  We use prime numbers randomly
   generated with the equal probability of their bit values for the
   multiplication.  When all primes are used once, the state is
   randomized and the same prime numbers are used again for data
   randomization.

   The MUM512 hashing passes all SMHasher tests and NIST Statistical
    Test Suite for Random and Pseudorandom Number Generators for
    Cryptographic Applications (v. 2.2.1).  MUM512 is also faster SHA-2
    (512) and SHA-3 (512).  */

#ifndef __MC_HASH__
#define __MC_HASH__

#include <stddef.h>
#include <string.h>
#include <limits.h>

#if defined(_MSC_VER)
typedef unsigned __int32 uint32_t;
typedef unsigned __int64 uint64_t;
#else
#include <stdint.h>
#endif

#if defined(_MSC_VER)
#define _mc_bswap_64(x) _byteswap_uint64_t (x)
#elif defined(__APPLE__)
#include <libkern/OSByteOrder.h>
#define _mc_bswap_64(x) OSSwapInt64 (x)
#elif defined(__GNUC__)
#define _mc_bswap64(x) __builtin_bswap64 (x)
#else
#include <byteswap.h>
#define _mc_bswap64(x) bswap64 (x)
#endif

#ifndef _MC_USE_INT128
/* In GCC if uint128_t is defined if HOST_BITS_PER_WIDE_INT >= 64.
   HOST_WIDE_INT is long if HOST_BITS_PER_LONG > HOST_BITS_PER_IN,
   otherwise int. */
#if defined(__GNUC__) && UINT_MAX != ULONG_MAX
#define _MC_USE_INT128 1
#else
#define _MC_USE_INT128 0
#endif
#endif

#ifdef __GNUC__
#define _MC_ATTRIBUTE_UNUSED  __attribute__((unused))
#define _MC_OPTIMIZE(opts) __attribute__((__optimize__ (opts)))
#define _MC_TARGET(opts) __attribute__((__target__ (opts)))
#else
#define _MC_ATTRIBUTE_UNUSED
#define _MC_OPTIMIZE(opts)
#define _MC_TARGET(opts)
#endif


#if _MC_USE_INT128
typedef __uint128_t _mc_ti;
#else
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
typedef struct {uint64_t lo, hi;} _mc_ti;
#else
typedef struct {uint64_t hi, lo;} _mc_ti;
#endif
#endif

#define inline

#if _MC_USE_INT128
static inline uint64_t _mc_lo64 (_mc_ti a) {return a;}
static inline uint64_t _mc_hi64 (_mc_ti a) {return a >> 64;}
static inline _mc_ti _mc_lo2hi (_mc_ti a) {return a << 64;}
static inline _mc_ti _mc_hi2lo (_mc_ti a) {return a >> 64;}
static inline _mc_ti _mc_add (_mc_ti a, _mc_ti b) {return a + b;}
static inline _mc_ti _mc_xor (_mc_ti a, _mc_ti b) {return a ^ b;}
static inline uint32_t _mc_lt (_mc_ti a, _mc_ti b) { return a < b;}
static inline _mc_ti _mc_const (uint64_t hi, uint64_t lo) {
  return (_mc_ti) hi << 64 | lo;
}
static inline _mc_ti _mc_rotr (_mc_ti a, int sh) {
  return (a >> sh) | (a << (128 - sh));
}

static inline _mc_ti _mc_mul64 (uint64_t a, uint64_t b) {
#if defined(__aarch64__)
  /* AARCH64 needs 2 insns to calculate 128-bit result of the
     multiplication.  If we use a generic code we actually call a
     function doing 128x128->128 bit multiplication.  The function is
     very slow.  */
  uint64_t lo = a * b, hi;

  asm ("umulh %0, %1, %2" : "=r" (hi) : "r" (a), "r" (b));
  return (_mc_ti) hi << 64 | lo;
#else
  return (_mc_ti) a * (_mc_ti) b;
#endif
}

static inline _mc_ti _mc_swap (_mc_ti v) {
  return _mc_const (_mc_bswap64 (_mc_lo64 (v)), _mc_bswap64 (_mc_hi64 (v)));
}

#else /* #if _MC_USE_INT128 */

static inline uint64_t _mc_lo64 (_mc_ti a) {return a.lo;}
static inline uint64_t _mc_hi64 (_mc_ti a) {return a.hi;}
static inline _mc_ti _mc_lo2hi (_mc_ti a) {a.hi = a.lo; a.lo = 0; return a;}
static inline _mc_ti _mc_hi2lo (_mc_ti a) {a.lo = a.hi; a.hi = 0; return a;}
static inline _mc_ti _mc_const (uint64_t hi, uint64_t lo) {
  _mc_ti r;
  r.hi = hi; r.lo = lo; return r;
}
static inline _mc_ti _mc_add (_mc_ti a, _mc_ti b) {
  a.lo += b.lo, a.hi += b.hi + (a.lo < b.lo); return a;
}
static inline uint32_t _mc_lt (_mc_ti a, _mc_ti b) {
  return a.hi < b.hi || (a.hi == b.hi && a.lo < b.lo);
}
static inline _mc_ti _mc_xor (_mc_ti a, _mc_ti b) {
  a.lo ^= b.lo; a.hi ^= b.hi; return a;
}
static inline _mc_ti _mc_rotr (_mc_ti a, int sh) {
  _mc_ti res;

  if (sh <= 64) {
    res.lo = a.lo >> sh | a.hi << (64 - sh);
    res.hi = a.hi >> sh | a.lo << (64 - sh);
  } else {
    sh -= 64;
    res.lo = a.hi >> sh | a.lo << (64 - sh);
    res.hi = a.lo >> sh | a.hi << (64 - sh);
  }
  return res;
}

static inline _mc_ti _mc_mul64 (uint64_t a, uint64_t b) {
  /* Implementation of 64x64->128-bit multiplication by four 32x32->64
     bit multiplication.  */
  uint64_t ha = a >> 32, hb = b >> 32;
  uint64_t la = (uint32_t) a, lb = (uint32_t) b;
  uint64_t rhi =  ha * hb;
  uint64_t rm_0 = ha * lb;
  uint64_t rm_1 = hb * la;
  uint64_t rlo =  la * lb;
  uint64_t lo, carry;
  _mc_ti res;
  
  lo = rlo + (rm_0 << 32);
  carry = lo < rlo;
  res.lo = lo + (rm_1 << 32);
  carry += res.lo < lo;
  res.hi = rhi + (rm_0 >> 32) + (rm_1 >> 32) + carry;
  return res;
}

static inline _mc_ti _mc_swap (_mc_ti v) {
  _mc_ti r;
  r.lo = _mc_bswap64 (v.hi); r.hi = _mc_bswap64 (v.lo); return r;
}

#endif /* #if _MC_USE_INT128 */

static inline _mc_ti _mc_mum (_mc_ti a, _mc_ti b) {
  /* Implementation of 128x128->256-bit multiplication by four
     64x64->128 bit multiplication.  */
  uint64_t ha = _mc_hi64 (a), hb = _mc_hi64 (b);
  uint64_t la = _mc_lo64 (a), lb = _mc_lo64 (b);
  _mc_ti rhi =  _mc_mul64 (ha, hb);
  _mc_ti rm_0 = _mc_mul64 (ha, lb);
  _mc_ti rm_1 = _mc_mul64 (hb, la);
  _mc_ti rlo =  _mc_mul64 (la, lb);
  _mc_ti hi, lo, t;
  uint32_t carry;
  
  t = _mc_add (_mc_lo2hi (rm_0), rlo); carry = _mc_lt (t, rlo);
  lo = _mc_add (_mc_lo2hi (rm_1), t); carry += _mc_lt (lo, t);
  hi = _mc_add (_mc_add (_mc_add (rhi, _mc_hi2lo (rm_0)), _mc_hi2lo (rm_1)),
		_mc_const (0, carry));
  return _mc_add (hi, lo);
}

static inline _mc_ti _mc_2le (_mc_ti v) {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  return v;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  return _mc_swap (v);
#else
#error "Unknown endianess"
#endif
}

static inline _mc_ti _mc_get (const uint64_t a[2]) {return _mc_const (a[0], a[1]);}

/* Here are different primes randomly generated with the equal
   probability of their bit values.  They are used to randomize input
   values.  */
static const uint64_t _mc_block_start_primes[4][2] = {
  {0Xcdc037073edf8fd4ULL, 0X9e2a1f46e21930f3ULL},
  {0X128281cfcc981531ULL, 0Xae5a66b7a27ba90fULL},
  {0X676f0ed409f0019aULL, 0X4bd8035682ec0095ULL},
  {0X6fae98c0d49ed3ffULL, 0X5268b2b43337690bULL},
};
static const uint64_t _mc_unroll_primes[4][2] = {
  {0X5cc2a100bf7b7c05ULL, 0Xd5ce2499d1844187ULL},
  {0X2fdc8fe047488edaULL, 0Xd4c8739663e296e9ULL},
  {0Xd36d0a4431717b61ULL, 0X10cb847ca52a3e25ULL},
  {0Xf011560af9264d79ULL, 0Xc9d48bb40afed5d3ULL},
};
static const uint64_t _mc_tail_primes[4][2] = {
  {0X270957969935d145ULL, 0Xcb75144659dbd40dULL},
  {0X12db17d4be72d52cULL, 0X3b0cbf3954138dfdULL},
  {0X5022d90d6ccc2372ULL, 0X889add17a2618f99ULL},
  {0Xe282c1104925025bULL, 0Xfdf6cd06e47a8c85ULL},
};
static const uint64_t _mc_primes [16][2] = {
  {0Xfc35b01709510063ULL, 0Xec2fdbeaade88605ULL},
  {0Xd2b356ae579f47a6ULL, 0X65862b62bdf5ef4dULL},
  {0X288eea216831e6a7ULL, 0Xa79128dc46b0173fULL},
  {0Xb737b72062479c04ULL, 0X5f6c924506e59f99ULL},
  {0X92a4d7c15a574735ULL, 0X5d1d4782c102eadfULL},
  {0Xf9c1acb5076cf7c7ULL, 0Xb9ecfe43117ae249ULL},
  {0X10ee204d93f26f50ULL, 0Xb70b8f530f15aa6bULL},
  {0X181da242743a22dcULL, 0Xff14e8e58f6e1721ULL},
  {0X3e431aaaac0f869eULL, 0X9feebb1f9c386fe3ULL},
  {0X817292a324fa17b0ULL, 0X2c37dcc844bc3d37ULL},
  {0Xcdf95a3e7c6fcb0dULL, 0Xcd2aa0073fbeb7b9ULL},
  {0Xfc67b9a71847832aULL, 0X640c8f5ed3777417ULL},
  {0X6cb4dcd475f71ae5ULL, 0X51f0328a65fc65bbULL},
  {0Xdcab58b43c79dee4ULL, 0X745bd64098a26981ULL},
  {0X1a8f800285c50091ULL, 0X2afffc9213aa8a5fULL},
    {0X532e0082221804dbULL, 0X20c941f9b512df11ULL},
};

static const uint64_t _mc_finish_primes [64][2] = {
  {0X1e0055fb724474e8ULL, 0Xff29de49c378e459ULL},
  {0X83e8be07b5531ec3ULL, 0X4266dedbf60f5523ULL},
  {0X25d03dacd0a5c8e5ULL, 0Xdbbcac5c7cda883fULL},
  {0X9984f048ee48645eULL, 0X12c928afdf625a47ULL},
  {0X8100dab7b00755d3ULL, 0X12a2f064b6def9fULL},
  {0X2a4936c9ef78297cULL, 0Xa9c902ba35037359ULL},
  {0X833ac9a368619006ULL, 0X16fc020d9f40323ULL},
  {0X56f880bf9ae924acULL, 0X87262cbecf623dc5ULL},
  {0Xf92080d63124cc29ULL, 0X94da893165805cfbULL},
  {0X852b1ea7d206adfaULL, 0Xded5f478baf4f65bULL},
  {0X6c090f99eb7da834ULL, 0X8619a705f355b9d1ULL},
  {0Xc24556cc93aff883ULL, 0X5df87f6b5076a2cfULL},
  {0X24fa93e0575eb1f6ULL, 0X3d7063f15281ff03ULL},
  {0X47ee8f94f08cb813ULL, 0X8128602ec25d6a19ULL},
  {0X3625bd5ebdd10f45ULL, 0Xe43054d209066c7bULL},
  {0X120db6d8b3382a5ULL, 0X314fb5d22fced3f9ULL},
  {0X78276dc95fb85e7dULL, 0X57bfc5460f873c9dULL},
  {0X96d099814184b5b0ULL, 0Xd89f836c35b9b0c9ULL},
  {0X6d79572ccf590f33ULL, 0Xab8030fdeffd48f5ULL},
  {0X19c5d2c51e451eddULL, 0Xc675c5e74f1c2c45ULL},
  {0X63b09e2290211695ULL, 0X494b134c5d324f3ULL},
  {0Xed2cda11f978bb7dULL, 0X827e3d4edf1475e1ULL},
  {0X9a5e00a94de13a14ULL, 0X55667b0f406987abULL},
  {0Xcbcabf81b1338e26ULL, 0X2dd53df2ae66d191ULL},
  {0X13c880d6b45a6c9cULL, 0Xe0a28ed2c8049f5fULL},
  {0X55279312bf2ed6feULL, 0Xc6c19c752e4bcb2dULL},
  {0Xf138de98d83585caULL, 0Xc90ad41770782dd1ULL},
  {0X81f28fded1813f57ULL, 0X94900f7877c93a37ULL},
  {0X473457080f9406d2ULL, 0Xc5989b97426594afULL},
  {0Xe5799a1363835a22ULL, 0Xce4ca5f532d3980bULL},
  {0Xfd7b3c358f596498ULL, 0X70fd7270db8949b5ULL},
  {0Xfd6a0c5c43a6c653ULL, 0Xeb3aa98686ffa645ULL},
  {0X3df53625c3afc40aULL, 0Xdf3bffec06a23541ULL},
  {0Xef491e13aa526595ULL, 0Xf6688ada28d7e81fULL},
  {0X8419c0511b7e4242ULL, 0X7ba8ddb66b20a5d7ULL},
  {0X11b3da4f4f90fb72ULL, 0X71de3984d3c23847ULL},
  {0X6616fb20c0231403ULL, 0Xfa06db687a1bc2ffULL},
  {0X6e661bbd3bd6e950ULL, 0X6e45cbd289a77c23ULL},
  {0X599d1220a03da949ULL, 0Xff568091f9c59349ULL},
  {0X4464d21d20da203bULL, 0X352006ad6fb6de5ULL},
  {0X6413c7b09fc3dcc8ULL, 0X55fc3da745579871ULL},
  {0X160111a644959396ULL, 0Xd12e059acb048b01ULL},
  {0Xc66e719ac9a96714ULL, 0X832bb4f760cb0c0dULL},
  {0X92b777a80cfd2a59ULL, 0X883dc6537f8d2a6bULL},
  {0Xf40a3463bd3220d7ULL, 0Xadd624b1439d6507ULL},
  {0X2d11c5c5e6c3bf75ULL, 0Xfc027148e7ed66c9ULL},
  {0Xe9437c1b3494cb46ULL, 0Xacd35405f3d2f4b7ULL},
  {0X838f551f688bd046ULL, 0X3e87440f1a4642e9ULL},
  {0Xce20a8f09e985b88ULL, 0Xa98931ac95adde85ULL},
  {0Xe3580702cf5a1c80ULL, 0X8e76e05a566733cdULL},
  {0X15ff369908c7d67dULL, 0X11bc44ed911d2aa7ULL},
  {0Xab2694c16fe59e02ULL, 0Xc1f5ba02d82d5e85ULL},
  {0X7c59a06636a7edddULL, 0X684643ed7e49b619ULL},
  {0X40b6270f7f9fa251ULL, 0X9d8b29eef5b17a8dULL},
  {0X4044ad0f26d7e774ULL, 0Xf3d1fd1ab3ca5037ULL},
  {0X5ec3d199e0063437ULL, 0X12fb529ef8cc3a73ULL},
  {0X6b5f89cb3017e3e2ULL, 0Xfdb9b50566a2626dULL},
  {0X76ad12644718202cULL, 0X967fbb370d71862fULL},
  {0Xb3b7cb5a21f3eaeaULL, 0X18f9fe203ad955ffULL},
  {0Xfd28dc38cbec1a6cULL, 0X8e1920c5a0132445ULL},
  {0X23f9cf9b2158b106ULL, 0Xd9a70361d96b31f9ULL},
  {0X224fa274d6ea69f7ULL, 0X6316130d5d925dc3ULL},
  {0X14b09aa58cb268d5ULL, 0Xf34d372c6f503cc9ULL},
  {0X6289be577a4ae0a6ULL, 0X3ba5692b1eafeb15ULL},
};

/* Macro defining how many times the most nested loop in
   _mc_hash_aligned will be unrolled by the compiler (although it can
   make an own decision:).  Use only a constant here to help a
   compiler to unroll a major loop.  Don't change the value.  Changing
   it changes the hash.  If you increase it you need more numberss in
   _mc_primes.  */
#define _MC_UNROLL_FACTOR 4

static inline void _MC_OPTIMIZE ("unroll-all-loops")
_mc_hash_aligned (_mc_ti state[4], const void *data, size_t len) {
  _mc_ti result[4];
  const unsigned char *str = (const unsigned char *) data;
  union {_mc_ti i; unsigned char s[sizeof (_mc_ti)];} p;
  int j;
  size_t i, n;
  
  for (i = 0; i < 4; i++)
    result[i] = _mc_mum (state[i], _mc_get (_mc_block_start_primes[i]));
  while  (len > _MC_UNROLL_FACTOR * sizeof (_mc_ti)) {
    for (i = 0; i < _MC_UNROLL_FACTOR; i++)
      for (j = 0; j < 4; j++)
	result[j] = _mc_xor (result[j], _mc_mum (_mc_2le (((_mc_ti *) str)[i]),
						 _mc_get (_mc_primes[4 * i + j])));
    /* We might use the same prime numbers on the next iterations --
       randomize the state.  */
    for (i = 0; i < 4; i++)
      result[i] = _mc_mum (result[i], _mc_get (_mc_unroll_primes[i]));
    len -= _MC_UNROLL_FACTOR * sizeof (_mc_ti);
    str += _MC_UNROLL_FACTOR * sizeof (_mc_ti);
  }
  n = len / sizeof (_mc_ti);
  for (i = 0; i < n; i++)
    for (j = 0; j < 4; j++)
      result[j] = _mc_xor (result[j], _mc_mum (_mc_2le (((_mc_ti *) str)[i]),
					       _mc_get (_mc_primes[4 * i + j])));
  len -= n * sizeof (_mc_ti); str += n * sizeof (_mc_ti);
  if (len) {
    p.i = _mc_const (0, 0); memcpy (p.s, str, len);
    for (i = 0; i < 4; i++)
      result[i] = _mc_xor (result[i],
			   _mc_mum (_mc_2le (((_mc_ti *) p.s)[0]),
				    _mc_get (_mc_tail_primes[i])));
  }
  for (i = 0; i < 4; i++)
    state[i] = _mc_xor (state[i], result[i]);
}

#ifndef MUM512_ROUNDS
#define MUM512_ROUNDS 8
#endif

#if MUM512_ROUNDS < 2
#error "too few final rounds"
#elif MUM512_ROUNDS > 16
#error "too many final rounds"
#endif

static inline void
_mc_permute (_mc_ti t[4]) {
  /* Row */
  t[0] = _mc_rotr (_mc_xor (t[0], t[1]), 17);
  t[1] = _mc_rotr (_mc_xor (t[1], t[0]), 33);
  t[3] = _mc_rotr (_mc_xor (t[3], t[2]), 49);
  t[2] = _mc_rotr (_mc_xor (t[2], t[3]), 65);
  /* Column */
  t[0] = _mc_rotr (_mc_xor (t[0], t[2]), 9);
  t[2] = _mc_rotr (_mc_xor (t[2], t[0]), 25);
  t[3] = _mc_rotr (_mc_xor (t[3], t[1]), 41);
  t[1] = _mc_rotr (_mc_xor (t[1], t[3]), 57);
  /* Diagonal */
  t[0] = _mc_rotr (_mc_xor (t[0], t[3]), 13);
  t[3] = _mc_rotr (_mc_xor (t[3], t[0]), 29);
  t[2] = _mc_rotr (_mc_xor (t[2], t[1]), 45);
  t[1] = _mc_rotr (_mc_xor (t[1], t[2]), 61);
}

static inline void
_mc_mix (_mc_ti state[4], uint64_t out[8]) {
  int i, j;
  _mc_ti t[4];
  
  for (i = 0; i < 4; i++)
    t[i] = state[i];
  for (i = 0; i < MUM512_ROUNDS; i++) {
    for (j = 0; j < 4; j++)
      t[j] = _mc_xor (t[j], _mc_mum (t[j], _mc_get (_mc_finish_primes[i * 4 + j])));
    _mc_permute (t);
  }
  for (i = 0; i < 4; i++) {
    t[i] = _mc_2le (t[i]);
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
    out[2 * i] = _mc_lo64 (t[i]);
    out[2 * i + 1] = _mc_hi64 (t[i]);
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
    out[2 * i] = _mc_hi64 (t[i]);
    out[2 * i + 1] = _mc_lo64 (t[i]);
#endif
  }
}

/* Random prime numbers for the initialization.  */
#define SEED_PRIME0 0x2e0bb864e9ea7df5ULL
#define SEED_PRIME1 0xcdb32970830fcaa1ULL
#define SEED_PRIME2 0xc42b5e2e6480b23bULL
#define SEED_PRIME3 0x746addee2093e7e1ULL
#define SEED_PRIME4 0xa9a7ae7ceff79f3fULL
#define SEED_PRIME5 0xaf47d47c99b1461bULL
#define SEED_PRIME6 0x7b51ec3d22f7096fULL

static inline void
_mc_init_state (_mc_ti state[4], const uint64_t seed[4], size_t len) {
  state[0] = _mc_const (seed[0] ^ SEED_PRIME0, seed[1] ^ SEED_PRIME1);
  state[1] = _mc_const (seed[2] ^ SEED_PRIME2, seed[3] ^ SEED_PRIME3);
  state[2] = _mc_const (len, SEED_PRIME4);
  state[3] = _mc_const (SEED_PRIME5, SEED_PRIME6);
}

#ifndef _MC_UNALIGNED_ACCESS
#if defined(__x86_64__) || defined(__i386__) || defined(__PPC64__) \
    || defined(__s390__) || defined(__m32c__) || defined(cris)     \
    || defined(__CR16__) || defined(__vax__) || defined(__m68k__)  \
    || defined(__aarch64__) || defined(_M_AMD64) || defined(_M_IX86)
#define _MC_UNALIGNED_ACCESS 1
#else
#define _MC_UNALIGNED_ACCESS 0
#endif
#endif

/* When we need an aligned access to data being hashed we move part of
   the unaligned data to an aligned block of given size and then
   process it, repeating processing the data by the block.  */
#ifndef _MC_BLOCK_LEN_POW2
#define _MC_BLOCK_LEN_POW2 10
#endif

#if _MC_BLOCK_LEN_POW2 < 4
/* We should process by full 128-bit unless it is a tail.  */
#error "too small block length"
#endif

static inline void
#if defined(__x86_64__)
_MC_TARGET("inline-all-stringops")
#endif
_mc_hash_default (const void *data, size_t len, const uint64_t seed[4], unsigned char *out) {
  _mc_ti state[4];
  const unsigned char *str = (const unsigned char *) data;
  size_t block_len;
  _mc_ti buf[_MC_BLOCK_LEN_POW2 / sizeof (_mc_ti)];
  
  _mc_init_state (state, seed, len);
  if (((size_t) str & 0x7) == 0)
    _mc_hash_aligned (state, data, len);
  else {
    while (len != 0) {
      block_len
	= len < (1 << _MC_BLOCK_LEN_POW2) ? len : (1 << _MC_BLOCK_LEN_POW2);
      memcpy (buf, str, block_len);
      _mc_hash_aligned (state, buf, block_len);
      len -= block_len;
      str += block_len;
    }
  }
  if (((size_t) out & 0x7) == 0)
    _mc_mix (state, (uint64_t *) out);
  else {
    uint64_t ui64[8];
    
    _mc_mix (state, ui64);
    memmove (out, ui64, 64);
  }
}

/* ++++++++++++++++++++++++++ Interface functions: +++++++++++++++++++  */

/* Hash DATA of length LEN and SEED.  */
static inline void
mum512_keyed_hash (const void *data, size_t len, const uint64_t key[4], unsigned char *out) {
#if _MC_UNALIGNED_ACCESS
  _mc_ti state[4];
  
  _mc_init_state (state, key, len);
  _mc_hash_aligned (state, data, len);
  _mc_mix (state, (uint64_t *) out);
  return;
#else
  _mc_hash_default (data, len, key, out);
#endif
}

/* Hash DATA of length LEN.  */
static inline void
mum512_hash (const void *data, size_t len, unsigned char *out) {
  static const uint64_t key[4] = {0, 0, 0, 0};

  mum512_keyed_hash (data, len, key, out);
}

#endif


================================================
FILE: vmum.h
================================================
/* Copyright (c) 2025
   Vladimir Makarov <vmakarov@gcc.gnu.org>

   Permission is hereby granted, free of charge, to any person
   obtaining a copy of this software and associated documentation
   files (the "Software"), to deal in the Software without
   restriction, including without limitation the rights to use, copy,
   modify, merge, publish, distribute, sublicense, and/or sell copies
   of the Software, and to permit persons to whom the Software is
   furnished to do so, subject to the following conditions:

   The above copyright notice and this permission notice shall be
   included in all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
   SOFTWARE.
*/

/* This file implements VMUM (Vector MUltiply and Mix) hashing. We randomize
   input data by 64x64-bit multiplication and mixing hi- and low-parts of the
   multiplication result by using an addition and then mix it into the current
   state. We use random numbers generated with sha3sum for the multiplication.
   When all the numbers are used once, the state is randomized and the same
   numbers are used again for data randomization.

   The VMUM hashing passes all rurban SMHasher tests. */

#ifndef __VMUM_HASH__
#define __VMUM_HASH__

#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>

#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L
#include <assert.h>
#else
#define static_assert(a)
#endif

#ifdef _MSC_VER
typedef unsigned __int16 uint16_t;
typedef unsigned __int32 uint32_t;
typedef unsigned __int64 uint64_t;
#else
#include <stdint.h>
#endif

#if defined(__GNUC__)
#define _VMUM_ATTRIBUTE_UNUSED __attribute__ ((unused))
#define _VMUM_INLINE inline __attribute__ ((always_inline))
#else
#define _VMUM_ATTRIBUTE_UNUSED
#define _VMUM_INLINE inline
#endif

/* Macro saying to use 128-bit integers implemented by GCC for some targets. */
#ifndef _VMUM_USE_INT128
/* In GCC uint128_t is defined if HOST_BITS_PER_WIDE_INT >= 64. HOST_WIDE_INT
   is long if HOST_BITS_PER_LONG > HOST_BITS_PER_INT, otherwise int. */
#if defined(__GNUC__) && UINT_MAX != ULONG_MAX
#define _VMUM_USE_INT128 1
#else
#define _VMUM_USE_INT128 0
#endif
#endif

/* Here are different random numbers generated with the equal probability of
   their bit values. They are used to randomize input values. */
static uint64_t _vmum_hash_step_factor = 0x7fc65e8a22c2f74bull;
static uint64_t _vmum_key_step_factor = 0x9b307d68270e94e5ull;
static uint64_t _vmum_block_start_factor = 0x6608b54dafbc797cull;
static uint64_t _vmum_tail_factor = 0xc6f58747253b0e84ull;
static uint64_t _vmum_finish_factor1 = 0xbc4bb29ce739b5d9ull;
static uint64_t _vmum_finish_factor2 = 0x7007946aa4fdb987ull;

static uint64_t _vmum_unroll_factors[] = {
  0x191fb5fc4a9bf2deull,
  0xd9a09a0a2c4eb3ebull,
  0x90f15ee96deb1eecull,
  0x1a970df0a79d09baull,
};

static const uint64_t _vmum_zero[] = {0ull, 0ull, 0ull, 0ll};
static uint64_t _vmum_factors[] = {
  0x0c3ec9639d3a203full, 0x898368b5fb060422ull, 0xfe3c08767c1e5068ull, 0x4535ac3cb182d3caull,
  0xba6ba8dcc8a2d978ull, 0x9f1221df37b27ca1ull, 0x57b1b40817cde05eull, 0xadb5c6075e5dd1c3ull,
  0x0f91abf611686bc3ull, 0x72fc850fbe9023f4ull, 0x4922ec730400d7e1ull, 0x7452d927d9970eb2ull,
  0x607ad8c0dbe89ff9ull, 0x463bf8a99a222448ull, 0x6a0c8e5c2171d499ull, 0x88a9af1a70175e0full,

  0x50c07b9351011316ull, 0xce26a1f4e66e3d1full, 0x396d79c90c86b1edull, 0x3739b72e11fc4684ull,
  0x7fe4adc8400af08eull, 0xfa2e47475b11a3ceull, 0xd519635f1d7b9242ull, 0x866d04bd866130fcull,
  0x6132e913fd0ae2c9ull, 0xaeacc8d99a02880dull, 0xf196fbab2ef62dbbull, 0x62a6a4ae6d3f5fddull,
  0x3147dbedb6618cf4ull, 0x18dc2a4e6e46b7f5ull, 0x98ce62d3e346f804ull, 0x3e507b6626b9c9c6ull,

  0x08d2d6aba49b53a7ull, 0x64f897cf54cc0984ull, 0xe6568e3081d4dc31ull, 0x936eff54d90e79bdull,
  0x45ecee4cb55e5529ull, 0xe7729fe4b83bd1f0ull, 0x8ced03855b3a62aeull, 0xc29ee17b601d5ea6ull,
  0xfadd87cd819b8b23ull, 0x5ff6b6787a02da34ull, 0xcc0f997728759ba9ull, 0x06a51956916c45ffull,
  0xd31923282290a41full, 0x9e2fcda93feb36aaull, 0xfdc11827f9bb4b63ull, 0xe11b2c099cf17325ull,

  0xd4efd1919316bb4eull, 0x7749918c7756d897ull, 0x32eff42dc1c4f41eull, 0x4171f5e422bdf4faull,
  0xdd5c7f50a7977e65ull, 0xc52dc937487cc197ull, 0xbff445a1c9b19184ull, 0xb2fd8206a0325cb9ull,
  0xbf32bb70e354a83cull, 0x587313d37681ffc2ull, 0x687c0b98ef7ad731ull, 0x2c09a37608248f89ull,
  0x20c8d9b1b6e28faaull, 0x747008703cbc9483ull, 0x0bdcacce877cb231ull, 0xf740c0ac1dc79a0cull,

  0x8e843baef228089dull, 0xc379d4c3b6e28c1bull, 0xb5d44eee257f1206ull, 0xb5dfee44ef6b05adull,
  0xaab0edc0f566f359ull, 0x21883ede514d45ddull, 0xcdaeea0482c73a46ull, 0xb3751631e5645c4full,
  0xd22ac81254aad1acull, 0x59c0f8f5c93121dfull, 0x3b343a848e3097c3ull, 0x87c65a3a866ed9edull,
  0xf112c880c63ddafdull, 0xff98b9575bfce057ull, 0xd6e8f6ad6f4da8aeull, 0xba0379ba20645a51ull,
};

/* Multiply 64-bit V and P and return sum of high and low parts of the result.
 */
static _VMUM_INLINE uint64_t _vmum (uint64_t v, uint64_t p) {
  uint64_t hi, lo;
#if _VMUM_USE_INT128
  __uint128_t r = (__uint128_t) v * (__uint128_t) p;
  hi = (uint64_t) (r >> 64);
  lo = (uint64_t) r;
#else
  /* Implementation of 64x64->128-bit multiplication by four 32x32->64 bit
   * multiplication. */
  uint64_t hv = v >> 32, hp = p >> 32;
  uint64_t lv = (uint32_t) v, lp = (uint32_t) p;
  uint64_t rh = hv * hp;
  uint64_t rm_0 = hv * lp;
  uint64_t rm_1 = hp * lv;
  uint64_t rl = lv * lp;
  return rh + rl + rm_0 + rm_1;
  uint64_t t, carry = 0;

  /* We could ignore a carry bit here if we did not care about the same hash
     for 32-bit and 64-bit targets. */
  t = rl + (rm_0 << 32);
#ifdef VMUM_TARGET_INDEPENDENT_HASH
  carry = t < rl;
#endif
  lo = t + (rm_1 << 32);
#ifdef VMUM_TARGET_INDEPENDENT_HASH
  carry += lo < t;
#endif
  hi = rh + (rm_0 >> 32) + (rm_1 >> 32) + carry;
#endif
  return hi + lo;
}

#if defined(_MSC_VER)
#define _vmum_bswap32(x) _byteswap_uint32_t (x)
#define _vmum_bswap64(x) _byteswap_uint64_t (x)
#elif defined(__APPLE__)
#include <libkern/OSByteOrder.h>
#define _vmum_bswap32(x) OSSwapInt32 (x)
#define _vmum_bswap64(x) OSSwapInt64 (x)
#elif defined(__GNUC__)
#define _vmum_bswap32(x) __builtin_bswap32 (x)
#define _vmum_bswap64(x) __builtin_bswap64 (x)
#else
#include <byteswap.h>
#define _vmum_bswap32(x) bswap32 (x)
#define _vmum_bswap64(x) bswap64 (x)
#endif

static _VMUM_INLINE uint64_t _vmum_le (uint64_t v) {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || !defined(VMUM_TARGET_INDEPENDENT_HASH)
  return v;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  return _vmum_bswap64 (v);
#else
#error "Unknown endianess"
#endif
}

static _VMUM_INLINE uint32_t _vmum_le32 (uint32_t v) {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || !defined(VMUM_TARGET_INDEPENDENT_HASH)
  return v;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  return _vmum_bswap32 (v);
#else
#error "Unknown endianess"
#endif
}

static _VMUM_INLINE uint64_t _vmum_le16 (uint16_t v) {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || !defined(VMUM_TARGET_INDEPENDENT_HASH)
  return v;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  return (v >> 8) | ((v & 0xff) << 8);
#else
#error "Unknown endianess"
#endif
}

static _VMUM_INLINE uint64_t _vmum_xor (uint64_t a, uint64_t b) {
#ifdef VMUM_V1
  return a ^ b;
#else
  return (a ^ b) != 0 ? a ^ b : b;
#endif
}

static _VMUM_INLINE uint64_t _vmum_plus (uint64_t a, uint64_t b) {
#ifdef VMUM_V1
  return a + b;
#else
  return a + b != 0 ? a + b : b;
#endif
}

/* Here is the vector code for different targets.  We process blocks of 256-bit
   length. Vector consists of 64-bit elements.  The vector block update uses
   32-bit x 32-bit usigned multiplication to 64-bit result. The understand what
   the update implements see scalar version below. */
#if defined(__AVX2__)
typedef long long __attribute__ ((vector_size (32), aligned (1))) _vmum_block_t;
typedef int __attribute__ ((vector_size (32), aligned (1))) _vmum_v8si;
static _VMUM_INLINE _vmum_block_t _vmum_block (_vmum_block_t v, _vmum_block_t p) {
  _vmum_block_t hv = v >> 32, hp = p >> 32;
  _vmum_block_t r
    = (_vmum_block_t) (__builtin_ia32_pmuludq256 ((_vmum_v8si) v, (_vmum_v8si) p)
                       + __builtin_ia32_pmuludq256 ((_vmum_v8si) hv, (_vmum_v8si) hp));
  return r;
}
static _VMUM_INLINE _vmum_block_t _vmum_nonzero (_vmum_block_t v) {
  _vmum_block_t r
    = (_vmum_block_t) __builtin_ia32_pcmpeqd256 ((_vmum_v8si) v, *(_vmum_v8si *) _vmum_zero);
  return v | r;
}
static _VMUM_INLINE void _vmum_update_block (_vmum_block_t *s, const _vmum_block_t *v,
                                             const _vmum_block_t *p) {
#ifdef VMUM_V1
  *s ^= _vmum_block (v[0] ^ p[0], v[1] ^ p[1]);
#else
  *s ^= _vmum_block (_vmum_nonzero (v[0] ^ p[0]), _vmum_nonzero (v[1] ^ p[1]));
#endif
}
static _VMUM_INLINE void _vmum_factor_block (_vmum_block_t *s, const _vmum_block_t *p) {
  *s = _vmum_block (*s, *p);
}
static _VMUM_INLINE void _vmum_zero_block (_vmum_block_t *b) { *b = (_vmum_block_t) {0, 0, 0, 0}; }
static _VMUM_INLINE uint64_t _vmum_fold_block (const _vmum_block_t *b) {
  return (*b)[0] ^ (*b)[1] ^ (*b)[2] ^ (*b)[3];
}
#elif defined(__ARM_NEON)
#include <arm_neon.h>
typedef struct {
  uint64x2_t v[2];
} _vmum_block_t;
static _VMUM_INLINE uint64x2_t _vmum_val (uint64x2_t v, uint64x2_t p) {
  uint32x4_t v32 = (uint32x4_t) v, p32 = (uint32x4_t) p;
  uint64x2_t r = vmull_u32 (vget_low_u32 (v32), vget_low_u32 (p32)), r2 = vmull_high_u32 (v32, p32);
  return vpaddq_u64 (r, r2);
}
static _VMUM_INLINE uint64x2_t _vmum_nonzero (uint64x2_t v) {
  uint64x2_t r = vceqzq_u64 (v);
  return r | v;
}
static _VMUM_INLINE void _vmum_update_block (_vmum_block_t *s, const _vmum_block_t *v,
                                             const _vmum_block_t *p) {
  const _vmum_block_t *v2 = &v[1], *p2 = &p[1];
#ifdef VMUM_V1
  s->v[0] ^= _vmum_val (v->v[0] ^ p->v[0], v2->v[0] ^ p2->v[0]);
  s->v[1] ^= _vmum_val (v->v[1] ^ p->v[1], v2->v[1] ^ p2->v[1]);
#else
  s->v[0] ^= _vmum_val (_vmum_nonzero (v->v[0] ^ p->v[0]), _vmum_nonzero (v2->v[0] ^ p2->v[0]));
  s->v[1] ^= _vmum_val (_vmum_nonzero (v->v[1] ^ p->v[1]), _vmum_nonzero (v2->v[1] ^ p2->v[1]));
#endif
}
static _VMUM_INLINE void _vmum_factor_block (_vmum_block_t *s, const _vmum_block_t *p) {
  s->v[0] = _vmum_val (s->v[0], p->v[0]);
  s->v[1] = _vmum_val (s->v[1], p->v[1]);
}
static _VMUM_INLINE void _vmum_zero_block (_vmum_block_t *b) { *b = (_vmum_block_t) {0, 0, 0, 0}; }
static _VMUM_INLINE uint64_t _vmum_fold_block (_vmum_block_t *b) {
  return b->v[0][0] ^ b->v[0][1] ^ b->v[1][0] ^ b->v[1][1];
}
#elif defined(__ALTIVEC__)
#include <altivec.h>
typedef unsigned long long __attribute__ ((vector_size (16))) _vmum_v2di;
typedef struct {
  _vmum_v2di v[2];
} _vmum_block_t;
static _VMUM_INLINE _vmum_v2di _vmum_val (_vmum_v2di v, _vmum_v2di p) {
  typedef unsigned int __attribute__ ((vector_size (16))) v4si;
  return (_vmum_v2di) (vec_mule ((v4si) v, (v4si) p) + vec_mulo ((v4si) v, (v4si) p));
}
static _VMUM_INLINE void _vmum_zero_block (_vmum_block_t *b) { *b = (_vmum_block_t) {0, 0, 0, 0}; }
static _VMUM_INLINE _vmum_v2di _vmum_nonzero (_vmum_v2di v) {
  __vector __bool long mask = vec_cmpne (v, (_vmum_v2di) {0ull, 0ull});
  _vmum_v2di r = vec_sel ((_vmum_v2di) {0xffffffffffffffffull, 0xffffffffffffffffull}, v, mask);
  return r;
}
static _VMUM_INLINE void _vmum_update_block (_vmum_block_t *s, const _vmum_block_t *v,
                                             const _vmum_block_t *p) {
  const _vmum_block_t *v2 = &v[1], *p2 = &p[1];
#ifdef VMUM_V1
  s->v[0] ^= _vmum_val (v->v[0] ^ p->v[0], v2->v[0] ^ p2->v[0]);
  s->v[1] ^= _vmum_val (v->v[1] ^ p->v[1], v2->v[1] ^ p2->v[1]);
#else
  s->v[0] ^= _vmum_val (_vmum_nonzero (v->v[0] ^ p->v[0]), _vmum_nonzero (v2->v[0] ^ p2->v[0]));
  s->v[1] ^= _vmum_val (_vmum_nonzero (v->v[1] ^ p->v[1]), _vmum_nonzero (v2->v[1] ^ p2->v[1]));
#endif
}
static _VMUM_INLINE void _vmum_factor_block (_vmum_block_t *s, const _vmum_block_t *p) {
  s->v[0] = _vmum_val (s->v[0], p->v[0]);
  s->v[1] = _vmum_val (s->v[1], p->v[1]);
}
static _VMUM_INLINE uint64_t _vmum_fold_block (_vmum_block_t *b) {
  return b->v[0][0] ^ b->v[0][1] ^ b->v[1][0] ^ b->v[1][1];
}
#else /* scalar version: */
typedef struct {
  uint64_t v[4];
} _vmum_block_t;
static _VMUM_INLINE uint64_t _vmum_val (uint64_t v, uint64_t p) {
  uint64_t lv = (uint32_t) v, lp = (uint32_t) p, hv = v >> 32, hp = p >> 32;
  return lv * lp + hv * hp;
}
static _VMUM_INLINE void _vmum_update_block (_vmum_block_t *s, const _vmum_block_t *v,
                                             const _vmum_block_t *p) {
  const _vmum_block_t *v2 = &v[1], *p2 = &p[1];
  s->v[0] ^= _vmum_val (_vmum_xor (_vmum_le (v->v[0]), p->v[0]),
                        _vmum_xor (_vmum_le (v2->v[0]), p2->v[0]));
  s->v[1] ^= _vmum_val (_vmum_xor (_vmum_le (v->v[1]), p->v[1]),
                        _vmum_xor (_vmum_le (v2->v[1]), p2->v[1]));
  s->v[2] ^= _vmum_val (_vmum_xor (_vmum_le (v->v[2]), p->v[2]),
                        _vmum_xor (_vmum_le (v2->v[2]), p2->v[2]));
  s->v[3] ^= _vmum_val (_vmum_xor (_vmum_le (v->v[3]), p->v[3]),
                        _vmum_xor (_vmum_le (v2->v[3]), p2->v[3]));
}
static _VMUM_INLINE void _vmum_factor_block (_vmum_block_t *s, const _vmum_block_t *p) {
  s->v[0] = _vmum_val (s->v[0], p->v[0]);
  s->v[1] = _vmum_val (s->v[1], p->v[1]);
  s->v[2] = _vmum_val (s->v[2], p->v[2]);
  s->v[3] = _vmum_val (s->v[3], p->v[3]);
}
static _VMUM_INLINE void _vmum_zero_block (_vmum_block_t *b) { *b = (_vmum_block_t) {0, 0, 0, 0}; }
static _VMUM_INLINE uint64_t _vmum_fold_block (_vmum_block_t *b) {
  return b->v[0] ^ b->v[1] ^ b->v[2] ^ b->v[3];
}
#endif

/* Macro defining how many vectors the most the 1st nested loop in
   _vmum_hash_aligned will be unrolled by the compiler (although it can make an
   own decision:). Use only a constant here to help a compiler to unroll a
   major loop. The unroll factor greatly affects the hashing speed. We prefer
   the speed. */
#define _VMUM_UNROLL_BLOCK_FACTOR 16
/* Macro defining how many uint64 the most the 2st nested loop in
   _vmum_hash_aligned will be unrolled by the compiler. */
#define _VMUM_UNROLL_FACTOR 16

static _VMUM_INLINE uint64_t
#if defined(__GNUC__) && !defined(__clang__)
  __attribute__ ((__optimize__ ("unroll-loops")))
#endif
  _vmum_hash_aligned (uint64_t start, const void *key, size_t len) {
  uint64_t state = start;
  const unsigned char *str = (const unsigned char *) key;
  uint64_t u64, w;
  size_t i, j;
  size_t n;

  state = _vmum (state, _vmum_block_start_factor);
  if (len >= _VMUM_UNROLL_BLOCK_FACTOR * sizeof (_vmum_block_t)) {
    _vmum_block_t block_state;
    _vmum_zero_block (&block_state);
    do {
      static_assert (_VMUM_UNROLL_BLOCK_FACTOR <= sizeof (_vmum_factors) / sizeof (uint64_t));
      for (i = 0; i < _VMUM_UNROLL_BLOCK_FACTOR; i += 2)
        _vmum_update_block (&block_state, &((_vmum_block_t *) str)[i],
                            &((_vmum_block_t *) _vmum_factors)[i]);
      len -= _VMUM_UNROLL_BLOCK_FACTOR * sizeof (_vmum_block_t);
      str += _VMUM_UNROLL_BLOCK_FACTOR * sizeof (_vmum_block_t);
      /* We will use the same factor numbers on the next iterations --
       * randomize the state. */
      _vmum_factor_block (&block_state, (_vmum_block_t *) _vmum_unroll_factors);
    } while (len >= _VMUM_UNROLL_BLOCK_FACTOR * sizeof (_vmum_block_t));
    state += _vmum_fold_block (&block_state);
  }
  /* Here we have enough factors to do hashing w/o state randominzation: */
  static_assert (_VMUM_UNROLL_BLOCK_FACTOR * sizeof (_vmum_block_t) / sizeof (uint64_t) + 7
                 <= sizeof (_vmum_factors) / sizeof (uint64_t));
  i = 0;
  while (len >= _VMUM_UNROLL_FACTOR * sizeof (uint64_t)) {
    for (j = 0; j < _VMUM_UNROLL_FACTOR; j += 2, i += 2)
      state ^= _vmum (_vmum_xor (_vmum_le (((uint64_t *) str)[i]), _vmum_factors[i]),
                      _vmum_xor (_vmum_le (((uint64_t *) str)[i + 1]), _vmum_factors[i + 1]));
    len -= _VMUM_UNROLL_FACTOR * sizeof (uint64_t);
  }
  n = len / sizeof (uint64_t) & ~(size_t) 1;
  for (j = 0; j < n; j += 2, i += 2)
    state ^= _vmum (_vmum_plus (_vmum_le (((uint64_t *) str)[i]), _vmum_factors[i]),
                    _vmum_plus (_vmum_le (((uint64_t *) str)[i + 1]), _vmum_factors[i + 1]));
  len -= n * sizeof (uint64_t);
  str += i * sizeof (uint64_t);
  uint64_t p;
  switch (len) {
  case 15:
    w = _vmum_plus (_vmum_le (*(uint64_t *) str), _vmum_factors[i]);
    u64 = _vmum_factors[i + 1] + _vmum_le32 (*(uint32_t *) (str + 8));
    u64 += (_vmum_le16 (*(uint16_t *) (str + 12)) << 32) + ((uint64_t) str[14] << 48);
    return state ^ _vmum (u64 ^ _vmum_tail_factor, w);
  case 14:
    w = _vmum_plus (_vmum_le (*(uint64_t *) str), _vmum_factors[i]);
    u64 = _vmum_factors[i + 2] + _vmum_le32 (*(uint32_t *) (str + 8));
    u64 += _vmum_le16 (*(uint16_t *) (str + 12)) << 32;
    return state ^ _vmum (u64 ^ _vmum_tail_factor, w);
  case 13:
    w = _vmum_plus (_vmum_le (*(uint64_t *) str), _vmum_factors[i]);
    u64 = _vmum_factors[i + 3] + _vmum_le32 (*(uint32_t *) (str + 8));
    u64 += (uint64_t) str[12] << 32;
    return state ^ _vmum (u64 ^ _vmum_tail_factor, w);
  case 12:
    w = _vmum_plus (_vmum_le (*(uint64_t *) str), _vmum_factors[i]);
    u64 = _vmum_factors[i + 4] + _vmum_le32 (*(uint32_t *) (str + 8));
    return state ^ _vmum (u64 ^ _vmum_tail_factor, w);
  case 11:
    w = _vmum_plus (_vmum_le (*(uint64_t *) str), _vmum_factors[i]);
    u64 = _vmum_factors[i + 5] + _vmum_le16 (*(uint16_t *) (str + 8));
    u64 += (uint64_t) str[10] << 16;
    return state ^ _vmum (u64 ^ _vmum_tail_factor, w);
  case 10:
    w = _vmum_plus (_vmum_le (*(uint64_t *) str), _vmum_factors[i]);
    u64 = _vmum_factors[i + 6] + _vmum_le16 (*(uint16_t *) (str + 8));
    return state ^ _vmum (u64 ^ _vmum_tail_factor, w);
  case 9:
    w = _vmum_plus (_vmum_le (*(uint64_t *) str), _vmum_factors[i]);
    u64 = _vmum_factors[i + 7] + str[8];
    return state ^ _vmum (u64 ^ _vmum_tail_factor, w);
  case 8: return state ^ _vmum (_vmum_le (*(uint64_t *) str) + _vmum_factors[i], _vmum_factors[i]);
  case 7:
    u64 = _vmum_factors[i + 1] + _vmum_le32 (*(uint32_t *) str);
    u64 += (_vmum_le16 (*(uint16_t *) (str + 4)) << 32) + ((uint64_t) str[6] << 48);
    return state ^ _vmum (u64, _vmum_tail_factor);
  case 6:
    u64 = _vmum_factors[i + 2] + _vmum_le32 (*(uint32_t *) str);
    u64 += _vmum_le16 (*(uint16_t *) (str + 4)) << 32;
    return state ^ _vmum (u64, _vmum_tail_factor);
  case 5:
    u64 = _vmum_factors[i + 3] + _vmum_le32 (*(uint32_t *) str);
    u64 += (uint64_t) str[4] << 32;
    return state ^ _vmum (u64, _vmum_tail_factor);
  case 4:
    u64 = _vmum_factors[i + 4] + _vmum_le32 (*(uint32_t *) str);
    return state ^ _vmum (u64, _vmum_tail_factor);
  case 3:
    u64 = _vmum_factors[i + 5] + _vmum_le16 (*(uint16_t *) str);
    u64 += (uint64_t) str[2] << 16;
    return state ^ _vmum (u64, _vmum_tail_factor);
  case 2:
    u64 = _vmum_factors[i + 6] + _vmum_le16 (*(uint16_t *) str);
    return state ^ _vmum (u64, _vmum_tail_factor);
  case 1: u64 = _vmum_factors[i + 7] + str[0]; return state ^ _vmum (u64, _vmum_tail_factor);
  }
  return state;
}

/* Final randomization of H. */
static _VMUM_INLINE uint64_t _vmum_final (uint64_t h) {
  h = _vmum (h, h);
  return h;
}

#ifndef _VMUM_UNALIGNED_ACCESS
#if defined(__x86_64__) || defined(__i386__) || defined(__PPC64__) || defined(__s390__) \
  || defined(__m32c__) || defined(cris) || defined(__CR16__) || defined(__vax__)        \
  || defined(__m68k__) || defined(__aarch64__) || defined(_M_AMD64) || defined(_M_IX86)
#define _VMUM_UNALIGNED_ACCESS 1
#else
#define _VMUM_UNALIGNED_ACCESS 0
#endif
#endif

/* When we need an aligned access to data being hashed we move part of the
   unaligned data to an aligned block of given size and then process it,
   repeating processing the data by the block. */
#ifndef _VMUM_BLOCK_LEN
#define _VMUM_BLOCK_LEN 1024
#endif

#if _VMUM_BLOCK_LEN < 8
#error "too small block length"
#endif

static _VMUM_INLINE uint64_t
#if defined(__x86_64__) && defined(__GNUC__) && !defined(__clang__)
  __attribute__ ((__target__ ("inline-all-stringops")))
#endif
  _vmum_hash_default (const void *key, size_t len, uint64_t seed) {
  uint64_t result;
  const unsigned char *str = (const unsigned char *) key;
  size_t block_len;
  uint64_t buf[_VMUM_BLOCK_LEN / sizeof (uint64_t)];

  result = seed + len;
  if (((size_t) str & 0x7) == 0)
    result = _vmum_hash_aligned (result, key, len);
  else {
    while (len != 0) {
      block_len = len < _VMUM_BLOCK_LEN ? len : _VMUM_BLOCK_LEN;
      memmove (buf, str, block_len);
      result = _vmum_hash_aligned (result, buf, block_len);
      len -= block_len;
      str += block_len;
    }
  }
  return _vmum_final (result);
}

static _VMUM_INLINE uint64_t _vmum_next_factor (void) {
  uint64_t start = 0;
  int i;

  for (i = 0; i < 8; i++) start = (start << 8) | rand () % 256;
  return start;
}

/* ++++++++++++++++++++++++++ Interface functions: +++++++++++++++++++  */

/* Set random multiplicators depending on SEED. */
static _VMUM_INLINE void vmum_hash_randomize (uint64_t seed) {
  size_t i;

  srand (seed);
  _vmum_hash_step_factor = _vmum_next_factor ();
  _vmum_key_step_factor = _vmum_next_factor ();
  _vmum_finish_factor1 = _vmum_next_factor ();
  _vmum_finish_factor2 = _vmum_next_factor ();
  _vmum_block_start_factor = _vmum_next_factor ();
  _vmum_tail_factor = _vmum_next_factor ();
  for (i = 0; i < sizeof (_vmum_unroll_factors) / sizeof (uint64_t); i++)
    _vmum_unroll_factors[i] = _vmum_next_factor ();
  for (i = 0; i < sizeof (_vmum_factors) / sizeof (uint64_t); i++)
    _vmum_factors[i] = _vmum_next_factor ();
}

/* Start hashing data with SEED. Return the state. */
static _VMUM_INLINE uint64_t vmum_hash_init (uint64_t seed) { return seed; }

/* Process data KEY with the state H and return the updated state. */
static _VMUM_INLINE uint64_t vmum_hash_step (uint64_t h, uint64_t key) {
  return _vmum (h, _vmum_hash_step_factor) ^ _vmum (key, _vmum_key_step_factor);
}

/* Return the result of hashing using the current state H. */
static _VMUM_INLINE uint64_t vmum_hash_finish (uint64_t h) { return _vmum_final (h); }

/* Fast hashing of KEY with SEED. The hash is always the same for the same key
 * on any target. */
static _VMUM_INLINE size_t vmum_hash64 (uint64_t key, uint64_t seed) {
  return vmum_hash_finish (vmum_hash_step (vmum_hash_init (seed + 8), key));
}

/* Hash data KEY of length LEN and SEED. The hash depends on the target
   endianess and the unroll factor. */
static _VMUM_INLINE uint64_t vmum_hash (const void *key, size_t len, uint64_t seed) {
#if _VMUM_UNALIGNED_ACCESS
  return _vmum_final (_vmum_hash_aligned (seed + len, key, len));
#else
  return _vmum_hash_default (key, len, seed);
#endif
}

#endif